PEFT
Safetensors
English
cybersecurity
malware-analysis
att&ck
threat-intelligence
mixtral
lora
expert-adapters
cape-sandbox
digital-forensics
Instructions to use umer07/fathom-mixtral with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use umer07/fathom-mixtral with PEFT:
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")
model = PeftModel.from_pretrained(base_model, "umer07/fathom-mixtral")
- Notebooks
- Google Colab
- Kaggle
| [ | |
| { | |
| "loss": 1.8227, | |
| "grad_norm": 1.0382107496261597, | |
| "learning_rate": 9e-06, | |
| "entropy": 1.3477498710155487, | |
| "num_tokens": 212422.0, | |
| "mean_token_accuracy": 0.6026485413312912, | |
| "epoch": 0.0025823111684958036, | |
| "step": 10 | |
| }, | |
| { | |
| "loss": 1.7607, | |
| "grad_norm": 0.5150073766708374, | |
| "learning_rate": 1.9e-05, | |
| "entropy": 1.4502712726593017, | |
| "num_tokens": 439120.0, | |
| "mean_token_accuracy": 0.6050827592611313, | |
| "epoch": 0.005164622336991607, | |
| "step": 20 | |
| }, | |
| { | |
| "loss": 1.6637, | |
| "grad_norm": 0.49609556794166565, | |
| "learning_rate": 2.9e-05, | |
| "entropy": 1.6171577751636506, | |
| "num_tokens": 659373.0, | |
| "mean_token_accuracy": 0.6135089665651321, | |
| "epoch": 0.007746933505487412, | |
| "step": 30 | |
| }, | |
| { | |
| "loss": 1.5172, | |
| "grad_norm": 0.5204024314880371, | |
| "learning_rate": 3.9000000000000006e-05, | |
| "entropy": 1.5392299711704254, | |
| "num_tokens": 892258.0, | |
| "mean_token_accuracy": 0.6360862851142883, | |
| "epoch": 0.010329244673983214, | |
| "step": 40 | |
| }, | |
| { | |
| "loss": 1.2945, | |
| "grad_norm": 0.5096025466918945, | |
| "learning_rate": 4.9e-05, | |
| "entropy": 1.2994968891143799, | |
| "num_tokens": 1105690.0, | |
| "mean_token_accuracy": 0.6796375066041946, | |
| "epoch": 0.012911555842479019, | |
| "step": 50 | |
| }, | |
| { | |
| "loss": 1.0857, | |
| "grad_norm": 0.46880456805229187, | |
| "learning_rate": 5.9e-05, | |
| "entropy": 1.0753329753875733, | |
| "num_tokens": 1330615.0, | |
| "mean_token_accuracy": 0.7262010276317596, | |
| "epoch": 0.015493867010974823, | |
| "step": 60 | |
| }, | |
| { | |
| "loss": 1.0636, | |
| "grad_norm": 0.34878015518188477, | |
| "learning_rate": 6.9e-05, | |
| "entropy": 1.0663362741470337, | |
| "num_tokens": 1550884.0, | |
| "mean_token_accuracy": 0.7337213784456253, | |
| "epoch": 0.018076178179470628, | |
| "step": 70 | |
| }, | |
| { | |
| "loss": 0.9654, | |
| "grad_norm": 0.3773444592952728, | |
| "learning_rate": 7.900000000000001e-05, | |
| "entropy": 0.968677031993866, | |
| "num_tokens": 1771016.0, | |
| "mean_token_accuracy": 0.7530804932117462, | |
| "epoch": 0.02065848934796643, | |
| "step": 80 | |
| }, | |
| { | |
| "loss": 0.9163, | |
| "grad_norm": 0.35969284176826477, | |
| "learning_rate": 8.900000000000001e-05, | |
| "entropy": 0.9248217672109604, | |
| "num_tokens": 1989636.0, | |
| "mean_token_accuracy": 0.7628412812948226, | |
| "epoch": 0.023240800516462233, | |
| "step": 90 | |
| }, | |
| { | |
| "loss": 0.9032, | |
| "grad_norm": 0.30777958035469055, | |
| "learning_rate": 9.900000000000001e-05, | |
| "entropy": 0.9049190044403076, | |
| "num_tokens": 2216762.0, | |
| "mean_token_accuracy": 0.7661656141281128, | |
| "epoch": 0.025823111684958037, | |
| "step": 100 | |
| }, | |
| { | |
| "loss": 0.8708, | |
| "grad_norm": 0.3418641686439514, | |
| "learning_rate": 9.999859605811758e-05, | |
| "entropy": 0.8667158752679824, | |
| "num_tokens": 2449222.0, | |
| "mean_token_accuracy": 0.7731464982032776, | |
| "epoch": 0.028405422853453842, | |
| "step": 110 | |
| }, | |
| { | |
| "loss": 0.876, | |
| "grad_norm": 0.3700331449508667, | |
| "learning_rate": 9.999374302690578e-05, | |
| "entropy": 0.881954425573349, | |
| "num_tokens": 2679757.0, | |
| "mean_token_accuracy": 0.771819531917572, | |
| "epoch": 0.030987734021949646, | |
| "step": 120 | |
| }, | |
| { | |
| "loss": 0.8714, | |
| "grad_norm": 0.3392849266529083, | |
| "learning_rate": 9.998542391013121e-05, | |
| "entropy": 0.881954836845398, | |
| "num_tokens": 2899369.0, | |
| "mean_token_accuracy": 0.7725747585296631, | |
| "epoch": 0.03357004519044545, | |
| "step": 130 | |
| }, | |
| { | |
| "loss": 0.8221, | |
| "grad_norm": 0.313702791929245, | |
| "learning_rate": 9.997363928456144e-05, | |
| "entropy": 0.8353541791439056, | |
| "num_tokens": 3117811.0, | |
| "mean_token_accuracy": 0.7830256283283233, | |
| "epoch": 0.036152356358941255, | |
| "step": 140 | |
| }, | |
| { | |
| "loss": 0.8141, | |
| "grad_norm": 0.34913378953933716, | |
| "learning_rate": 9.995838996722914e-05, | |
| "entropy": 0.8243900567293168, | |
| "num_tokens": 3335852.0, | |
| "mean_token_accuracy": 0.7871499985456467, | |
| "epoch": 0.03873466752743705, | |
| "step": 150 | |
| }, | |
| { | |
| "loss": 0.9178, | |
| "grad_norm": 0.3551943302154541, | |
| "learning_rate": 9.99396770153754e-05, | |
| "entropy": 0.9281795680522918, | |
| "num_tokens": 3547153.0, | |
| "mean_token_accuracy": 0.7648159861564636, | |
| "epoch": 0.04131697869593286, | |
| "step": 160 | |
| }, | |
| { | |
| "loss": 0.8314, | |
| "grad_norm": 0.3343612849712372, | |
| "learning_rate": 9.991750172637635e-05, | |
| "entropy": 0.8414248585700989, | |
| "num_tokens": 3774587.0, | |
| "mean_token_accuracy": 0.7814970403909683, | |
| "epoch": 0.04389928986442866, | |
| "step": 170 | |
| }, | |
| { | |
| "loss": 0.8441, | |
| "grad_norm": 0.38748034834861755, | |
| "learning_rate": 9.989186563765343e-05, | |
| "entropy": 0.8512898027896881, | |
| "num_tokens": 3987027.0, | |
| "mean_token_accuracy": 0.7771281927824021, | |
| "epoch": 0.046481601032924466, | |
| "step": 180 | |
| }, | |
| { | |
| "loss": 0.8497, | |
| "grad_norm": 0.3136347830295563, | |
| "learning_rate": 9.986277052656658e-05, | |
| "entropy": 0.8494892001152039, | |
| "num_tokens": 4224162.0, | |
| "mean_token_accuracy": 0.7764846205711364, | |
| "epoch": 0.04906391220142027, | |
| "step": 190 | |
| }, | |
| { | |
| "loss": 0.8767, | |
| "grad_norm": 0.2760690152645111, | |
| "learning_rate": 9.983021841029111e-05, | |
| "entropy": 0.8813158571720123, | |
| "num_tokens": 4461540.0, | |
| "mean_token_accuracy": 0.7718124866485596, | |
| "epoch": 0.051646223369916075, | |
| "step": 200 | |
| }, | |
| { | |
| "loss": 0.8616, | |
| "grad_norm": 0.3373132646083832, | |
| "learning_rate": 9.979421154567785e-05, | |
| "entropy": 0.8658105224370957, | |
| "num_tokens": 4675517.0, | |
| "mean_token_accuracy": 0.7753888577222824, | |
| "epoch": 0.05422853453841188, | |
| "step": 210 | |
| }, | |
| { | |
| "loss": 0.8315, | |
| "grad_norm": 0.3122693598270416, | |
| "learning_rate": 9.975475242909667e-05, | |
| "entropy": 0.8357755273580552, | |
| "num_tokens": 4893926.0, | |
| "mean_token_accuracy": 0.7797913908958435, | |
| "epoch": 0.056810845706907684, | |
| "step": 220 | |
| }, | |
| { | |
| "loss": 0.777, | |
| "grad_norm": 0.31195691227912903, | |
| "learning_rate": 9.971184379626339e-05, | |
| "entropy": 0.7857547640800476, | |
| "num_tokens": 5119968.0, | |
| "mean_token_accuracy": 0.7917549222707748, | |
| "epoch": 0.05939315687540349, | |
| "step": 230 | |
| }, | |
| { | |
| "loss": 0.8304, | |
| "grad_norm": 0.39890217781066895, | |
| "learning_rate": 9.966548862205016e-05, | |
| "entropy": 0.8422511845827103, | |
| "num_tokens": 5342685.0, | |
| "mean_token_accuracy": 0.7815836250782013, | |
| "epoch": 0.06197546804389929, | |
| "step": 240 | |
| }, | |
| { | |
| "loss": 0.7829, | |
| "grad_norm": 0.295213907957077, | |
| "learning_rate": 9.961569012027915e-05, | |
| "entropy": 0.7870919585227967, | |
| "num_tokens": 5570339.0, | |
| "mean_token_accuracy": 0.7925563097000122, | |
| "epoch": 0.0645577792123951, | |
| "step": 250 | |
| }, | |
| { | |
| "loss": 0.8125, | |
| "grad_norm": 0.3115305006504059, | |
| "learning_rate": 9.956245174349976e-05, | |
| "entropy": 0.8125756323337555, | |
| "num_tokens": 5811219.0, | |
| "mean_token_accuracy": 0.78687684237957, | |
| "epoch": 0.0671400903808909, | |
| "step": 260 | |
| }, | |
| { | |
| "loss": 0.808, | |
| "grad_norm": 0.3087800443172455, | |
| "learning_rate": 9.950577718274927e-05, | |
| "entropy": 0.8085541218519211, | |
| "num_tokens": 6027751.0, | |
| "mean_token_accuracy": 0.7848722189664841, | |
| "epoch": 0.0697224015493867, | |
| "step": 270 | |
| }, | |
| { | |
| "loss": 0.793, | |
| "grad_norm": 0.3529994487762451, | |
| "learning_rate": 9.944567036729695e-05, | |
| "entropy": 0.8006815254688263, | |
| "num_tokens": 6244346.0, | |
| "mean_token_accuracy": 0.78866326212883, | |
| "epoch": 0.07230471271788251, | |
| "step": 280 | |
| }, | |
| { | |
| "loss": 0.8033, | |
| "grad_norm": 0.33018946647644043, | |
| "learning_rate": 9.938213546437154e-05, | |
| "entropy": 0.8124665051698685, | |
| "num_tokens": 6468953.0, | |
| "mean_token_accuracy": 0.7864397615194321, | |
| "epoch": 0.07488702388637831, | |
| "step": 290 | |
| }, | |
| { | |
| "loss": 0.8236, | |
| "grad_norm": 0.3586885929107666, | |
| "learning_rate": 9.93151768788725e-05, | |
| "entropy": 0.822173684835434, | |
| "num_tokens": 6683224.0, | |
| "mean_token_accuracy": 0.783802455663681, | |
| "epoch": 0.0774693350548741, | |
| "step": 300 | |
| }, | |
| { | |
| "loss": 0.803, | |
| "grad_norm": 0.36473768949508667, | |
| "learning_rate": 9.924479925306448e-05, | |
| "entropy": 0.8035257875919342, | |
| "num_tokens": 6899814.0, | |
| "mean_token_accuracy": 0.7880063384771347, | |
| "epoch": 0.08005164622336991, | |
| "step": 310 | |
| }, | |
| { | |
| "loss": 0.7717, | |
| "grad_norm": 0.29985126852989197, | |
| "learning_rate": 9.917100746625552e-05, | |
| "entropy": 0.7767777234315872, | |
| "num_tokens": 7127264.0, | |
| "mean_token_accuracy": 0.7941146910190582, | |
| "epoch": 0.08263395739186571, | |
| "step": 320 | |
| }, | |
| { | |
| "loss": 0.8628, | |
| "grad_norm": 0.38821646571159363, | |
| "learning_rate": 9.90938066344588e-05, | |
| "entropy": 0.8686172008514405, | |
| "num_tokens": 7346312.0, | |
| "mean_token_accuracy": 0.7750474840402604, | |
| "epoch": 0.08521626856036152, | |
| "step": 330 | |
| }, | |
| { | |
| "loss": 0.7993, | |
| "grad_norm": 0.2704952657222748, | |
| "learning_rate": 9.901320211003786e-05, | |
| "entropy": 0.8089859664440155, | |
| "num_tokens": 7568639.0, | |
| "mean_token_accuracy": 0.7889086365699768, | |
| "epoch": 0.08779857972885732, | |
| "step": 340 | |
| }, | |
| { | |
| "loss": 0.8385, | |
| "grad_norm": 0.3369337022304535, | |
| "learning_rate": 9.892919948133558e-05, | |
| "entropy": 0.8445587635040284, | |
| "num_tokens": 7782903.0, | |
| "mean_token_accuracy": 0.7784519046545029, | |
| "epoch": 0.09038089089735313, | |
| "step": 350 | |
| }, | |
| { | |
| "loss": 0.8203, | |
| "grad_norm": 0.3692948818206787, | |
| "learning_rate": 9.884180457228678e-05, | |
| "entropy": 0.8273489683866501, | |
| "num_tokens": 7992733.0, | |
| "mean_token_accuracy": 0.7822743952274323, | |
| "epoch": 0.09296320206584893, | |
| "step": 360 | |
| }, | |
| { | |
| "loss": 0.7603, | |
| "grad_norm": 0.27002501487731934, | |
| "learning_rate": 9.875102344201428e-05, | |
| "entropy": 0.7642450213432312, | |
| "num_tokens": 8229494.0, | |
| "mean_token_accuracy": 0.7939036816358567, | |
| "epoch": 0.09554551323434474, | |
| "step": 370 | |
| }, | |
| { | |
| "loss": 0.8318, | |
| "grad_norm": 0.29771047830581665, | |
| "learning_rate": 9.865686238440905e-05, | |
| "entropy": 0.8396833568811417, | |
| "num_tokens": 8451547.0, | |
| "mean_token_accuracy": 0.7823069781064987, | |
| "epoch": 0.09812782440284054, | |
| "step": 380 | |
| }, | |
| { | |
| "loss": 0.8068, | |
| "grad_norm": 0.26213449239730835, | |
| "learning_rate": 9.855932792769367e-05, | |
| "entropy": 0.8070472866296768, | |
| "num_tokens": 8659006.0, | |
| "mean_token_accuracy": 0.7854753315448761, | |
| "epoch": 0.10071013557133635, | |
| "step": 390 | |
| }, | |
| { | |
| "loss": 0.8116, | |
| "grad_norm": 0.3281964361667633, | |
| "learning_rate": 9.845842683396975e-05, | |
| "entropy": 0.8121274918317795, | |
| "num_tokens": 8880841.0, | |
| "mean_token_accuracy": 0.785520127415657, | |
| "epoch": 0.10329244673983215, | |
| "step": 400 | |
| }, | |
| { | |
| "loss": 0.7812, | |
| "grad_norm": 0.28127405047416687, | |
| "learning_rate": 9.835416609874923e-05, | |
| "entropy": 0.782671543955803, | |
| "num_tokens": 9105513.0, | |
| "mean_token_accuracy": 0.7911499351263046, | |
| "epoch": 0.10587475790832795, | |
| "step": 410 | |
| }, | |
| { | |
| "loss": 0.7683, | |
| "grad_norm": 0.3277055323123932, | |
| "learning_rate": 9.824655295046925e-05, | |
| "entropy": 0.7686008930206298, | |
| "num_tokens": 9328555.0, | |
| "mean_token_accuracy": 0.7945983529090881, | |
| "epoch": 0.10845706907682376, | |
| "step": 420 | |
| }, | |
| { | |
| "loss": 0.797, | |
| "grad_norm": 0.339370459318161, | |
| "learning_rate": 9.813559484999102e-05, | |
| "entropy": 0.7977692514657975, | |
| "num_tokens": 9548239.0, | |
| "mean_token_accuracy": 0.7889350891113281, | |
| "epoch": 0.11103938024531956, | |
| "step": 430 | |
| }, | |
| { | |
| "loss": 0.7888, | |
| "grad_norm": 0.2581377923488617, | |
| "learning_rate": 9.802129949008265e-05, | |
| "entropy": 0.7947029381990433, | |
| "num_tokens": 9773006.0, | |
| "mean_token_accuracy": 0.7894155353307724, | |
| "epoch": 0.11362169141381537, | |
| "step": 440 | |
| }, | |
| { | |
| "loss": 0.7939, | |
| "grad_norm": 0.2780890464782715, | |
| "learning_rate": 9.790367479488567e-05, | |
| "entropy": 0.7900374501943588, | |
| "num_tokens": 10007849.0, | |
| "mean_token_accuracy": 0.7889153778553009, | |
| "epoch": 0.11620400258231117, | |
| "step": 450 | |
| }, | |
| { | |
| "loss": 0.7843, | |
| "grad_norm": 0.35706827044487, | |
| "learning_rate": 9.778272891936578e-05, | |
| "entropy": 0.7888166964054107, | |
| "num_tokens": 10228340.0, | |
| "mean_token_accuracy": 0.7894989937543869, | |
| "epoch": 0.11878631375080698, | |
| "step": 460 | |
| }, | |
| { | |
| "loss": 0.7818, | |
| "grad_norm": 0.28400421142578125, | |
| "learning_rate": 9.765847024874732e-05, | |
| "entropy": 0.788197135925293, | |
| "num_tokens": 10449248.0, | |
| "mean_token_accuracy": 0.7903656899929047, | |
| "epoch": 0.12136862491930278, | |
| "step": 470 | |
| }, | |
| { | |
| "loss": 0.7724, | |
| "grad_norm": 0.3045465648174286, | |
| "learning_rate": 9.753090739793208e-05, | |
| "entropy": 0.7795355767011642, | |
| "num_tokens": 10693608.0, | |
| "mean_token_accuracy": 0.792780527472496, | |
| "epoch": 0.12395093608779859, | |
| "step": 480 | |
| }, | |
| { | |
| "loss": 0.8257, | |
| "grad_norm": 0.2519194781780243, | |
| "learning_rate": 9.740004921090188e-05, | |
| "entropy": 0.8308536618947983, | |
| "num_tokens": 10916236.0, | |
| "mean_token_accuracy": 0.7820773988962173, | |
| "epoch": 0.1265332472562944, | |
| "step": 490 | |
| }, | |
| { | |
| "loss": 0.89, | |
| "grad_norm": 0.26520147919654846, | |
| "learning_rate": 9.726590476010548e-05, | |
| "entropy": 0.891560536623001, | |
| "num_tokens": 11141566.0, | |
| "mean_token_accuracy": 0.7719975858926773, | |
| "epoch": 0.1291155584247902, | |
| "step": 500 | |
| }, | |
| { | |
| "loss": 0.7747, | |
| "grad_norm": 0.3187003433704376, | |
| "learning_rate": 9.712848334582965e-05, | |
| "entropy": 0.7743502765893936, | |
| "num_tokens": 11355026.0, | |
| "mean_token_accuracy": 0.7932853370904922, | |
| "epoch": 0.131697869593286, | |
| "step": 510 | |
| }, | |
| { | |
| "loss": 0.7687, | |
| "grad_norm": 0.2777857780456543, | |
| "learning_rate": 9.698779449555421e-05, | |
| "entropy": 0.7742303222417831, | |
| "num_tokens": 11568694.0, | |
| "mean_token_accuracy": 0.7947982579469681, | |
| "epoch": 0.1342801807617818, | |
| "step": 520 | |
| }, | |
| { | |
| "loss": 0.7613, | |
| "grad_norm": 0.26509273052215576, | |
| "learning_rate": 9.684384796329165e-05, | |
| "entropy": 0.7646809101104737, | |
| "num_tokens": 11801619.0, | |
| "mean_token_accuracy": 0.7968462467193603, | |
| "epoch": 0.1368624919302776, | |
| "step": 530 | |
| }, | |
| { | |
| "loss": 0.7922, | |
| "grad_norm": 0.30422940850257874, | |
| "learning_rate": 9.669665372891076e-05, | |
| "entropy": 0.797106859087944, | |
| "num_tokens": 12028866.0, | |
| "mean_token_accuracy": 0.7882502853870392, | |
| "epoch": 0.1394448030987734, | |
| "step": 540 | |
| }, | |
| { | |
| "loss": 0.7981, | |
| "grad_norm": 0.3401212692260742, | |
| "learning_rate": 9.654622199744485e-05, | |
| "entropy": 0.802043342590332, | |
| "num_tokens": 12231531.0, | |
| "mean_token_accuracy": 0.788466426730156, | |
| "epoch": 0.14202711426726922, | |
| "step": 550 | |
| }, | |
| { | |
| "loss": 0.7113, | |
| "grad_norm": 0.2457897961139679, | |
| "learning_rate": 9.639256319838408e-05, | |
| "entropy": 0.713046595454216, | |
| "num_tokens": 12467239.0, | |
| "mean_token_accuracy": 0.8070330142974853, | |
| "epoch": 0.14460942543576502, | |
| "step": 560 | |
| }, | |
| { | |
| "loss": 0.7966, | |
| "grad_norm": 0.36826762557029724, | |
| "learning_rate": 9.62356879849525e-05, | |
| "entropy": 0.8055638283491134, | |
| "num_tokens": 12691681.0, | |
| "mean_token_accuracy": 0.789168867468834, | |
| "epoch": 0.14719173660426083, | |
| "step": 570 | |
| }, | |
| { | |
| "loss": 0.77, | |
| "grad_norm": 0.23838452994823456, | |
| "learning_rate": 9.607560723336941e-05, | |
| "entropy": 0.7768332690000535, | |
| "num_tokens": 12916444.0, | |
| "mean_token_accuracy": 0.795351967215538, | |
| "epoch": 0.14977404777275663, | |
| "step": 580 | |
| }, | |
| { | |
| "loss": 0.7308, | |
| "grad_norm": 0.2863340377807617, | |
| "learning_rate": 9.591233204209532e-05, | |
| "entropy": 0.7349940836429596, | |
| "num_tokens": 13132831.0, | |
| "mean_token_accuracy": 0.8030821114778519, | |
| "epoch": 0.15235635894125243, | |
| "step": 590 | |
| }, | |
| { | |
| "loss": 0.7557, | |
| "grad_norm": 0.2508617639541626, | |
| "learning_rate": 9.574587373106241e-05, | |
| "entropy": 0.7568652123212815, | |
| "num_tokens": 13351405.0, | |
| "mean_token_accuracy": 0.7971890032291412, | |
| "epoch": 0.1549386701097482, | |
| "step": 600 | |
| }, | |
| { | |
| "loss": 0.7656, | |
| "grad_norm": 0.2594195008277893, | |
| "learning_rate": 9.557624384088988e-05, | |
| "entropy": 0.7718629479408264, | |
| "num_tokens": 13586644.0, | |
| "mean_token_accuracy": 0.796920120716095, | |
| "epoch": 0.15752098127824402, | |
| "step": 610 | |
| }, | |
| { | |
| "loss": 0.8103, | |
| "grad_norm": 0.26011860370635986, | |
| "learning_rate": 9.540345413208371e-05, | |
| "entropy": 0.8115774095058441, | |
| "num_tokens": 13807188.0, | |
| "mean_token_accuracy": 0.7873918890953064, | |
| "epoch": 0.16010329244673982, | |
| "step": 620 | |
| }, | |
| { | |
| "loss": 0.7465, | |
| "grad_norm": 0.26401853561401367, | |
| "learning_rate": 9.522751658422128e-05, | |
| "entropy": 0.7476193904876709, | |
| "num_tokens": 14037773.0, | |
| "mean_token_accuracy": 0.7999627381563187, | |
| "epoch": 0.16268560361523562, | |
| "step": 630 | |
| }, | |
| { | |
| "loss": 0.7885, | |
| "grad_norm": 0.2792288064956665, | |
| "learning_rate": 9.504844339512095e-05, | |
| "entropy": 0.7934227615594864, | |
| "num_tokens": 14250931.0, | |
| "mean_token_accuracy": 0.7914494186639786, | |
| "epoch": 0.16526791478373143, | |
| "step": 640 | |
| }, | |
| { | |
| "loss": 0.8174, | |
| "grad_norm": 0.265168696641922, | |
| "learning_rate": 9.486624697999625e-05, | |
| "entropy": 0.8207276076078415, | |
| "num_tokens": 14472590.0, | |
| "mean_token_accuracy": 0.7836927026510239, | |
| "epoch": 0.16785022595222723, | |
| "step": 650 | |
| }, | |
| { | |
| "loss": 0.7785, | |
| "grad_norm": 0.30072885751724243, | |
| "learning_rate": 9.468093997059518e-05, | |
| "entropy": 0.7893655002117157, | |
| "num_tokens": 14696942.0, | |
| "mean_token_accuracy": 0.7929514676332474, | |
| "epoch": 0.17043253712072304, | |
| "step": 660 | |
| }, | |
| { | |
| "loss": 0.803, | |
| "grad_norm": 0.2837442457675934, | |
| "learning_rate": 9.449253521432447e-05, | |
| "entropy": 0.8055104643106461, | |
| "num_tokens": 14917939.0, | |
| "mean_token_accuracy": 0.7881365656852722, | |
| "epoch": 0.17301484828921884, | |
| "step": 670 | |
| }, | |
| { | |
| "loss": 0.7755, | |
| "grad_norm": 0.3586549460887909, | |
| "learning_rate": 9.430104577335881e-05, | |
| "entropy": 0.785092431306839, | |
| "num_tokens": 15126055.0, | |
| "mean_token_accuracy": 0.7926269322633743, | |
| "epoch": 0.17559715945771465, | |
| "step": 680 | |
| }, | |
| { | |
| "loss": 0.7333, | |
| "grad_norm": 0.2736169695854187, | |
| "learning_rate": 9.41064849237353e-05, | |
| "entropy": 0.7377870678901672, | |
| "num_tokens": 15345502.0, | |
| "mean_token_accuracy": 0.800521832704544, | |
| "epoch": 0.17817947062621045, | |
| "step": 690 | |
| }, | |
| { | |
| "loss": 0.7181, | |
| "grad_norm": 0.24931730329990387, | |
| "learning_rate": 9.390886615443299e-05, | |
| "entropy": 0.7208522915840149, | |
| "num_tokens": 15576219.0, | |
| "mean_token_accuracy": 0.8049475431442261, | |
| "epoch": 0.18076178179470626, | |
| "step": 700 | |
| }, | |
| { | |
| "loss": 0.738, | |
| "grad_norm": 0.28357696533203125, | |
| "learning_rate": 9.370820316643768e-05, | |
| "entropy": 0.735668683052063, | |
| "num_tokens": 15799601.0, | |
| "mean_token_accuracy": 0.8012147158384323, | |
| "epoch": 0.18334409296320206, | |
| "step": 710 | |
| }, | |
| { | |
| "loss": 0.778, | |
| "grad_norm": 0.2858926057815552, | |
| "learning_rate": 9.350450987179202e-05, | |
| "entropy": 0.781666374206543, | |
| "num_tokens": 16018324.0, | |
| "mean_token_accuracy": 0.7933406472206116, | |
| "epoch": 0.18592640413169786, | |
| "step": 720 | |
| }, | |
| { | |
| "loss": 0.7659, | |
| "grad_norm": 0.2707382142543793, | |
| "learning_rate": 9.329780039263101e-05, | |
| "entropy": 0.7684377580881119, | |
| "num_tokens": 16249193.0, | |
| "mean_token_accuracy": 0.7941454380750657, | |
| "epoch": 0.18850871530019367, | |
| "step": 730 | |
| }, | |
| { | |
| "loss": 0.7866, | |
| "grad_norm": 0.272834450006485, | |
| "learning_rate": 9.308808906020287e-05, | |
| "entropy": 0.7898376554250717, | |
| "num_tokens": 16474694.0, | |
| "mean_token_accuracy": 0.7909801512956619, | |
| "epoch": 0.19109102646868947, | |
| "step": 740 | |
| }, | |
| { | |
| "loss": 0.7656, | |
| "grad_norm": 0.2885092794895172, | |
| "learning_rate": 9.287539041387547e-05, | |
| "entropy": 0.7683475404977799, | |
| "num_tokens": 16704204.0, | |
| "mean_token_accuracy": 0.7954743087291718, | |
| "epoch": 0.19367333763718528, | |
| "step": 750 | |
| }, | |
| { | |
| "loss": 0.7307, | |
| "grad_norm": 0.3181609809398651, | |
| "learning_rate": 9.265971920012837e-05, | |
| "entropy": 0.735710608959198, | |
| "num_tokens": 16933045.0, | |
| "mean_token_accuracy": 0.8025461912155152, | |
| "epoch": 0.19625564880568108, | |
| "step": 760 | |
| }, | |
| { | |
| "loss": 0.7112, | |
| "grad_norm": 0.2486344277858734, | |
| "learning_rate": 9.244109037153029e-05, | |
| "entropy": 0.7170950889587402, | |
| "num_tokens": 17164834.0, | |
| "mean_token_accuracy": 0.8034955650568009, | |
| "epoch": 0.1988379599741769, | |
| "step": 770 | |
| }, | |
| { | |
| "loss": 0.708, | |
| "grad_norm": 0.27479955554008484, | |
| "learning_rate": 9.221951908570258e-05, | |
| "entropy": 0.7130055457353592, | |
| "num_tokens": 17391047.0, | |
| "mean_token_accuracy": 0.8064507722854615, | |
| "epoch": 0.2014202711426727, | |
| "step": 780 | |
| }, | |
| { | |
| "loss": 0.7404, | |
| "grad_norm": 0.27712559700012207, | |
| "learning_rate": 9.199502070426833e-05, | |
| "entropy": 0.7458436578512192, | |
| "num_tokens": 17600014.0, | |
| "mean_token_accuracy": 0.8010766774415969, | |
| "epoch": 0.2040025823111685, | |
| "step": 790 | |
| }, | |
| { | |
| "loss": 0.7459, | |
| "grad_norm": 0.23461352288722992, | |
| "learning_rate": 9.176761079178717e-05, | |
| "entropy": 0.7532986223697662, | |
| "num_tokens": 17839375.0, | |
| "mean_token_accuracy": 0.7996405005455017, | |
| "epoch": 0.2065848934796643, | |
| "step": 800 | |
| }, | |
| { | |
| "loss": 0.7889, | |
| "grad_norm": 0.3072088658809662, | |
| "learning_rate": 9.153730511467646e-05, | |
| "entropy": 0.7842026621103286, | |
| "num_tokens": 18051756.0, | |
| "mean_token_accuracy": 0.7949094295501709, | |
| "epoch": 0.2091672046481601, | |
| "step": 810 | |
| }, | |
| { | |
| "loss": 0.7256, | |
| "grad_norm": 0.2747524380683899, | |
| "learning_rate": 9.130411964011795e-05, | |
| "entropy": 0.7265078127384186, | |
| "num_tokens": 18271944.0, | |
| "mean_token_accuracy": 0.8043490052223206, | |
| "epoch": 0.2117495158166559, | |
| "step": 820 | |
| }, | |
| { | |
| "loss": 0.7522, | |
| "grad_norm": 0.2839372456073761, | |
| "learning_rate": 9.10680705349509e-05, | |
| "entropy": 0.7600590527057648, | |
| "num_tokens": 18493573.0, | |
| "mean_token_accuracy": 0.7979195952415467, | |
| "epoch": 0.2143318269851517, | |
| "step": 830 | |
| }, | |
| { | |
| "loss": 0.7184, | |
| "grad_norm": 0.26603153347969055, | |
| "learning_rate": 9.082917416455114e-05, | |
| "entropy": 0.727937662601471, | |
| "num_tokens": 18732461.0, | |
| "mean_token_accuracy": 0.8036612659692765, | |
| "epoch": 0.21691413815364752, | |
| "step": 840 | |
| }, | |
| { | |
| "loss": 0.7606, | |
| "grad_norm": 0.28745344281196594, | |
| "learning_rate": 9.058744709169657e-05, | |
| "entropy": 0.7691597014665603, | |
| "num_tokens": 18948836.0, | |
| "mean_token_accuracy": 0.7954544007778168, | |
| "epoch": 0.21949644932214332, | |
| "step": 850 | |
| }, | |
| { | |
| "loss": 0.7432, | |
| "grad_norm": 0.24271559715270996, | |
| "learning_rate": 9.034290607541875e-05, | |
| "entropy": 0.7455548018217086, | |
| "num_tokens": 19175411.0, | |
| "mean_token_accuracy": 0.7987944543361664, | |
| "epoch": 0.22207876049063913, | |
| "step": 860 | |
| }, | |
| { | |
| "loss": 0.7808, | |
| "grad_norm": 0.26113608479499817, | |
| "learning_rate": 9.009556806984102e-05, | |
| "entropy": 0.7865129858255386, | |
| "num_tokens": 19401674.0, | |
| "mean_token_accuracy": 0.7918182015419006, | |
| "epoch": 0.22466107165913493, | |
| "step": 870 | |
| }, | |
| { | |
| "loss": 0.7627, | |
| "grad_norm": 0.2622512876987457, | |
| "learning_rate": 8.984545022300308e-05, | |
| "entropy": 0.7637326002120972, | |
| "num_tokens": 19616114.0, | |
| "mean_token_accuracy": 0.796609079837799, | |
| "epoch": 0.22724338282763074, | |
| "step": 880 | |
| }, | |
| { | |
| "loss": 0.7635, | |
| "grad_norm": 0.28230419754981995, | |
| "learning_rate": 8.95925698756721e-05, | |
| "entropy": 0.7649184703826905, | |
| "num_tokens": 19833283.0, | |
| "mean_token_accuracy": 0.7957229435443878, | |
| "epoch": 0.22982569399612654, | |
| "step": 890 | |
| }, | |
| { | |
| "loss": 0.7139, | |
| "grad_norm": 0.2590973675251007, | |
| "learning_rate": 8.93369445601405e-05, | |
| "entropy": 0.7141411513090133, | |
| "num_tokens": 20067846.0, | |
| "mean_token_accuracy": 0.8063762694597244, | |
| "epoch": 0.23240800516462234, | |
| "step": 900 | |
| }, | |
| { | |
| "loss": 0.7258, | |
| "grad_norm": 0.29914793372154236, | |
| "learning_rate": 8.907859199901036e-05, | |
| "entropy": 0.7249648362398148, | |
| "num_tokens": 20282958.0, | |
| "mean_token_accuracy": 0.8041715890169143, | |
| "epoch": 0.23499031633311815, | |
| "step": 910 | |
| }, | |
| { | |
| "loss": 0.7468, | |
| "grad_norm": 0.33904722332954407, | |
| "learning_rate": 8.881753010396483e-05, | |
| "entropy": 0.7552884995937348, | |
| "num_tokens": 20514217.0, | |
| "mean_token_accuracy": 0.7984365493059158, | |
| "epoch": 0.23757262750161395, | |
| "step": 920 | |
| }, | |
| { | |
| "loss": 0.7469, | |
| "grad_norm": 0.2916295528411865, | |
| "learning_rate": 8.855377697452613e-05, | |
| "entropy": 0.7542608201503753, | |
| "num_tokens": 20742976.0, | |
| "mean_token_accuracy": 0.8005632907152176, | |
| "epoch": 0.24015493867010976, | |
| "step": 930 | |
| }, | |
| { | |
| "loss": 0.7635, | |
| "grad_norm": 0.2670243978500366, | |
| "learning_rate": 8.828735089680089e-05, | |
| "entropy": 0.7616350531578064, | |
| "num_tokens": 20967064.0, | |
| "mean_token_accuracy": 0.796535101532936, | |
| "epoch": 0.24273724983860556, | |
| "step": 940 | |
| }, | |
| { | |
| "loss": 0.7721, | |
| "grad_norm": 0.32767635583877563, | |
| "learning_rate": 8.801827034221222e-05, | |
| "entropy": 0.7816158264875412, | |
| "num_tokens": 21198991.0, | |
| "mean_token_accuracy": 0.7934033811092377, | |
| "epoch": 0.24531956100710137, | |
| "step": 950 | |
| }, | |
| { | |
| "loss": 0.7324, | |
| "grad_norm": 0.2466670572757721, | |
| "learning_rate": 8.77465539662192e-05, | |
| "entropy": 0.7298496663570404, | |
| "num_tokens": 21416342.0, | |
| "mean_token_accuracy": 0.8032382041215896, | |
| "epoch": 0.24790187217559717, | |
| "step": 960 | |
| }, | |
| { | |
| "loss": 0.7203, | |
| "grad_norm": 0.32663026452064514, | |
| "learning_rate": 8.747222060702336e-05, | |
| "entropy": 0.7248791009187698, | |
| "num_tokens": 21639020.0, | |
| "mean_token_accuracy": 0.8066118597984314, | |
| "epoch": 0.25048418334409295, | |
| "step": 970 | |
| }, | |
| { | |
| "loss": 0.787, | |
| "grad_norm": 0.2980765700340271, | |
| "learning_rate": 8.719528928426273e-05, | |
| "entropy": 0.7887016743421554, | |
| "num_tokens": 21848300.0, | |
| "mean_token_accuracy": 0.7913496702909469, | |
| "epoch": 0.2530664945125888, | |
| "step": 980 | |
| }, | |
| { | |
| "loss": 0.7164, | |
| "grad_norm": 0.2752622067928314, | |
| "learning_rate": 8.691577919769316e-05, | |
| "entropy": 0.7202609479427338, | |
| "num_tokens": 22067594.0, | |
| "mean_token_accuracy": 0.8054278463125228, | |
| "epoch": 0.25564880568108456, | |
| "step": 990 | |
| }, | |
| { | |
| "loss": 0.7741, | |
| "grad_norm": 0.26739493012428284, | |
| "learning_rate": 8.663370972585717e-05, | |
| "entropy": 0.7841310113668442, | |
| "num_tokens": 22292216.0, | |
| "mean_token_accuracy": 0.7926570892333984, | |
| "epoch": 0.2582311168495804, | |
| "step": 1000 | |
| }, | |
| { | |
| "loss": 0.7975, | |
| "grad_norm": 0.2629258334636688, | |
| "learning_rate": 8.634910042474047e-05, | |
| "entropy": 0.792700219154358, | |
| "num_tokens": 22513422.0, | |
| "mean_token_accuracy": 0.7921054184436798, | |
| "epoch": 0.26081342801807617, | |
| "step": 1010 | |
| }, | |
| { | |
| "loss": 0.7759, | |
| "grad_norm": 0.2839512228965759, | |
| "learning_rate": 8.606197102641614e-05, | |
| "entropy": 0.7746599346399308, | |
| "num_tokens": 22738553.0, | |
| "mean_token_accuracy": 0.7940926223993301, | |
| "epoch": 0.263395739186572, | |
| "step": 1020 | |
| }, | |
| { | |
| "loss": 0.7525, | |
| "grad_norm": 0.3013017177581787, | |
| "learning_rate": 8.577234143767649e-05, | |
| "entropy": 0.7538807690143585, | |
| "num_tokens": 22959380.0, | |
| "mean_token_accuracy": 0.7982607930898666, | |
| "epoch": 0.2659780503550678, | |
| "step": 1030 | |
| }, | |
| { | |
| "loss": 0.7531, | |
| "grad_norm": 0.24685463309288025, | |
| "learning_rate": 8.548023173865313e-05, | |
| "entropy": 0.75996915102005, | |
| "num_tokens": 23183220.0, | |
| "mean_token_accuracy": 0.7970141679048538, | |
| "epoch": 0.2685603615235636, | |
| "step": 1040 | |
| }, | |
| { | |
| "loss": 0.7856, | |
| "grad_norm": 0.2837998867034912, | |
| "learning_rate": 8.518566218142454e-05, | |
| "entropy": 0.7838736146688461, | |
| "num_tokens": 23404006.0, | |
| "mean_token_accuracy": 0.7927649199962616, | |
| "epoch": 0.2711426726920594, | |
| "step": 1050 | |
| }, | |
| { | |
| "loss": 0.8175, | |
| "grad_norm": 0.29661616683006287, | |
| "learning_rate": 8.488865318861224e-05, | |
| "entropy": 0.8234259188175201, | |
| "num_tokens": 23628937.0, | |
| "mean_token_accuracy": 0.7849996358156204, | |
| "epoch": 0.2737249838605552, | |
| "step": 1060 | |
| }, | |
| { | |
| "loss": 0.722, | |
| "grad_norm": 0.25703901052474976, | |
| "learning_rate": 8.458922535196467e-05, | |
| "entropy": 0.7301439672708512, | |
| "num_tokens": 23865644.0, | |
| "mean_token_accuracy": 0.8037035495042801, | |
| "epoch": 0.276307295029051, | |
| "step": 1070 | |
| }, | |
| { | |
| "loss": 0.7135, | |
| "grad_norm": 0.2932882010936737, | |
| "learning_rate": 8.428739943092971e-05, | |
| "entropy": 0.7183157622814178, | |
| "num_tokens": 24084332.0, | |
| "mean_token_accuracy": 0.8063235968351364, | |
| "epoch": 0.2788896061975468, | |
| "step": 1080 | |
| }, | |
| { | |
| "loss": 0.7216, | |
| "grad_norm": 0.27650290727615356, | |
| "learning_rate": 8.398319635121534e-05, | |
| "entropy": 0.726809737086296, | |
| "num_tokens": 24302911.0, | |
| "mean_token_accuracy": 0.8047617524862289, | |
| "epoch": 0.2814719173660426, | |
| "step": 1090 | |
| }, | |
| { | |
| "loss": 0.7758, | |
| "grad_norm": 0.30116310715675354, | |
| "learning_rate": 8.367663720333887e-05, | |
| "entropy": 0.7749881535768509, | |
| "num_tokens": 24536462.0, | |
| "mean_token_accuracy": 0.792735579609871, | |
| "epoch": 0.28405422853453843, | |
| "step": 1100 | |
| }, | |
| { | |
| "loss": 0.7322, | |
| "grad_norm": 0.268449991941452, | |
| "learning_rate": 8.336774324116472e-05, | |
| "entropy": 0.7460200548171997, | |
| "num_tokens": 24746352.0, | |
| "mean_token_accuracy": 0.8020590513944625, | |
| "epoch": 0.2866365397030342, | |
| "step": 1110 | |
| }, | |
| { | |
| "loss": 0.7794, | |
| "grad_norm": 0.2835862934589386, | |
| "learning_rate": 8.305653588043092e-05, | |
| "entropy": 0.778589203953743, | |
| "num_tokens": 24970626.0, | |
| "mean_token_accuracy": 0.794257864356041, | |
| "epoch": 0.28921885087153004, | |
| "step": 1120 | |
| }, | |
| { | |
| "loss": 0.7859, | |
| "grad_norm": 0.2543643116950989, | |
| "learning_rate": 8.274303669726426e-05, | |
| "entropy": 0.7844608157873154, | |
| "num_tokens": 25202908.0, | |
| "mean_token_accuracy": 0.7923130601644516, | |
| "epoch": 0.2918011620400258, | |
| "step": 1130 | |
| }, | |
| { | |
| "loss": 0.7586, | |
| "grad_norm": 0.29340291023254395, | |
| "learning_rate": 8.242726742668453e-05, | |
| "entropy": 0.7621672093868256, | |
| "num_tokens": 25430498.0, | |
| "mean_token_accuracy": 0.7958371937274933, | |
| "epoch": 0.29438347320852165, | |
| "step": 1140 | |
| }, | |
| { | |
| "loss": 0.796, | |
| "grad_norm": 0.2856576442718506, | |
| "learning_rate": 8.210924996109752e-05, | |
| "entropy": 0.7996839165687561, | |
| "num_tokens": 25655907.0, | |
| "mean_token_accuracy": 0.7891602605581284, | |
| "epoch": 0.2969657843770174, | |
| "step": 1150 | |
| }, | |
| { | |
| "loss": 0.7431, | |
| "grad_norm": 0.31065818667411804, | |
| "learning_rate": 8.178900634877728e-05, | |
| "entropy": 0.7477968245744705, | |
| "num_tokens": 25877824.0, | |
| "mean_token_accuracy": 0.8015917330980301, | |
| "epoch": 0.29954809554551326, | |
| "step": 1160 | |
| }, | |
| { | |
| "loss": 0.7728, | |
| "grad_norm": 0.24908140301704407, | |
| "learning_rate": 8.146655879233744e-05, | |
| "entropy": 0.7769822120666504, | |
| "num_tokens": 26110819.0, | |
| "mean_token_accuracy": 0.7936273038387298, | |
| "epoch": 0.30213040671400904, | |
| "step": 1170 | |
| }, | |
| { | |
| "loss": 0.8123, | |
| "grad_norm": 0.3190557658672333, | |
| "learning_rate": 8.114192964719197e-05, | |
| "entropy": 0.8135595470666885, | |
| "num_tokens": 26324524.0, | |
| "mean_token_accuracy": 0.7863456726074218, | |
| "epoch": 0.30471271788250487, | |
| "step": 1180 | |
| }, | |
| { | |
| "loss": 0.7591, | |
| "grad_norm": 0.28709912300109863, | |
| "learning_rate": 8.081514142000517e-05, | |
| "entropy": 0.76651451587677, | |
| "num_tokens": 26543338.0, | |
| "mean_token_accuracy": 0.7968069106340409, | |
| "epoch": 0.30729502905100065, | |
| "step": 1190 | |
| }, | |
| { | |
| "loss": 0.713, | |
| "grad_norm": 0.3083307445049286, | |
| "learning_rate": 8.04862167671314e-05, | |
| "entropy": 0.7108349561691284, | |
| "num_tokens": 26768445.0, | |
| "mean_token_accuracy": 0.8063337951898575, | |
| "epoch": 0.3098773402194964, | |
| "step": 1200 | |
| }, | |
| { | |
| "loss": 0.7237, | |
| "grad_norm": 0.2864898443222046, | |
| "learning_rate": 8.015517849304419e-05, | |
| "entropy": 0.730633094906807, | |
| "num_tokens": 26986834.0, | |
| "mean_token_accuracy": 0.8043313831090927, | |
| "epoch": 0.31245965138799225, | |
| "step": 1210 | |
| }, | |
| { | |
| "loss": 0.827, | |
| "grad_norm": 0.26509392261505127, | |
| "learning_rate": 7.98220495487553e-05, | |
| "entropy": 0.8305549472570419, | |
| "num_tokens": 27212130.0, | |
| "mean_token_accuracy": 0.7802114427089691, | |
| "epoch": 0.31504196255648803, | |
| "step": 1220 | |
| }, | |
| { | |
| "loss": 0.7437, | |
| "grad_norm": 0.27083709836006165, | |
| "learning_rate": 7.948685303022341e-05, | |
| "entropy": 0.7459201335906982, | |
| "num_tokens": 27440385.0, | |
| "mean_token_accuracy": 0.7997552633285523, | |
| "epoch": 0.31762427372498386, | |
| "step": 1230 | |
| }, | |
| { | |
| "loss": 0.7803, | |
| "grad_norm": 0.27397236227989197, | |
| "learning_rate": 7.914961217675294e-05, | |
| "entropy": 0.7889262080192566, | |
| "num_tokens": 27660653.0, | |
| "mean_token_accuracy": 0.7899086564779282, | |
| "epoch": 0.32020658489347964, | |
| "step": 1240 | |
| }, | |
| { | |
| "loss": 0.6871, | |
| "grad_norm": 0.3037557303905487, | |
| "learning_rate": 7.881035036938287e-05, | |
| "entropy": 0.6911757588386536, | |
| "num_tokens": 27879066.0, | |
| "mean_token_accuracy": 0.8117859929800033, | |
| "epoch": 0.32278889606197547, | |
| "step": 1250 | |
| }, | |
| { | |
| "loss": 0.7546, | |
| "grad_norm": 0.28858184814453125, | |
| "learning_rate": 7.846909112926564e-05, | |
| "entropy": 0.7656300634145736, | |
| "num_tokens": 28099197.0, | |
| "mean_token_accuracy": 0.7982858657836914, | |
| "epoch": 0.32537120723047125, | |
| "step": 1260 | |
| }, | |
| { | |
| "loss": 0.7821, | |
| "grad_norm": 0.2936239540576935, | |
| "learning_rate": 7.812585811603655e-05, | |
| "entropy": 0.781898695230484, | |
| "num_tokens": 28318775.0, | |
| "mean_token_accuracy": 0.7906658828258515, | |
| "epoch": 0.3279535183989671, | |
| "step": 1270 | |
| }, | |
| { | |
| "loss": 0.7853, | |
| "grad_norm": 0.28847548365592957, | |
| "learning_rate": 7.77806751261733e-05, | |
| "entropy": 0.7884863972663879, | |
| "num_tokens": 28535279.0, | |
| "mean_token_accuracy": 0.7927415996789933, | |
| "epoch": 0.33053582956746286, | |
| "step": 1280 | |
| }, | |
| { | |
| "loss": 0.7315, | |
| "grad_norm": 0.2842661440372467, | |
| "learning_rate": 7.743356609134622e-05, | |
| "entropy": 0.7407895147800445, | |
| "num_tokens": 28776109.0, | |
| "mean_token_accuracy": 0.8013403862714767, | |
| "epoch": 0.3331181407359587, | |
| "step": 1290 | |
| }, | |
| { | |
| "loss": 0.7278, | |
| "grad_norm": 0.2504037320613861, | |
| "learning_rate": 7.708455507675913e-05, | |
| "entropy": 0.7367011368274688, | |
| "num_tokens": 28991068.0, | |
| "mean_token_accuracy": 0.8018357723951339, | |
| "epoch": 0.33570045190445447, | |
| "step": 1300 | |
| }, | |
| { | |
| "loss": 0.7557, | |
| "grad_norm": 0.2458297312259674, | |
| "learning_rate": 7.673366627948082e-05, | |
| "entropy": 0.7577834516763687, | |
| "num_tokens": 29210689.0, | |
| "mean_token_accuracy": 0.7981645226478576, | |
| "epoch": 0.3382827630729503, | |
| "step": 1310 | |
| }, | |
| { | |
| "loss": 0.7664, | |
| "grad_norm": 0.300417959690094, | |
| "learning_rate": 7.638092402676747e-05, | |
| "entropy": 0.7715034753084182, | |
| "num_tokens": 29429547.0, | |
| "mean_token_accuracy": 0.79455007314682, | |
| "epoch": 0.3408650742414461, | |
| "step": 1320 | |
| }, | |
| { | |
| "loss": 0.7038, | |
| "grad_norm": 0.31124284863471985, | |
| "learning_rate": 7.602635277437602e-05, | |
| "entropy": 0.7138365268707275, | |
| "num_tokens": 29655589.0, | |
| "mean_token_accuracy": 0.8058148920536041, | |
| "epoch": 0.3434473854099419, | |
| "step": 1330 | |
| }, | |
| { | |
| "loss": 0.7297, | |
| "grad_norm": 0.249782532453537, | |
| "learning_rate": 7.566997710486873e-05, | |
| "entropy": 0.732063102722168, | |
| "num_tokens": 29870517.0, | |
| "mean_token_accuracy": 0.8033244669437408, | |
| "epoch": 0.3460296965784377, | |
| "step": 1340 | |
| }, | |
| { | |
| "loss": 0.7096, | |
| "grad_norm": 0.2807137966156006, | |
| "learning_rate": 7.531182172590876e-05, | |
| "entropy": 0.7162998780608177, | |
| "num_tokens": 30106464.0, | |
| "mean_token_accuracy": 0.806077653169632, | |
| "epoch": 0.3486120077469335, | |
| "step": 1350 | |
| }, | |
| { | |
| "loss": 0.7289, | |
| "grad_norm": 0.26910290122032166, | |
| "learning_rate": 7.495191146854721e-05, | |
| "entropy": 0.7291241198778152, | |
| "num_tokens": 30327778.0, | |
| "mean_token_accuracy": 0.8027620315551758, | |
| "epoch": 0.3511943189154293, | |
| "step": 1360 | |
| }, | |
| { | |
| "loss": 0.6953, | |
| "grad_norm": 0.28704628348350525, | |
| "learning_rate": 7.45902712855016e-05, | |
| "entropy": 0.6994198083877563, | |
| "num_tokens": 30559521.0, | |
| "mean_token_accuracy": 0.8095674574375152, | |
| "epoch": 0.3537766300839251, | |
| "step": 1370 | |
| }, | |
| { | |
| "loss": 0.7287, | |
| "grad_norm": 0.29035428166389465, | |
| "learning_rate": 7.422692624942585e-05, | |
| "entropy": 0.7311005294322968, | |
| "num_tokens": 30789659.0, | |
| "mean_token_accuracy": 0.8028050363063812, | |
| "epoch": 0.3563589412524209, | |
| "step": 1380 | |
| }, | |
| { | |
| "loss": 0.7664, | |
| "grad_norm": 0.3093847930431366, | |
| "learning_rate": 7.386190155117199e-05, | |
| "entropy": 0.7770371347665787, | |
| "num_tokens": 31002624.0, | |
| "mean_token_accuracy": 0.795510447025299, | |
| "epoch": 0.35894125242091673, | |
| "step": 1390 | |
| }, | |
| { | |
| "loss": 0.686, | |
| "grad_norm": 0.25263503193855286, | |
| "learning_rate": 7.349522249804373e-05, | |
| "entropy": 0.6919069796800613, | |
| "num_tokens": 31234430.0, | |
| "mean_token_accuracy": 0.8122060149908066, | |
| "epoch": 0.3615235635894125, | |
| "step": 1400 | |
| }, | |
| { | |
| "loss": 0.7581, | |
| "grad_norm": 0.3199212849140167, | |
| "learning_rate": 7.312691451204178e-05, | |
| "entropy": 0.7603460341691971, | |
| "num_tokens": 31451877.0, | |
| "mean_token_accuracy": 0.7980766743421555, | |
| "epoch": 0.36410587475790834, | |
| "step": 1410 | |
| }, | |
| { | |
| "loss": 0.7254, | |
| "grad_norm": 0.28610947728157043, | |
| "learning_rate": 7.275700312810144e-05, | |
| "entropy": 0.7191128998994827, | |
| "num_tokens": 31674092.0, | |
| "mean_token_accuracy": 0.8061201065778733, | |
| "epoch": 0.3666881859264041, | |
| "step": 1420 | |
| }, | |
| { | |
| "loss": 0.6975, | |
| "grad_norm": 0.2749674320220947, | |
| "learning_rate": 7.238551399232218e-05, | |
| "entropy": 0.6995274364948273, | |
| "num_tokens": 31906321.0, | |
| "mean_token_accuracy": 0.8110896348953247, | |
| "epoch": 0.36927049709489995, | |
| "step": 1430 | |
| }, | |
| { | |
| "loss": 0.7399, | |
| "grad_norm": 0.2573988735675812, | |
| "learning_rate": 7.201247286018967e-05, | |
| "entropy": 0.738545709848404, | |
| "num_tokens": 32125362.0, | |
| "mean_token_accuracy": 0.8022010952234269, | |
| "epoch": 0.37185280826339573, | |
| "step": 1440 | |
| }, | |
| { | |
| "loss": 0.6956, | |
| "grad_norm": 0.2564477026462555, | |
| "learning_rate": 7.163790559479003e-05, | |
| "entropy": 0.703942996263504, | |
| "num_tokens": 32355592.0, | |
| "mean_token_accuracy": 0.809988585114479, | |
| "epoch": 0.37443511943189156, | |
| "step": 1450 | |
| }, | |
| { | |
| "loss": 0.7422, | |
| "grad_norm": 0.2665322721004486, | |
| "learning_rate": 7.12618381650168e-05, | |
| "entropy": 0.7470666587352752, | |
| "num_tokens": 32589331.0, | |
| "mean_token_accuracy": 0.7986473143100739, | |
| "epoch": 0.37701743060038734, | |
| "step": 1460 | |
| }, | |
| { | |
| "loss": 0.7288, | |
| "grad_norm": 0.2737554907798767, | |
| "learning_rate": 7.088429664377051e-05, | |
| "entropy": 0.737991139292717, | |
| "num_tokens": 32816610.0, | |
| "mean_token_accuracy": 0.8015279024839401, | |
| "epoch": 0.37959974176888317, | |
| "step": 1470 | |
| }, | |
| { | |
| "loss": 0.7404, | |
| "grad_norm": 0.28172188997268677, | |
| "learning_rate": 7.050530720615104e-05, | |
| "entropy": 0.7469445317983627, | |
| "num_tokens": 33036716.0, | |
| "mean_token_accuracy": 0.8000003874301911, | |
| "epoch": 0.38218205293737895, | |
| "step": 1480 | |
| }, | |
| { | |
| "loss": 0.7394, | |
| "grad_norm": 0.28893858194351196, | |
| "learning_rate": 7.012489612764284e-05, | |
| "entropy": 0.7446531891822815, | |
| "num_tokens": 33262921.0, | |
| "mean_token_accuracy": 0.7989077031612396, | |
| "epoch": 0.3847643641058748, | |
| "step": 1490 | |
| }, | |
| { | |
| "loss": 0.6756, | |
| "grad_norm": 0.2717651426792145, | |
| "learning_rate": 6.97430897822933e-05, | |
| "entropy": 0.6801834493875504, | |
| "num_tokens": 33489654.0, | |
| "mean_token_accuracy": 0.8149780780076981, | |
| "epoch": 0.38734667527437056, | |
| "step": 1500 | |
| }, | |
| { | |
| "loss": 0.815, | |
| "grad_norm": 0.24628806114196777, | |
| "learning_rate": 6.935991464088416e-05, | |
| "entropy": 0.8130639553070068, | |
| "num_tokens": 33715529.0, | |
| "mean_token_accuracy": 0.7862419575452805, | |
| "epoch": 0.3899289864428664, | |
| "step": 1510 | |
| }, | |
| { | |
| "loss": 0.7699, | |
| "grad_norm": 0.3145904242992401, | |
| "learning_rate": 6.897539726909637e-05, | |
| "entropy": 0.7687238410115242, | |
| "num_tokens": 33937441.0, | |
| "mean_token_accuracy": 0.7945855349302292, | |
| "epoch": 0.39251129761136216, | |
| "step": 1520 | |
| }, | |
| { | |
| "loss": 0.6814, | |
| "grad_norm": 0.2653945982456207, | |
| "learning_rate": 6.85895643256682e-05, | |
| "entropy": 0.6895080089569092, | |
| "num_tokens": 34154899.0, | |
| "mean_token_accuracy": 0.8130895376205445, | |
| "epoch": 0.395093608779858, | |
| "step": 1530 | |
| }, | |
| { | |
| "loss": 0.7194, | |
| "grad_norm": 0.2962811291217804, | |
| "learning_rate": 6.820244256054704e-05, | |
| "entropy": 0.7286921381950379, | |
| "num_tokens": 34376870.0, | |
| "mean_token_accuracy": 0.8044507384300232, | |
| "epoch": 0.3976759199483538, | |
| "step": 1540 | |
| }, | |
| { | |
| "loss": 0.7307, | |
| "grad_norm": 0.2986937463283539, | |
| "learning_rate": 6.781405881303474e-05, | |
| "entropy": 0.7333788439631462, | |
| "num_tokens": 34615762.0, | |
| "mean_token_accuracy": 0.8029743075370789, | |
| "epoch": 0.4002582311168496, | |
| "step": 1550 | |
| }, | |
| { | |
| "loss": 0.7345, | |
| "grad_norm": 0.30212172865867615, | |
| "learning_rate": 6.742444000992694e-05, | |
| "entropy": 0.7404761523008346, | |
| "num_tokens": 34841635.0, | |
| "mean_token_accuracy": 0.8009481459856034, | |
| "epoch": 0.4028405422853454, | |
| "step": 1560 | |
| }, | |
| { | |
| "loss": 0.7055, | |
| "grad_norm": 0.31336578726768494, | |
| "learning_rate": 6.70336131636461e-05, | |
| "entropy": 0.7085718095302582, | |
| "num_tokens": 35065220.0, | |
| "mean_token_accuracy": 0.8091004997491836, | |
| "epoch": 0.4054228534538412, | |
| "step": 1570 | |
| }, | |
| { | |
| "loss": 0.7711, | |
| "grad_norm": 0.2713090181350708, | |
| "learning_rate": 6.66416053703688e-05, | |
| "entropy": 0.7731089383363724, | |
| "num_tokens": 35297893.0, | |
| "mean_token_accuracy": 0.7932068020105362, | |
| "epoch": 0.408005164622337, | |
| "step": 1580 | |
| }, | |
| { | |
| "loss": 0.7632, | |
| "grad_norm": 0.2571941018104553, | |
| "learning_rate": 6.624844380814717e-05, | |
| "entropy": 0.7549047037959099, | |
| "num_tokens": 35529896.0, | |
| "mean_token_accuracy": 0.7992217838764191, | |
| "epoch": 0.41058747579083277, | |
| "step": 1590 | |
| }, | |
| { | |
| "loss": 0.779, | |
| "grad_norm": 0.3086455762386322, | |
| "learning_rate": 6.585415573502455e-05, | |
| "entropy": 0.7779949843883515, | |
| "num_tokens": 35748202.0, | |
| "mean_token_accuracy": 0.795146381855011, | |
| "epoch": 0.4131697869593286, | |
| "step": 1600 | |
| }, | |
| { | |
| "loss": 0.7009, | |
| "grad_norm": 0.28316617012023926, | |
| "learning_rate": 6.545876848714574e-05, | |
| "entropy": 0.7077500820159912, | |
| "num_tokens": 35963953.0, | |
| "mean_token_accuracy": 0.8086227357387543, | |
| "epoch": 0.4157520981278244, | |
| "step": 1610 | |
| }, | |
| { | |
| "loss": 0.6681, | |
| "grad_norm": 0.2951730191707611, | |
| "learning_rate": 6.506230947686172e-05, | |
| "entropy": 0.6698646396398544, | |
| "num_tokens": 36187006.0, | |
| "mean_token_accuracy": 0.8158697307109832, | |
| "epoch": 0.4183344092963202, | |
| "step": 1620 | |
| }, | |
| { | |
| "loss": 0.7176, | |
| "grad_norm": 0.28841638565063477, | |
| "learning_rate": 6.466480619082919e-05, | |
| "entropy": 0.7250782191753388, | |
| "num_tokens": 36397760.0, | |
| "mean_token_accuracy": 0.8054184794425965, | |
| "epoch": 0.420916720464816, | |
| "step": 1630 | |
| }, | |
| { | |
| "loss": 0.689, | |
| "grad_norm": 0.2621630132198334, | |
| "learning_rate": 6.426628618810491e-05, | |
| "entropy": 0.7003191500902176, | |
| "num_tokens": 36618227.0, | |
| "mean_token_accuracy": 0.8108294665813446, | |
| "epoch": 0.4234990316333118, | |
| "step": 1640 | |
| }, | |
| { | |
| "loss": 0.7341, | |
| "grad_norm": 0.25907382369041443, | |
| "learning_rate": 6.386677709823496e-05, | |
| "entropy": 0.7378379344940186, | |
| "num_tokens": 36829652.0, | |
| "mean_token_accuracy": 0.801531919836998, | |
| "epoch": 0.4260813428018076, | |
| "step": 1650 | |
| }, | |
| { | |
| "loss": 0.7483, | |
| "grad_norm": 0.3523082137107849, | |
| "learning_rate": 6.346630661933927e-05, | |
| "entropy": 0.742335370182991, | |
| "num_tokens": 37049020.0, | |
| "mean_token_accuracy": 0.8001793205738068, | |
| "epoch": 0.4286636539703034, | |
| "step": 1660 | |
| }, | |
| { | |
| "loss": 0.7819, | |
| "grad_norm": 0.2581123113632202, | |
| "learning_rate": 6.306490251619121e-05, | |
| "entropy": 0.78986496925354, | |
| "num_tokens": 37270090.0, | |
| "mean_token_accuracy": 0.7922927856445312, | |
| "epoch": 0.4312459651387992, | |
| "step": 1670 | |
| }, | |
| { | |
| "loss": 0.7424, | |
| "grad_norm": 0.28055480122566223, | |
| "learning_rate": 6.266259261829266e-05, | |
| "entropy": 0.7482966244220733, | |
| "num_tokens": 37500241.0, | |
| "mean_token_accuracy": 0.8013508349657059, | |
| "epoch": 0.43382827630729504, | |
| "step": 1680 | |
| }, | |
| { | |
| "loss": 0.8196, | |
| "grad_norm": 0.2631305158138275, | |
| "learning_rate": 6.225940481794463e-05, | |
| "entropy": 0.8206893265247345, | |
| "num_tokens": 37722357.0, | |
| "mean_token_accuracy": 0.7845668315887451, | |
| "epoch": 0.4364105874757908, | |
| "step": 1690 | |
| }, | |
| { | |
| "loss": 0.7689, | |
| "grad_norm": 0.27216827869415283, | |
| "learning_rate": 6.185536706831346e-05, | |
| "entropy": 0.7689682602882385, | |
| "num_tokens": 37950299.0, | |
| "mean_token_accuracy": 0.7950216263532639, | |
| "epoch": 0.43899289864428664, | |
| "step": 1700 | |
| }, | |
| { | |
| "loss": 0.7273, | |
| "grad_norm": 0.30833864212036133, | |
| "learning_rate": 6.145050738149277e-05, | |
| "entropy": 0.7360947251319885, | |
| "num_tokens": 38167912.0, | |
| "mean_token_accuracy": 0.8035003364086151, | |
| "epoch": 0.4415752098127824, | |
| "step": 1710 | |
| }, | |
| { | |
| "loss": 0.7299, | |
| "grad_norm": 0.31663721799850464, | |
| "learning_rate": 6.104485382656142e-05, | |
| "entropy": 0.7425105035305023, | |
| "num_tokens": 38390268.0, | |
| "mean_token_accuracy": 0.8013977319002151, | |
| "epoch": 0.44415752098127825, | |
| "step": 1720 | |
| }, | |
| { | |
| "loss": 0.7602, | |
| "grad_norm": 0.28522592782974243, | |
| "learning_rate": 6.063843452763744e-05, | |
| "entropy": 0.758876496553421, | |
| "num_tokens": 38611911.0, | |
| "mean_token_accuracy": 0.7959836840629577, | |
| "epoch": 0.44673983214977403, | |
| "step": 1730 | |
| }, | |
| { | |
| "loss": 0.747, | |
| "grad_norm": 0.2787739932537079, | |
| "learning_rate": 6.023127766192824e-05, | |
| "entropy": 0.7534547060728073, | |
| "num_tokens": 38830188.0, | |
| "mean_token_accuracy": 0.7991040110588074, | |
| "epoch": 0.44932214331826986, | |
| "step": 1740 | |
| }, | |
| { | |
| "loss": 0.7143, | |
| "grad_norm": 0.3082408010959625, | |
| "learning_rate": 5.982341145777695e-05, | |
| "entropy": 0.7169277727603912, | |
| "num_tokens": 39049777.0, | |
| "mean_token_accuracy": 0.8069923728704452, | |
| "epoch": 0.45190445448676564, | |
| "step": 1750 | |
| }, | |
| { | |
| "loss": 0.7273, | |
| "grad_norm": 0.2924341857433319, | |
| "learning_rate": 5.9414864192705424e-05, | |
| "entropy": 0.7336899906396865, | |
| "num_tokens": 39269635.0, | |
| "mean_token_accuracy": 0.802969342470169, | |
| "epoch": 0.45448676565526147, | |
| "step": 1760 | |
| }, | |
| { | |
| "loss": 0.7215, | |
| "grad_norm": 0.27919134497642517, | |
| "learning_rate": 5.900566419145377e-05, | |
| "entropy": 0.7257662117481232, | |
| "num_tokens": 39487714.0, | |
| "mean_token_accuracy": 0.8041402757167816, | |
| "epoch": 0.45706907682375725, | |
| "step": 1770 | |
| }, | |
| { | |
| "loss": 0.6977, | |
| "grad_norm": 0.25231438875198364, | |
| "learning_rate": 5.859583982401653e-05, | |
| "entropy": 0.6998866051435471, | |
| "num_tokens": 39714170.0, | |
| "mean_token_accuracy": 0.8111774027347565, | |
| "epoch": 0.4596513879922531, | |
| "step": 1780 | |
| }, | |
| { | |
| "loss": 0.742, | |
| "grad_norm": 0.24185897409915924, | |
| "learning_rate": 5.818541950367579e-05, | |
| "entropy": 0.7464011371135711, | |
| "num_tokens": 39927978.0, | |
| "mean_token_accuracy": 0.8016654849052429, | |
| "epoch": 0.46223369916074886, | |
| "step": 1790 | |
| }, | |
| { | |
| "loss": 0.703, | |
| "grad_norm": 0.25616031885147095, | |
| "learning_rate": 5.7774431685031274e-05, | |
| "entropy": 0.7071524858474731, | |
| "num_tokens": 40144816.0, | |
| "mean_token_accuracy": 0.8098747581243515, | |
| "epoch": 0.4648160103292447, | |
| "step": 1800 | |
| }, | |
| { | |
| "loss": 0.6964, | |
| "grad_norm": 0.31371667981147766, | |
| "learning_rate": 5.736290486202759e-05, | |
| "entropy": 0.6939981669187546, | |
| "num_tokens": 40345533.0, | |
| "mean_token_accuracy": 0.8119114637374878, | |
| "epoch": 0.46739832149774047, | |
| "step": 1810 | |
| }, | |
| { | |
| "loss": 0.8156, | |
| "grad_norm": 0.3086910545825958, | |
| "learning_rate": 5.6950867565978735e-05, | |
| "entropy": 0.8133882731199265, | |
| "num_tokens": 40554468.0, | |
| "mean_token_accuracy": 0.7859878808259964, | |
| "epoch": 0.4699806326662363, | |
| "step": 1820 | |
| }, | |
| { | |
| "loss": 0.6717, | |
| "grad_norm": 0.2788376808166504, | |
| "learning_rate": 5.653834836359e-05, | |
| "entropy": 0.6821775257587432, | |
| "num_tokens": 40783644.0, | |
| "mean_token_accuracy": 0.8164048582315445, | |
| "epoch": 0.4725629438347321, | |
| "step": 1830 | |
| }, | |
| { | |
| "loss": 0.7689, | |
| "grad_norm": 0.3284967243671417, | |
| "learning_rate": 5.6125375854977426e-05, | |
| "entropy": 0.7694633185863495, | |
| "num_tokens": 40996183.0, | |
| "mean_token_accuracy": 0.7952580213546753, | |
| "epoch": 0.4751452550032279, | |
| "step": 1840 | |
| }, | |
| { | |
| "loss": 0.7344, | |
| "grad_norm": 0.2739453613758087, | |
| "learning_rate": 5.571197867168493e-05, | |
| "entropy": 0.7373751908540725, | |
| "num_tokens": 41216379.0, | |
| "mean_token_accuracy": 0.8036257177591324, | |
| "epoch": 0.4777275661717237, | |
| "step": 1850 | |
| }, | |
| { | |
| "loss": 0.7846, | |
| "grad_norm": 0.2590038478374481, | |
| "learning_rate": 5.529818547469932e-05, | |
| "entropy": 0.7804774641990662, | |
| "num_tokens": 41444373.0, | |
| "mean_token_accuracy": 0.7919269859790802, | |
| "epoch": 0.4803098773402195, | |
| "step": 1860 | |
| }, | |
| { | |
| "loss": 0.7529, | |
| "grad_norm": 0.34748631715774536, | |
| "learning_rate": 5.488402495246316e-05, | |
| "entropy": 0.7593414902687072, | |
| "num_tokens": 41669265.0, | |
| "mean_token_accuracy": 0.7967518717050552, | |
| "epoch": 0.4828921885087153, | |
| "step": 1870 | |
| }, | |
| { | |
| "loss": 0.7025, | |
| "grad_norm": 0.2958257496356964, | |
| "learning_rate": 5.446952581888586e-05, | |
| "entropy": 0.7056142598390579, | |
| "num_tokens": 41888599.0, | |
| "mean_token_accuracy": 0.8074656426906586, | |
| "epoch": 0.4854744996772111, | |
| "step": 1880 | |
| }, | |
| { | |
| "loss": 0.6623, | |
| "grad_norm": 0.32922279834747314, | |
| "learning_rate": 5.4054716811352816e-05, | |
| "entropy": 0.6658655419945717, | |
| "num_tokens": 42103445.0, | |
| "mean_token_accuracy": 0.8173845767974853, | |
| "epoch": 0.4880568108457069, | |
| "step": 1890 | |
| }, | |
| { | |
| "loss": 0.6835, | |
| "grad_norm": 0.27834922075271606, | |
| "learning_rate": 5.363962668873317e-05, | |
| "entropy": 0.6853355497121811, | |
| "num_tokens": 42324754.0, | |
| "mean_token_accuracy": 0.8134206265211106, | |
| "epoch": 0.49063912201420273, | |
| "step": 1900 | |
| }, | |
| { | |
| "loss": 0.6927, | |
| "grad_norm": 0.36218395829200745, | |
| "learning_rate": 5.322428422938585e-05, | |
| "entropy": 0.6967261880636215, | |
| "num_tokens": 42548676.0, | |
| "mean_token_accuracy": 0.8099091470241546, | |
| "epoch": 0.4932214331826985, | |
| "step": 1910 | |
| }, | |
| { | |
| "loss": 0.7314, | |
| "grad_norm": 0.26530325412750244, | |
| "learning_rate": 5.2808718229164375e-05, | |
| "entropy": 0.7388436585664749, | |
| "num_tokens": 42780674.0, | |
| "mean_token_accuracy": 0.8034113436937332, | |
| "epoch": 0.49580374435119434, | |
| "step": 1920 | |
| }, | |
| { | |
| "loss": 0.7003, | |
| "grad_norm": 0.28567594289779663, | |
| "learning_rate": 5.239295749942049e-05, | |
| "entropy": 0.700775408744812, | |
| "num_tokens": 42993726.0, | |
| "mean_token_accuracy": 0.8110381156206131, | |
| "epoch": 0.4983860555196901, | |
| "step": 1930 | |
| }, | |
| { | |
| "loss": 0.7674, | |
| "grad_norm": 0.2515343725681305, | |
| "learning_rate": 5.197703086500656e-05, | |
| "entropy": 0.7715499877929688, | |
| "num_tokens": 43224427.0, | |
| "mean_token_accuracy": 0.796505457162857, | |
| "epoch": 0.5009683666881859, | |
| "step": 1940 | |
| }, | |
| { | |
| "loss": 0.7176, | |
| "grad_norm": 0.28791186213493347, | |
| "learning_rate": 5.156096716227719e-05, | |
| "entropy": 0.7235445290803909, | |
| "num_tokens": 43435326.0, | |
| "mean_token_accuracy": 0.8063592225313186, | |
| "epoch": 0.5035506778566817, | |
| "step": 1950 | |
| }, | |
| { | |
| "loss": 0.6719, | |
| "grad_norm": 0.2640407383441925, | |
| "learning_rate": 5.114479523709003e-05, | |
| "entropy": 0.6716303527355194, | |
| "num_tokens": 43643191.0, | |
| "mean_token_accuracy": 0.8156255722045899, | |
| "epoch": 0.5061329890251776, | |
| "step": 1960 | |
| }, | |
| { | |
| "loss": 0.7137, | |
| "grad_norm": 0.2662080228328705, | |
| "learning_rate": 5.072854394280578e-05, | |
| "entropy": 0.7221474066376686, | |
| "num_tokens": 43868551.0, | |
| "mean_token_accuracy": 0.8072103232145309, | |
| "epoch": 0.5087153001936734, | |
| "step": 1970 | |
| }, | |
| { | |
| "loss": 0.7388, | |
| "grad_norm": 0.266189306974411, | |
| "learning_rate": 5.031224213828788e-05, | |
| "entropy": 0.7444506168365479, | |
| "num_tokens": 44092695.0, | |
| "mean_token_accuracy": 0.8010527819395066, | |
| "epoch": 0.5112976113621691, | |
| "step": 1980 | |
| }, | |
| { | |
| "loss": 0.7565, | |
| "grad_norm": 0.27445709705352783, | |
| "learning_rate": 4.989591868590162e-05, | |
| "entropy": 0.7575427994132042, | |
| "num_tokens": 44308972.0, | |
| "mean_token_accuracy": 0.7987733721733093, | |
| "epoch": 0.513879922530665, | |
| "step": 1990 | |
| }, | |
| { | |
| "loss": 0.675, | |
| "grad_norm": 0.24522656202316284, | |
| "learning_rate": 4.9479602449513174e-05, | |
| "entropy": 0.6674839437007904, | |
| "num_tokens": 44539892.0, | |
| "mean_token_accuracy": 0.8155701875686645, | |
| "epoch": 0.5164622336991608, | |
| "step": 2000 | |
| }, | |
| { | |
| "loss": 0.6908, | |
| "grad_norm": 0.27964839339256287, | |
| "learning_rate": 4.9063322292488414e-05, | |
| "entropy": 0.6955849170684815, | |
| "num_tokens": 44757659.0, | |
| "mean_token_accuracy": 0.8129804879426956, | |
| "epoch": 0.5190445448676565, | |
| "step": 2010 | |
| }, | |
| { | |
| "loss": 0.714, | |
| "grad_norm": 0.28458577394485474, | |
| "learning_rate": 4.864710707569181e-05, | |
| "entropy": 0.7280423313379287, | |
| "num_tokens": 44969447.0, | |
| "mean_token_accuracy": 0.8045047700405121, | |
| "epoch": 0.5216268560361523, | |
| "step": 2020 | |
| }, | |
| { | |
| "loss": 0.7488, | |
| "grad_norm": 0.3118044435977936, | |
| "learning_rate": 4.8230985655485525e-05, | |
| "entropy": 0.7563327640295029, | |
| "num_tokens": 45191063.0, | |
| "mean_token_accuracy": 0.7996702790260315, | |
| "epoch": 0.5242091672046482, | |
| "step": 2030 | |
| }, | |
| { | |
| "loss": 0.7041, | |
| "grad_norm": 0.2871478796005249, | |
| "learning_rate": 4.781498688172875e-05, | |
| "entropy": 0.7103944063186646, | |
| "num_tokens": 45410657.0, | |
| "mean_token_accuracy": 0.8094416737556458, | |
| "epoch": 0.526791478373144, | |
| "step": 2040 | |
| }, | |
| { | |
| "loss": 0.7267, | |
| "grad_norm": 0.2895406186580658, | |
| "learning_rate": 4.739913959577755e-05, | |
| "entropy": 0.7345693051815033, | |
| "num_tokens": 45633161.0, | |
| "mean_token_accuracy": 0.802816191315651, | |
| "epoch": 0.5293737895416397, | |
| "step": 2050 | |
| }, | |
| { | |
| "loss": 0.7216, | |
| "grad_norm": 0.3061612844467163, | |
| "learning_rate": 4.6983472628485286e-05, | |
| "entropy": 0.7228217035531997, | |
| "num_tokens": 45863461.0, | |
| "mean_token_accuracy": 0.8049194812774658, | |
| "epoch": 0.5319561007101355, | |
| "step": 2060 | |
| }, | |
| { | |
| "loss": 0.7492, | |
| "grad_norm": 0.28822118043899536, | |
| "learning_rate": 4.6568014798203765e-05, | |
| "entropy": 0.7588370621204377, | |
| "num_tokens": 46086225.0, | |
| "mean_token_accuracy": 0.7975336879491806, | |
| "epoch": 0.5345384118786314, | |
| "step": 2070 | |
| }, | |
| { | |
| "loss": 0.7334, | |
| "grad_norm": 0.2947237491607666, | |
| "learning_rate": 4.615279490878522e-05, | |
| "entropy": 0.7364494383335114, | |
| "num_tokens": 46306577.0, | |
| "mean_token_accuracy": 0.8028736978769302, | |
| "epoch": 0.5371207230471272, | |
| "step": 2080 | |
| }, | |
| { | |
| "loss": 0.7402, | |
| "grad_norm": 0.28399375081062317, | |
| "learning_rate": 4.573784174758538e-05, | |
| "entropy": 0.7495404034852982, | |
| "num_tokens": 46521889.0, | |
| "mean_token_accuracy": 0.8020780056715011, | |
| "epoch": 0.5397030342156229, | |
| "step": 2090 | |
| }, | |
| { | |
| "loss": 0.7247, | |
| "grad_norm": 0.3146340548992157, | |
| "learning_rate": 4.532318408346757e-05, | |
| "entropy": 0.732389822602272, | |
| "num_tokens": 46745832.0, | |
| "mean_token_accuracy": 0.8045948296785355, | |
| "epoch": 0.5422853453841188, | |
| "step": 2100 | |
| }, | |
| { | |
| "loss": 0.7203, | |
| "grad_norm": 0.33135199546813965, | |
| "learning_rate": 4.4908850664808245e-05, | |
| "entropy": 0.7190593853592873, | |
| "num_tokens": 46968971.0, | |
| "mean_token_accuracy": 0.8061521053314209, | |
| "epoch": 0.5448676565526146, | |
| "step": 2110 | |
| }, | |
| { | |
| "loss": 0.7253, | |
| "grad_norm": 0.25495028495788574, | |
| "learning_rate": 4.449487021750373e-05, | |
| "entropy": 0.7277108535170556, | |
| "num_tokens": 47177907.0, | |
| "mean_token_accuracy": 0.8039905726909637, | |
| "epoch": 0.5474499677211104, | |
| "step": 2120 | |
| }, | |
| { | |
| "loss": 0.7326, | |
| "grad_norm": 0.2718372046947479, | |
| "learning_rate": 4.4081271442978745e-05, | |
| "entropy": 0.7357953459024429, | |
| "num_tokens": 47405943.0, | |
| "mean_token_accuracy": 0.8034853279590607, | |
| "epoch": 0.5500322788896062, | |
| "step": 2130 | |
| }, | |
| { | |
| "loss": 0.7994, | |
| "grad_norm": 0.29703348875045776, | |
| "learning_rate": 4.3668083016196486e-05, | |
| "entropy": 0.8072989761829377, | |
| "num_tokens": 47625664.0, | |
| "mean_token_accuracy": 0.7880438774824142, | |
| "epoch": 0.552614590058102, | |
| "step": 2140 | |
| }, | |
| { | |
| "loss": 0.7094, | |
| "grad_norm": 0.2337184101343155, | |
| "learning_rate": 4.325533358367058e-05, | |
| "entropy": 0.710915943980217, | |
| "num_tokens": 47847735.0, | |
| "mean_token_accuracy": 0.8080082088708878, | |
| "epoch": 0.5551969012265978, | |
| "step": 2150 | |
| }, | |
| { | |
| "loss": 0.6969, | |
| "grad_norm": 0.25921082496643066, | |
| "learning_rate": 4.284305176147901e-05, | |
| "entropy": 0.7076527655124665, | |
| "num_tokens": 48088651.0, | |
| "mean_token_accuracy": 0.8098387718200684, | |
| "epoch": 0.5577792123950936, | |
| "step": 2160 | |
| }, | |
| { | |
| "loss": 0.727, | |
| "grad_norm": 0.29743558168411255, | |
| "learning_rate": 4.243126613328015e-05, | |
| "entropy": 0.7333346515893936, | |
| "num_tokens": 48311841.0, | |
| "mean_token_accuracy": 0.8033136278390884, | |
| "epoch": 0.5603615235635894, | |
| "step": 2170 | |
| }, | |
| { | |
| "loss": 0.668, | |
| "grad_norm": 0.2839380204677582, | |
| "learning_rate": 4.2020005248331054e-05, | |
| "entropy": 0.680330915749073, | |
| "num_tokens": 48541172.0, | |
| "mean_token_accuracy": 0.8162445783615112, | |
| "epoch": 0.5629438347320852, | |
| "step": 2180 | |
| }, | |
| { | |
| "loss": 0.7175, | |
| "grad_norm": 0.28205934166908264, | |
| "learning_rate": 4.160929761950811e-05, | |
| "entropy": 0.719845575094223, | |
| "num_tokens": 48764865.0, | |
| "mean_token_accuracy": 0.8060573965311051, | |
| "epoch": 0.565526145900581, | |
| "step": 2190 | |
| }, | |
| { | |
| "loss": 0.6914, | |
| "grad_norm": 0.32426223158836365, | |
| "learning_rate": 4.11991717213303e-05, | |
| "entropy": 0.6926346108317375, | |
| "num_tokens": 48994462.0, | |
| "mean_token_accuracy": 0.8123312324285508, | |
| "epoch": 0.5681084570690769, | |
| "step": 2200 | |
| }, | |
| { | |
| "loss": 0.726, | |
| "grad_norm": 0.26659542322158813, | |
| "learning_rate": 4.0789655987984925e-05, | |
| "entropy": 0.7318926438689232, | |
| "num_tokens": 49217593.0, | |
| "mean_token_accuracy": 0.8045945227146148, | |
| "epoch": 0.5706907682375726, | |
| "step": 2210 | |
| }, | |
| { | |
| "loss": 0.7102, | |
| "grad_norm": 0.28396132588386536, | |
| "learning_rate": 4.0380778811356365e-05, | |
| "entropy": 0.7165648102760315, | |
| "num_tokens": 49451191.0, | |
| "mean_token_accuracy": 0.8072537958621979, | |
| "epoch": 0.5732730794060684, | |
| "step": 2220 | |
| }, | |
| { | |
| "loss": 0.6689, | |
| "grad_norm": 0.28661495447158813, | |
| "learning_rate": 3.99725685390576e-05, | |
| "entropy": 0.6788897529244423, | |
| "num_tokens": 49682456.0, | |
| "mean_token_accuracy": 0.815646868944168, | |
| "epoch": 0.5758553905745643, | |
| "step": 2230 | |
| }, | |
| { | |
| "loss": 0.7025, | |
| "grad_norm": 0.2851995825767517, | |
| "learning_rate": 3.956505347246487e-05, | |
| "entropy": 0.7025539427995682, | |
| "num_tokens": 49918044.0, | |
| "mean_token_accuracy": 0.8090035438537597, | |
| "epoch": 0.5784377017430601, | |
| "step": 2240 | |
| }, | |
| { | |
| "loss": 0.6964, | |
| "grad_norm": 0.2995574474334717, | |
| "learning_rate": 3.915826186475555e-05, | |
| "entropy": 0.6998808637261391, | |
| "num_tokens": 50139989.0, | |
| "mean_token_accuracy": 0.8096735835075378, | |
| "epoch": 0.5810200129115558, | |
| "step": 2250 | |
| }, | |
| { | |
| "loss": 0.724, | |
| "grad_norm": 0.29106131196022034, | |
| "learning_rate": 3.8752221918949306e-05, | |
| "entropy": 0.7294921964406967, | |
| "num_tokens": 50359939.0, | |
| "mean_token_accuracy": 0.8044734835624695, | |
| "epoch": 0.5836023240800516, | |
| "step": 2260 | |
| }, | |
| { | |
| "loss": 0.7017, | |
| "grad_norm": 0.32531774044036865, | |
| "learning_rate": 3.834696178595282e-05, | |
| "entropy": 0.7081922739744186, | |
| "num_tokens": 50586979.0, | |
| "mean_token_accuracy": 0.8094498455524445, | |
| "epoch": 0.5861846352485475, | |
| "step": 2270 | |
| }, | |
| { | |
| "loss": 0.7084, | |
| "grad_norm": 0.26263847947120667, | |
| "learning_rate": 3.794250956260805e-05, | |
| "entropy": 0.7074251011013984, | |
| "num_tokens": 50813902.0, | |
| "mean_token_accuracy": 0.808785879611969, | |
| "epoch": 0.5887669464170433, | |
| "step": 2280 | |
| }, | |
| { | |
| "loss": 0.7549, | |
| "grad_norm": 0.29434698820114136, | |
| "learning_rate": 3.753889328974423e-05, | |
| "entropy": 0.7641531646251678, | |
| "num_tokens": 51044696.0, | |
| "mean_token_accuracy": 0.7972988486289978, | |
| "epoch": 0.591349257585539, | |
| "step": 2290 | |
| }, | |
| { | |
| "loss": 0.7627, | |
| "grad_norm": 0.2781457304954529, | |
| "learning_rate": 3.7136140950233864e-05, | |
| "entropy": 0.764645341038704, | |
| "num_tokens": 51267662.0, | |
| "mean_token_accuracy": 0.79625363945961, | |
| "epoch": 0.5939315687540349, | |
| "step": 2300 | |
| }, | |
| { | |
| "loss": 0.7725, | |
| "grad_norm": 0.27123042941093445, | |
| "learning_rate": 3.6734280467052595e-05, | |
| "entropy": 0.7800057739019394, | |
| "num_tokens": 51486424.0, | |
| "mean_token_accuracy": 0.7927343875169754, | |
| "epoch": 0.5965138799225307, | |
| "step": 2310 | |
| }, | |
| { | |
| "loss": 0.7276, | |
| "grad_norm": 0.3310370445251465, | |
| "learning_rate": 3.633333970134334e-05, | |
| "entropy": 0.7255799725651741, | |
| "num_tokens": 51710712.0, | |
| "mean_token_accuracy": 0.8043308049440384, | |
| "epoch": 0.5990961910910265, | |
| "step": 2320 | |
| }, | |
| { | |
| "loss": 0.7186, | |
| "grad_norm": 0.30834323167800903, | |
| "learning_rate": 3.593334645048463e-05, | |
| "entropy": 0.7251825213432312, | |
| "num_tokens": 51929200.0, | |
| "mean_token_accuracy": 0.8063902944326401, | |
| "epoch": 0.6016785022595222, | |
| "step": 2330 | |
| }, | |
| { | |
| "loss": 0.7208, | |
| "grad_norm": 0.2710685431957245, | |
| "learning_rate": 3.5534328446163444e-05, | |
| "entropy": 0.7191109150648117, | |
| "num_tokens": 52150269.0, | |
| "mean_token_accuracy": 0.8055253535509109, | |
| "epoch": 0.6042608134280181, | |
| "step": 2340 | |
| }, | |
| { | |
| "loss": 0.6743, | |
| "grad_norm": 0.29122015833854675, | |
| "learning_rate": 3.5136313352452523e-05, | |
| "entropy": 0.6803427249193191, | |
| "num_tokens": 52373985.0, | |
| "mean_token_accuracy": 0.8142094522714615, | |
| "epoch": 0.6068431245965139, | |
| "step": 2350 | |
| }, | |
| { | |
| "loss": 0.7097, | |
| "grad_norm": 0.28865063190460205, | |
| "learning_rate": 3.473932876389244e-05, | |
| "entropy": 0.7169732004404068, | |
| "num_tokens": 52596955.0, | |
| "mean_token_accuracy": 0.8064839720726014, | |
| "epoch": 0.6094254357650097, | |
| "step": 2360 | |
| }, | |
| { | |
| "loss": 0.7808, | |
| "grad_norm": 0.2779035270214081, | |
| "learning_rate": 3.434340220357843e-05, | |
| "entropy": 0.7823223441839218, | |
| "num_tokens": 52828509.0, | |
| "mean_token_accuracy": 0.7934709966182709, | |
| "epoch": 0.6120077469335055, | |
| "step": 2370 | |
| }, | |
| { | |
| "loss": 0.7063, | |
| "grad_norm": 0.30650147795677185, | |
| "learning_rate": 3.3948561121252254e-05, | |
| "entropy": 0.7046063154935837, | |
| "num_tokens": 53056392.0, | |
| "mean_token_accuracy": 0.8085776209831238, | |
| "epoch": 0.6145900581020013, | |
| "step": 2380 | |
| }, | |
| { | |
| "loss": 0.7573, | |
| "grad_norm": 0.2515726387500763, | |
| "learning_rate": 3.3554832891399015e-05, | |
| "entropy": 0.7701946586370468, | |
| "num_tokens": 53289453.0, | |
| "mean_token_accuracy": 0.7973541855812073, | |
| "epoch": 0.6171723692704971, | |
| "step": 2390 | |
| }, | |
| { | |
| "loss": 0.6877, | |
| "grad_norm": 0.2916530966758728, | |
| "learning_rate": 3.3162244811349366e-05, | |
| "entropy": 0.6941643834114075, | |
| "num_tokens": 53511361.0, | |
| "mean_token_accuracy": 0.8097677916288376, | |
| "epoch": 0.6197546804389928, | |
| "step": 2400 | |
| }, | |
| { | |
| "loss": 0.6698, | |
| "grad_norm": 0.35168713331222534, | |
| "learning_rate": 3.277082409938691e-05, | |
| "entropy": 0.6763507977128029, | |
| "num_tokens": 53733013.0, | |
| "mean_token_accuracy": 0.8161178439855575, | |
| "epoch": 0.6223369916074887, | |
| "step": 2410 | |
| }, | |
| { | |
| "loss": 0.6756, | |
| "grad_norm": 0.2850055396556854, | |
| "learning_rate": 3.2380597892861175e-05, | |
| "entropy": 0.676644204556942, | |
| "num_tokens": 53957269.0, | |
| "mean_token_accuracy": 0.8149404138326645, | |
| "epoch": 0.6249193027759845, | |
| "step": 2420 | |
| }, | |
| { | |
| "loss": 0.7038, | |
| "grad_norm": 0.30560407042503357, | |
| "learning_rate": 3.1991593246306164e-05, | |
| "entropy": 0.7020898759365082, | |
| "num_tokens": 54171717.0, | |
| "mean_token_accuracy": 0.8107654899358749, | |
| "epoch": 0.6275016139444803, | |
| "step": 2430 | |
| }, | |
| { | |
| "loss": 0.6545, | |
| "grad_norm": 0.30098938941955566, | |
| "learning_rate": 3.160383712956466e-05, | |
| "entropy": 0.656835712492466, | |
| "num_tokens": 54398417.0, | |
| "mean_token_accuracy": 0.82038534283638, | |
| "epoch": 0.6300839251129761, | |
| "step": 2440 | |
| }, | |
| { | |
| "loss": 0.7134, | |
| "grad_norm": 0.24808907508850098, | |
| "learning_rate": 3.121735642591838e-05, | |
| "entropy": 0.716155156493187, | |
| "num_tokens": 54616605.0, | |
| "mean_token_accuracy": 0.8066381633281707, | |
| "epoch": 0.6326662362814719, | |
| "step": 2450 | |
| }, | |
| { | |
| "loss": 0.7269, | |
| "grad_norm": 0.32362425327301025, | |
| "learning_rate": 3.083217793022415e-05, | |
| "entropy": 0.7308804154396057, | |
| "num_tokens": 54835949.0, | |
| "mean_token_accuracy": 0.8038643062114715, | |
| "epoch": 0.6352485474499677, | |
| "step": 2460 | |
| }, | |
| { | |
| "loss": 0.6981, | |
| "grad_norm": 0.31787142157554626, | |
| "learning_rate": 3.0448328347056244e-05, | |
| "entropy": 0.6972566187381745, | |
| "num_tokens": 55063212.0, | |
| "mean_token_accuracy": 0.8092873215675354, | |
| "epoch": 0.6378308586184636, | |
| "step": 2470 | |
| }, | |
| { | |
| "loss": 0.7066, | |
| "grad_norm": 0.3164233863353729, | |
| "learning_rate": 3.0065834288854893e-05, | |
| "entropy": 0.7190612822771072, | |
| "num_tokens": 55288008.0, | |
| "mean_token_accuracy": 0.8090256512165069, | |
| "epoch": 0.6404131697869593, | |
| "step": 2480 | |
| }, | |
| { | |
| "loss": 0.7241, | |
| "grad_norm": 0.3039920926094055, | |
| "learning_rate": 2.968472227408127e-05, | |
| "entropy": 0.718104162812233, | |
| "num_tokens": 55501656.0, | |
| "mean_token_accuracy": 0.8054259717464447, | |
| "epoch": 0.6429954809554551, | |
| "step": 2490 | |
| }, | |
| { | |
| "loss": 0.7009, | |
| "grad_norm": 0.2654428482055664, | |
| "learning_rate": 2.930501872537891e-05, | |
| "entropy": 0.7064283490180969, | |
| "num_tokens": 55720044.0, | |
| "mean_token_accuracy": 0.8091771513223648, | |
| "epoch": 0.6455777921239509, | |
| "step": 2500 | |
| }, | |
| { | |
| "loss": 0.672, | |
| "grad_norm": 0.3438260555267334, | |
| "learning_rate": 2.8926749967741908e-05, | |
| "entropy": 0.673621541261673, | |
| "num_tokens": 55945963.0, | |
| "mean_token_accuracy": 0.8174030244350433, | |
| "epoch": 0.6481601032924468, | |
| "step": 2510 | |
| }, | |
| { | |
| "loss": 0.7376, | |
| "grad_norm": 0.2591122090816498, | |
| "learning_rate": 2.8549942226689673e-05, | |
| "entropy": 0.7400184273719788, | |
| "num_tokens": 56180380.0, | |
| "mean_token_accuracy": 0.8010205596685409, | |
| "epoch": 0.6507424144609425, | |
| "step": 2520 | |
| }, | |
| { | |
| "loss": 0.7021, | |
| "grad_norm": 0.30535343289375305, | |
| "learning_rate": 2.8174621626448827e-05, | |
| "entropy": 0.7036702126264572, | |
| "num_tokens": 56403176.0, | |
| "mean_token_accuracy": 0.8093780845403671, | |
| "epoch": 0.6533247256294383, | |
| "step": 2530 | |
| }, | |
| { | |
| "loss": 0.6855, | |
| "grad_norm": 0.30807363986968994, | |
| "learning_rate": 2.78008141881419e-05, | |
| "entropy": 0.6923295110464096, | |
| "num_tokens": 56630304.0, | |
| "mean_token_accuracy": 0.8130094796419144, | |
| "epoch": 0.6559070367979342, | |
| "step": 2540 | |
| }, | |
| { | |
| "loss": 0.6795, | |
| "grad_norm": 0.2982047200202942, | |
| "learning_rate": 2.7428545827983342e-05, | |
| "entropy": 0.6848388686776161, | |
| "num_tokens": 56866175.0, | |
| "mean_token_accuracy": 0.8150498539209365, | |
| "epoch": 0.65848934796643, | |
| "step": 2550 | |
| }, | |
| { | |
| "loss": 0.6908, | |
| "grad_norm": 0.30639222264289856, | |
| "learning_rate": 2.705784235548272e-05, | |
| "entropy": 0.6895878106355667, | |
| "num_tokens": 57092835.0, | |
| "mean_token_accuracy": 0.8119323909282684, | |
| "epoch": 0.6610716591349257, | |
| "step": 2560 | |
| }, | |
| { | |
| "loss": 0.7266, | |
| "grad_norm": 0.3296997845172882, | |
| "learning_rate": 2.6688729471655328e-05, | |
| "entropy": 0.7319988161325455, | |
| "num_tokens": 57300056.0, | |
| "mean_token_accuracy": 0.8027160823345184, | |
| "epoch": 0.6636539703034215, | |
| "step": 2570 | |
| }, | |
| { | |
| "loss": 0.6983, | |
| "grad_norm": 0.2694888114929199, | |
| "learning_rate": 2.632123276724031e-05, | |
| "entropy": 0.7012559026479721, | |
| "num_tokens": 57526144.0, | |
| "mean_token_accuracy": 0.8083841979503632, | |
| "epoch": 0.6662362814719174, | |
| "step": 2580 | |
| }, | |
| { | |
| "loss": 0.6938, | |
| "grad_norm": 0.31541094183921814, | |
| "learning_rate": 2.5955377720926512e-05, | |
| "entropy": 0.6915117859840393, | |
| "num_tokens": 57737388.0, | |
| "mean_token_accuracy": 0.8134361296892166, | |
| "epoch": 0.6688185926404132, | |
| "step": 2590 | |
| }, | |
| { | |
| "loss": 0.7445, | |
| "grad_norm": 0.274829626083374, | |
| "learning_rate": 2.559118969758595e-05, | |
| "entropy": 0.7476611018180848, | |
| "num_tokens": 57966900.0, | |
| "mean_token_accuracy": 0.7994938224554062, | |
| "epoch": 0.6714009038089089, | |
| "step": 2600 | |
| }, | |
| { | |
| "loss": 0.7141, | |
| "grad_norm": 0.3201714754104614, | |
| "learning_rate": 2.5228693946515302e-05, | |
| "entropy": 0.7212372869253159, | |
| "num_tokens": 58193368.0, | |
| "mean_token_accuracy": 0.8060443252325058, | |
| "epoch": 0.6739832149774048, | |
| "step": 2610 | |
| }, | |
| { | |
| "loss": 0.698, | |
| "grad_norm": 0.29826799035072327, | |
| "learning_rate": 2.486791559968536e-05, | |
| "entropy": 0.7093008697032929, | |
| "num_tokens": 58403598.0, | |
| "mean_token_accuracy": 0.8120593637228012, | |
| "epoch": 0.6765655261459006, | |
| "step": 2620 | |
| }, | |
| { | |
| "loss": 0.7146, | |
| "grad_norm": 0.3138905465602875, | |
| "learning_rate": 2.4508879669998614e-05, | |
| "entropy": 0.7180470049381256, | |
| "num_tokens": 58634383.0, | |
| "mean_token_accuracy": 0.8049383610486984, | |
| "epoch": 0.6791478373143964, | |
| "step": 2630 | |
| }, | |
| { | |
| "loss": 0.6845, | |
| "grad_norm": 0.34878674149513245, | |
| "learning_rate": 2.415161104955509e-05, | |
| "entropy": 0.6881518125534057, | |
| "num_tokens": 58858055.0, | |
| "mean_token_accuracy": 0.8141413390636444, | |
| "epoch": 0.6817301484828922, | |
| "step": 2640 | |
| }, | |
| { | |
| "loss": 0.745, | |
| "grad_norm": 0.2723364233970642, | |
| "learning_rate": 2.3796134507926594e-05, | |
| "entropy": 0.7326867192983627, | |
| "num_tokens": 59071166.0, | |
| "mean_token_accuracy": 0.8036218643188476, | |
| "epoch": 0.684312459651388, | |
| "step": 2650 | |
| }, | |
| { | |
| "loss": 0.6955, | |
| "grad_norm": 0.2990936040878296, | |
| "learning_rate": 2.3442474690439398e-05, | |
| "entropy": 0.6928721308708191, | |
| "num_tokens": 59279102.0, | |
| "mean_token_accuracy": 0.8137792259454727, | |
| "epoch": 0.6868947708198838, | |
| "step": 2660 | |
| }, | |
| { | |
| "loss": 0.7412, | |
| "grad_norm": 0.3078380823135376, | |
| "learning_rate": 2.30906561164656e-05, | |
| "entropy": 0.737492860853672, | |
| "num_tokens": 59493613.0, | |
| "mean_token_accuracy": 0.803581839799881, | |
| "epoch": 0.6894770819883796, | |
| "step": 2670 | |
| }, | |
| { | |
| "loss": 0.7322, | |
| "grad_norm": 0.3019523024559021, | |
| "learning_rate": 2.274070317772316e-05, | |
| "entropy": 0.7335268735885621, | |
| "num_tokens": 59705132.0, | |
| "mean_token_accuracy": 0.8033424109220505, | |
| "epoch": 0.6920593931568754, | |
| "step": 2680 | |
| }, | |
| { | |
| "loss": 0.7013, | |
| "grad_norm": 0.2935568392276764, | |
| "learning_rate": 2.2392640136584834e-05, | |
| "entropy": 0.7151971668004989, | |
| "num_tokens": 59914822.0, | |
| "mean_token_accuracy": 0.8081181108951568, | |
| "epoch": 0.6946417043253712, | |
| "step": 2690 | |
| }, | |
| { | |
| "loss": 0.6921, | |
| "grad_norm": 0.3314759135246277, | |
| "learning_rate": 2.204649112439604e-05, | |
| "entropy": 0.6989267513155937, | |
| "num_tokens": 60130448.0, | |
| "mean_token_accuracy": 0.8114324450492859, | |
| "epoch": 0.697224015493867, | |
| "step": 2700 | |
| }, | |
| { | |
| "loss": 0.7304, | |
| "grad_norm": 0.2831194996833801, | |
| "learning_rate": 2.1702280139801835e-05, | |
| "entropy": 0.7311217278242111, | |
| "num_tokens": 60356454.0, | |
| "mean_token_accuracy": 0.8026905834674836, | |
| "epoch": 0.6998063266623629, | |
| "step": 2710 | |
| }, | |
| { | |
| "loss": 0.7007, | |
| "grad_norm": 0.33581551909446716, | |
| "learning_rate": 2.1360031047083045e-05, | |
| "entropy": 0.7018767982721329, | |
| "num_tokens": 60563637.0, | |
| "mean_token_accuracy": 0.8092585116624832, | |
| "epoch": 0.7023886378308586, | |
| "step": 2720 | |
| }, | |
| { | |
| "loss": 0.724, | |
| "grad_norm": 0.29153990745544434, | |
| "learning_rate": 2.101976757450183e-05, | |
| "entropy": 0.7301119029521942, | |
| "num_tokens": 60791248.0, | |
| "mean_token_accuracy": 0.8054573953151702, | |
| "epoch": 0.7049709489993544, | |
| "step": 2730 | |
| }, | |
| { | |
| "loss": 0.7099, | |
| "grad_norm": 0.3348991274833679, | |
| "learning_rate": 2.0681513312656487e-05, | |
| "entropy": 0.7101235866546631, | |
| "num_tokens": 61012834.0, | |
| "mean_token_accuracy": 0.8071693569421768, | |
| "epoch": 0.7075532601678503, | |
| "step": 2740 | |
| }, | |
| { | |
| "loss": 0.7157, | |
| "grad_norm": 0.33254438638687134, | |
| "learning_rate": 2.0345291712845966e-05, | |
| "entropy": 0.7233776718378067, | |
| "num_tokens": 61240008.0, | |
| "mean_token_accuracy": 0.804712375998497, | |
| "epoch": 0.7101355713363461, | |
| "step": 2750 | |
| }, | |
| { | |
| "loss": 0.6452, | |
| "grad_norm": 0.31784573197364807, | |
| "learning_rate": 2.0011126085443978e-05, | |
| "entropy": 0.6461900442838668, | |
| "num_tokens": 61456216.0, | |
| "mean_token_accuracy": 0.8218548208475113, | |
| "epoch": 0.7127178825048418, | |
| "step": 2760 | |
| }, | |
| { | |
| "loss": 0.777, | |
| "grad_norm": 0.3528957664966583, | |
| "learning_rate": 1.9679039598282866e-05, | |
| "entropy": 0.7807377189397812, | |
| "num_tokens": 61671856.0, | |
| "mean_token_accuracy": 0.7943974494934082, | |
| "epoch": 0.7153001936733376, | |
| "step": 2770 | |
| }, | |
| { | |
| "loss": 0.6969, | |
| "grad_norm": 0.27970144152641296, | |
| "learning_rate": 1.9349055275047378e-05, | |
| "entropy": 0.6933271333575248, | |
| "num_tokens": 61901204.0, | |
| "mean_token_accuracy": 0.8111155271530152, | |
| "epoch": 0.7178825048418335, | |
| "step": 2780 | |
| }, | |
| { | |
| "loss": 0.7737, | |
| "grad_norm": 0.2799535393714905, | |
| "learning_rate": 1.9021195993678416e-05, | |
| "entropy": 0.7860652267932892, | |
| "num_tokens": 62115655.0, | |
| "mean_token_accuracy": 0.7939832776784896, | |
| "epoch": 0.7204648160103292, | |
| "step": 2790 | |
| }, | |
| { | |
| "loss": 0.6883, | |
| "grad_norm": 0.2660212516784668, | |
| "learning_rate": 1.8695484484786917e-05, | |
| "entropy": 0.6971263945102691, | |
| "num_tokens": 62338640.0, | |
| "mean_token_accuracy": 0.8095773428678512, | |
| "epoch": 0.723047127178825, | |
| "step": 2800 | |
| }, | |
| { | |
| "loss": 0.6817, | |
| "grad_norm": 0.34406208992004395, | |
| "learning_rate": 1.8371943330077918e-05, | |
| "entropy": 0.6870502740144729, | |
| "num_tokens": 62558349.0, | |
| "mean_token_accuracy": 0.8148900002241135, | |
| "epoch": 0.7256294383473209, | |
| "step": 2810 | |
| }, | |
| { | |
| "loss": 0.6707, | |
| "grad_norm": 0.3069903552532196, | |
| "learning_rate": 1.805059496078495e-05, | |
| "entropy": 0.6698985725641251, | |
| "num_tokens": 62784119.0, | |
| "mean_token_accuracy": 0.8161512583494186, | |
| "epoch": 0.7282117495158167, | |
| "step": 2820 | |
| }, | |
| { | |
| "loss": 0.6912, | |
| "grad_norm": 0.3088005781173706, | |
| "learning_rate": 1.7731461656114893e-05, | |
| "entropy": 0.6937626540660858, | |
| "num_tokens": 63025574.0, | |
| "mean_token_accuracy": 0.8110652893781662, | |
| "epoch": 0.7307940606843124, | |
| "step": 2830 | |
| }, | |
| { | |
| "loss": 0.7005, | |
| "grad_norm": 0.2753876745700836, | |
| "learning_rate": 1.7414565541703342e-05, | |
| "entropy": 0.7088986724615097, | |
| "num_tokens": 63241986.0, | |
| "mean_token_accuracy": 0.8078723192214966, | |
| "epoch": 0.7333763718528082, | |
| "step": 2840 | |
| }, | |
| { | |
| "loss": 0.7379, | |
| "grad_norm": 0.27868911623954773, | |
| "learning_rate": 1.7099928588080605e-05, | |
| "entropy": 0.7435200735926628, | |
| "num_tokens": 63465840.0, | |
| "mean_token_accuracy": 0.8005112290382386, | |
| "epoch": 0.7359586830213041, | |
| "step": 2850 | |
| }, | |
| { | |
| "loss": 0.6757, | |
| "grad_norm": 0.31595101952552795, | |
| "learning_rate": 1.6787572609148523e-05, | |
| "entropy": 0.6833824813365936, | |
| "num_tokens": 63695553.0, | |
| "mean_token_accuracy": 0.8157312601804734, | |
| "epoch": 0.7385409941897999, | |
| "step": 2860 | |
| }, | |
| { | |
| "loss": 0.7016, | |
| "grad_norm": 0.2744700610637665, | |
| "learning_rate": 1.6477519260668062e-05, | |
| "entropy": 0.7068268299102783, | |
| "num_tokens": 63922146.0, | |
| "mean_token_accuracy": 0.8094365686178208, | |
| "epoch": 0.7411233053582956, | |
| "step": 2870 | |
| }, | |
| { | |
| "loss": 0.7036, | |
| "grad_norm": 0.29427847266197205, | |
| "learning_rate": 1.6169790038757932e-05, | |
| "entropy": 0.6996650636196137, | |
| "num_tokens": 64140210.0, | |
| "mean_token_accuracy": 0.8086274355649948, | |
| "epoch": 0.7437056165267915, | |
| "step": 2880 | |
| }, | |
| { | |
| "loss": 0.6855, | |
| "grad_norm": 0.318248987197876, | |
| "learning_rate": 1.586440627840425e-05, | |
| "entropy": 0.6926235496997833, | |
| "num_tokens": 64367668.0, | |
| "mean_token_accuracy": 0.8126532167196274, | |
| "epoch": 0.7462879276952873, | |
| "step": 2890 | |
| }, | |
| { | |
| "loss": 0.6587, | |
| "grad_norm": 0.26778993010520935, | |
| "learning_rate": 1.5561389151981366e-05, | |
| "entropy": 0.6617662817239761, | |
| "num_tokens": 64596890.0, | |
| "mean_token_accuracy": 0.8181241929531098, | |
| "epoch": 0.7488702388637831, | |
| "step": 2900 | |
| }, | |
| { | |
| "loss": 0.7325, | |
| "grad_norm": 0.27768194675445557, | |
| "learning_rate": 1.5260759667783997e-05, | |
| "entropy": 0.7412423819303513, | |
| "num_tokens": 64806702.0, | |
| "mean_token_accuracy": 0.8047612726688385, | |
| "epoch": 0.7514525500322788, | |
| "step": 2910 | |
| }, | |
| { | |
| "loss": 0.6484, | |
| "grad_norm": 0.2950485050678253, | |
| "learning_rate": 1.4962538668570691e-05, | |
| "entropy": 0.6552188992500305, | |
| "num_tokens": 65032975.0, | |
| "mean_token_accuracy": 0.8191455602645874, | |
| "epoch": 0.7540348612007747, | |
| "step": 2920 | |
| }, | |
| { | |
| "loss": 0.7559, | |
| "grad_norm": 0.305338591337204, | |
| "learning_rate": 1.4666746830118804e-05, | |
| "entropy": 0.7579466968774795, | |
| "num_tokens": 65245187.0, | |
| "mean_token_accuracy": 0.796593251824379, | |
| "epoch": 0.7566171723692705, | |
| "step": 2930 | |
| }, | |
| { | |
| "loss": 0.7234, | |
| "grad_norm": 0.29828450083732605, | |
| "learning_rate": 1.4373404659791028e-05, | |
| "entropy": 0.7265091866254807, | |
| "num_tokens": 65478479.0, | |
| "mean_token_accuracy": 0.8035373449325561, | |
| "epoch": 0.7591994835377663, | |
| "step": 2940 | |
| }, | |
| { | |
| "loss": 0.658, | |
| "grad_norm": 0.29163649678230286, | |
| "learning_rate": 1.4082532495113626e-05, | |
| "entropy": 0.6767950043082237, | |
| "num_tokens": 65695750.0, | |
| "mean_token_accuracy": 0.8175939649343491, | |
| "epoch": 0.7617817947062621, | |
| "step": 2950 | |
| }, | |
| { | |
| "loss": 0.6907, | |
| "grad_norm": 0.29603517055511475, | |
| "learning_rate": 1.3794150502366404e-05, | |
| "entropy": 0.6919728189706802, | |
| "num_tokens": 65932520.0, | |
| "mean_token_accuracy": 0.8117793321609497, | |
| "epoch": 0.7643641058747579, | |
| "step": 2960 | |
| }, | |
| { | |
| "loss": 0.7528, | |
| "grad_norm": 0.29973626136779785, | |
| "learning_rate": 1.3508278675184593e-05, | |
| "entropy": 0.7577356785535813, | |
| "num_tokens": 66148085.0, | |
| "mean_token_accuracy": 0.798463636636734, | |
| "epoch": 0.7669464170432537, | |
| "step": 2970 | |
| }, | |
| { | |
| "loss": 0.6982, | |
| "grad_norm": 0.3467027246952057, | |
| "learning_rate": 1.3224936833172657e-05, | |
| "entropy": 0.7075864315032959, | |
| "num_tokens": 66350441.0, | |
| "mean_token_accuracy": 0.8102767705917359, | |
| "epoch": 0.7695287282117496, | |
| "step": 2980 | |
| }, | |
| { | |
| "loss": 0.7345, | |
| "grad_norm": 0.3444955050945282, | |
| "learning_rate": 1.2944144620530229e-05, | |
| "entropy": 0.7401443183422088, | |
| "num_tokens": 66574442.0, | |
| "mean_token_accuracy": 0.802479374408722, | |
| "epoch": 0.7721110393802453, | |
| "step": 2990 | |
| }, | |
| { | |
| "loss": 0.7497, | |
| "grad_norm": 0.2776440382003784, | |
| "learning_rate": 1.266592150469012e-05, | |
| "entropy": 0.7472027570009232, | |
| "num_tokens": 66801152.0, | |
| "mean_token_accuracy": 0.7987334460020066, | |
| "epoch": 0.7746933505487411, | |
| "step": 3000 | |
| }, | |
| { | |
| "loss": 0.6939, | |
| "grad_norm": 0.27906349301338196, | |
| "learning_rate": 1.2390286774968718e-05, | |
| "entropy": 0.6968228965997696, | |
| "num_tokens": 67029964.0, | |
| "mean_token_accuracy": 0.8106195479631424, | |
| "epoch": 0.7772756617172369, | |
| "step": 3010 | |
| }, | |
| { | |
| "loss": 0.69, | |
| "grad_norm": 0.2762759029865265, | |
| "learning_rate": 1.2117259541228549e-05, | |
| "entropy": 0.6997145384550094, | |
| "num_tokens": 67247542.0, | |
| "mean_token_accuracy": 0.812323608994484, | |
| "epoch": 0.7798579728857328, | |
| "step": 3020 | |
| }, | |
| { | |
| "loss": 0.7269, | |
| "grad_norm": 0.3158010244369507, | |
| "learning_rate": 1.1846858732553456e-05, | |
| "entropy": 0.7227586776018142, | |
| "num_tokens": 67461274.0, | |
| "mean_token_accuracy": 0.8032592833042145, | |
| "epoch": 0.7824402840542285, | |
| "step": 3030 | |
| }, | |
| { | |
| "loss": 0.7255, | |
| "grad_norm": 0.31994199752807617, | |
| "learning_rate": 1.1579103095936206e-05, | |
| "entropy": 0.7367910444736481, | |
| "num_tokens": 67672498.0, | |
| "mean_token_accuracy": 0.8042957931756973, | |
| "epoch": 0.7850225952227243, | |
| "step": 3040 | |
| }, | |
| { | |
| "loss": 0.7221, | |
| "grad_norm": 0.3173619508743286, | |
| "learning_rate": 1.1314011194978764e-05, | |
| "entropy": 0.7295337229967117, | |
| "num_tokens": 67896909.0, | |
| "mean_token_accuracy": 0.8035631746053695, | |
| "epoch": 0.7876049063912202, | |
| "step": 3050 | |
| }, | |
| { | |
| "loss": 0.7016, | |
| "grad_norm": 0.26975125074386597, | |
| "learning_rate": 1.1051601408605277e-05, | |
| "entropy": 0.6950832739472389, | |
| "num_tokens": 68127225.0, | |
| "mean_token_accuracy": 0.8104336768388748, | |
| "epoch": 0.790187217559716, | |
| "step": 3060 | |
| }, | |
| { | |
| "loss": 0.7297, | |
| "grad_norm": 0.2895670235157013, | |
| "learning_rate": 1.079189192978785e-05, | |
| "entropy": 0.728104168176651, | |
| "num_tokens": 68354043.0, | |
| "mean_token_accuracy": 0.8038080394268036, | |
| "epoch": 0.7927695287282117, | |
| "step": 3070 | |
| }, | |
| { | |
| "loss": 0.6808, | |
| "grad_norm": 0.31210049986839294, | |
| "learning_rate": 1.0534900764285206e-05, | |
| "entropy": 0.6828968375921249, | |
| "num_tokens": 68568009.0, | |
| "mean_token_accuracy": 0.813372614979744, | |
| "epoch": 0.7953518398967075, | |
| "step": 3080 | |
| }, | |
| { | |
| "loss": 0.7699, | |
| "grad_norm": 0.3012009859085083, | |
| "learning_rate": 1.0280645729394366e-05, | |
| "entropy": 0.7736784100532532, | |
| "num_tokens": 68780475.0, | |
| "mean_token_accuracy": 0.7947412014007569, | |
| "epoch": 0.7979341510652034, | |
| "step": 3090 | |
| }, | |
| { | |
| "loss": 0.7059, | |
| "grad_norm": 0.2901022136211395, | |
| "learning_rate": 1.002914445271535e-05, | |
| "entropy": 0.7141468852758408, | |
| "num_tokens": 68993221.0, | |
| "mean_token_accuracy": 0.8091312497854233, | |
| "epoch": 0.8005164622336992, | |
| "step": 3100 | |
| }, | |
| { | |
| "loss": 0.7216, | |
| "grad_norm": 0.34709054231643677, | |
| "learning_rate": 9.780414370929064e-06, | |
| "entropy": 0.7236628875136375, | |
| "num_tokens": 69217587.0, | |
| "mean_token_accuracy": 0.806429061293602, | |
| "epoch": 0.8030987734021949, | |
| "step": 3110 | |
| }, | |
| { | |
| "loss": 0.7232, | |
| "grad_norm": 0.2870867848396301, | |
| "learning_rate": 9.534472728588383e-06, | |
| "entropy": 0.7272969245910644, | |
| "num_tokens": 69443310.0, | |
| "mean_token_accuracy": 0.8060679912567139, | |
| "epoch": 0.8056810845706908, | |
| "step": 3120 | |
| }, | |
| { | |
| "loss": 0.7545, | |
| "grad_norm": 0.25815922021865845, | |
| "learning_rate": 9.291336576922616e-06, | |
| "entropy": 0.7626336216926575, | |
| "num_tokens": 69672892.0, | |
| "mean_token_accuracy": 0.7972894191741944, | |
| "epoch": 0.8082633957391866, | |
| "step": 3130 | |
| }, | |
| { | |
| "loss": 0.6876, | |
| "grad_norm": 0.29713913798332214, | |
| "learning_rate": 9.051022772655293e-06, | |
| "entropy": 0.6986047714948654, | |
| "num_tokens": 69890743.0, | |
| "mean_token_accuracy": 0.8120619833469391, | |
| "epoch": 0.8108457069076824, | |
| "step": 3140 | |
| }, | |
| { | |
| "loss": 0.7198, | |
| "grad_norm": 0.2686369717121124, | |
| "learning_rate": 8.813547976835528e-06, | |
| "entropy": 0.7254200860857963, | |
| "num_tokens": 70116053.0, | |
| "mean_token_accuracy": 0.8052540808916092, | |
| "epoch": 0.8134280180761781, | |
| "step": 3150 | |
| }, | |
| { | |
| "loss": 0.6943, | |
| "grad_norm": 0.2916862368583679, | |
| "learning_rate": 8.578928653682866e-06, | |
| "entropy": 0.6961161434650421, | |
| "num_tokens": 70329456.0, | |
| "mean_token_accuracy": 0.8126390129327774, | |
| "epoch": 0.816010329244674, | |
| "step": 3160 | |
| }, | |
| { | |
| "loss": 0.7393, | |
| "grad_norm": 0.30221208930015564, | |
| "learning_rate": 8.347181069445836e-06, | |
| "entropy": 0.7420953810214996, | |
| "num_tokens": 70553938.0, | |
| "mean_token_accuracy": 0.7991205245256424, | |
| "epoch": 0.8185926404131698, | |
| "step": 3170 | |
| }, | |
| { | |
| "loss": 0.7443, | |
| "grad_norm": 0.30007004737854004, | |
| "learning_rate": 8.118321291274184e-06, | |
| "entropy": 0.7562640652060508, | |
| "num_tokens": 70776813.0, | |
| "mean_token_accuracy": 0.7976048231124878, | |
| "epoch": 0.8211749515816655, | |
| "step": 3180 | |
| }, | |
| { | |
| "loss": 0.7273, | |
| "grad_norm": 0.28486886620521545, | |
| "learning_rate": 7.892365186104955e-06, | |
| "entropy": 0.7327735960483551, | |
| "num_tokens": 71001316.0, | |
| "mean_token_accuracy": 0.8032251060009002, | |
| "epoch": 0.8237572627501614, | |
| "step": 3190 | |
| }, | |
| { | |
| "loss": 0.673, | |
| "grad_norm": 0.26758044958114624, | |
| "learning_rate": 7.669328419562422e-06, | |
| "entropy": 0.6777101784944535, | |
| "num_tokens": 71225607.0, | |
| "mean_token_accuracy": 0.8158749163150787, | |
| "epoch": 0.8263395739186572, | |
| "step": 3200 | |
| }, | |
| { | |
| "loss": 0.6451, | |
| "grad_norm": 0.31398946046829224, | |
| "learning_rate": 7.4492264548719605e-06, | |
| "entropy": 0.6516683280467988, | |
| "num_tokens": 71460072.0, | |
| "mean_token_accuracy": 0.8190358489751816, | |
| "epoch": 0.828921885087153, | |
| "step": 3210 | |
| }, | |
| { | |
| "loss": 0.687, | |
| "grad_norm": 0.28087955713272095, | |
| "learning_rate": 7.2320745517880205e-06, | |
| "entropy": 0.6877734363079071, | |
| "num_tokens": 71691287.0, | |
| "mean_token_accuracy": 0.8136042594909668, | |
| "epoch": 0.8315041962556488, | |
| "step": 3220 | |
| }, | |
| { | |
| "loss": 0.766, | |
| "grad_norm": 0.3055887818336487, | |
| "learning_rate": 7.01788776553613e-06, | |
| "entropy": 0.7661666601896286, | |
| "num_tokens": 71915757.0, | |
| "mean_token_accuracy": 0.7935220867395401, | |
| "epoch": 0.8340865074241446, | |
| "step": 3230 | |
| }, | |
| { | |
| "loss": 0.6704, | |
| "grad_norm": 0.26342907547950745, | |
| "learning_rate": 6.806680945769112e-06, | |
| "entropy": 0.6782629758119583, | |
| "num_tokens": 72138609.0, | |
| "mean_token_accuracy": 0.815047687292099, | |
| "epoch": 0.8366688185926404, | |
| "step": 3240 | |
| }, | |
| { | |
| "loss": 0.6974, | |
| "grad_norm": 0.30100300908088684, | |
| "learning_rate": 6.598468735537566e-06, | |
| "entropy": 0.7019865930080413, | |
| "num_tokens": 72355904.0, | |
| "mean_token_accuracy": 0.8122695237398148, | |
| "epoch": 0.8392511297611362, | |
| "step": 3250 | |
| }, | |
| { | |
| "loss": 0.6812, | |
| "grad_norm": 0.31231164932250977, | |
| "learning_rate": 6.3932655702746594e-06, | |
| "entropy": 0.6826063901185989, | |
| "num_tokens": 72582185.0, | |
| "mean_token_accuracy": 0.8142417192459106, | |
| "epoch": 0.841833440929632, | |
| "step": 3260 | |
| }, | |
| { | |
| "loss": 0.739, | |
| "grad_norm": 0.2941230535507202, | |
| "learning_rate": 6.191085676795289e-06, | |
| "entropy": 0.7482027873396874, | |
| "num_tokens": 72786873.0, | |
| "mean_token_accuracy": 0.800478795170784, | |
| "epoch": 0.8444157520981278, | |
| "step": 3270 | |
| }, | |
| { | |
| "loss": 0.6933, | |
| "grad_norm": 0.3058132529258728, | |
| "learning_rate": 5.991943072309764e-06, | |
| "entropy": 0.7009373277425766, | |
| "num_tokens": 73007186.0, | |
| "mean_token_accuracy": 0.8104920625686646, | |
| "epoch": 0.8469980632666236, | |
| "step": 3280 | |
| }, | |
| { | |
| "loss": 0.691, | |
| "grad_norm": 0.34394147992134094, | |
| "learning_rate": 5.795851563451965e-06, | |
| "entropy": 0.6961993753910065, | |
| "num_tokens": 73221227.0, | |
| "mean_token_accuracy": 0.8104782938957215, | |
| "epoch": 0.8495803744351195, | |
| "step": 3290 | |
| }, | |
| { | |
| "loss": 0.6845, | |
| "grad_norm": 0.27636125683784485, | |
| "learning_rate": 5.602824745322127e-06, | |
| "entropy": 0.6845636337995529, | |
| "num_tokens": 73446914.0, | |
| "mean_token_accuracy": 0.8126100540161133, | |
| "epoch": 0.8521626856036152, | |
| "step": 3300 | |
| }, | |
| { | |
| "loss": 0.6588, | |
| "grad_norm": 0.28646811842918396, | |
| "learning_rate": 5.412876000544309e-06, | |
| "entropy": 0.6653542622923851, | |
| "num_tokens": 73665913.0, | |
| "mean_token_accuracy": 0.8176908433437348, | |
| "epoch": 0.854744996772111, | |
| "step": 3310 | |
| }, | |
| { | |
| "loss": 0.7334, | |
| "grad_norm": 0.28045985102653503, | |
| "learning_rate": 5.226018498338526e-06, | |
| "entropy": 0.7391698479652404, | |
| "num_tokens": 73893662.0, | |
| "mean_token_accuracy": 0.8034716665744781, | |
| "epoch": 0.8573273079406069, | |
| "step": 3320 | |
| }, | |
| { | |
| "loss": 0.6959, | |
| "grad_norm": 0.27868810296058655, | |
| "learning_rate": 5.042265193607765e-06, | |
| "entropy": 0.7021783247590065, | |
| "num_tokens": 74117461.0, | |
| "mean_token_accuracy": 0.8107198417186737, | |
| "epoch": 0.8599096191091027, | |
| "step": 3330 | |
| }, | |
| { | |
| "loss": 0.6257, | |
| "grad_norm": 0.289795458316803, | |
| "learning_rate": 4.861628826039799e-06, | |
| "entropy": 0.6267055228352547, | |
| "num_tokens": 74350458.0, | |
| "mean_token_accuracy": 0.8263014882802964, | |
| "epoch": 0.8624919302775984, | |
| "step": 3340 | |
| }, | |
| { | |
| "loss": 0.6857, | |
| "grad_norm": 0.3118912875652313, | |
| "learning_rate": 4.684121919223938e-06, | |
| "entropy": 0.6892769366502762, | |
| "num_tokens": 74570028.0, | |
| "mean_token_accuracy": 0.8128671914339065, | |
| "epoch": 0.8650742414460942, | |
| "step": 3350 | |
| }, | |
| { | |
| "loss": 0.7262, | |
| "grad_norm": 0.2844097912311554, | |
| "learning_rate": 4.509756779782765e-06, | |
| "entropy": 0.7205952927470207, | |
| "num_tokens": 74788292.0, | |
| "mean_token_accuracy": 0.8064137637615204, | |
| "epoch": 0.8676565526145901, | |
| "step": 3360 | |
| }, | |
| { | |
| "loss": 0.6939, | |
| "grad_norm": 0.2834044098854065, | |
| "learning_rate": 4.338545496518908e-06, | |
| "entropy": 0.7010853558778762, | |
| "num_tokens": 75015661.0, | |
| "mean_token_accuracy": 0.8110615521669388, | |
| "epoch": 0.8702388637830859, | |
| "step": 3370 | |
| }, | |
| { | |
| "loss": 0.6466, | |
| "grad_norm": 0.31039345264434814, | |
| "learning_rate": 4.17049993957695e-06, | |
| "entropy": 0.6548168569803238, | |
| "num_tokens": 75237714.0, | |
| "mean_token_accuracy": 0.8208600997924804, | |
| "epoch": 0.8728211749515816, | |
| "step": 3380 | |
| }, | |
| { | |
| "loss": 0.7192, | |
| "grad_norm": 0.31068113446235657, | |
| "learning_rate": 4.005631759620409e-06, | |
| "entropy": 0.7178841561079026, | |
| "num_tokens": 75466595.0, | |
| "mean_token_accuracy": 0.8053244233131409, | |
| "epoch": 0.8754034861200775, | |
| "step": 3390 | |
| }, | |
| { | |
| "loss": 0.7341, | |
| "grad_norm": 0.2801123559474945, | |
| "learning_rate": 3.8439523870240466e-06, | |
| "entropy": 0.7397258043289184, | |
| "num_tokens": 75685746.0, | |
| "mean_token_accuracy": 0.8019078284502029, | |
| "epoch": 0.8779857972885733, | |
| "step": 3400 | |
| }, | |
| { | |
| "loss": 0.749, | |
| "grad_norm": 0.28393077850341797, | |
| "learning_rate": 3.685473031081377e-06, | |
| "entropy": 0.7531062722206116, | |
| "num_tokens": 75891129.0, | |
| "mean_token_accuracy": 0.799371811747551, | |
| "epoch": 0.8805681084570691, | |
| "step": 3410 | |
| }, | |
| { | |
| "loss": 0.7069, | |
| "grad_norm": 0.2816139757633209, | |
| "learning_rate": 3.5302046792275057e-06, | |
| "entropy": 0.7183081269264221, | |
| "num_tokens": 76127939.0, | |
| "mean_token_accuracy": 0.8063154727220535, | |
| "epoch": 0.8831504196255648, | |
| "step": 3420 | |
| }, | |
| { | |
| "loss": 0.6473, | |
| "grad_norm": 0.3396845757961273, | |
| "learning_rate": 3.378158096277395e-06, | |
| "entropy": 0.6580532863736153, | |
| "num_tokens": 76356205.0, | |
| "mean_token_accuracy": 0.8217125713825226, | |
| "epoch": 0.8857327307940607, | |
| "step": 3430 | |
| }, | |
| { | |
| "loss": 0.6843, | |
| "grad_norm": 0.3237053155899048, | |
| "learning_rate": 3.2293438236795203e-06, | |
| "entropy": 0.6855427086353302, | |
| "num_tokens": 76583682.0, | |
| "mean_token_accuracy": 0.814071336388588, | |
| "epoch": 0.8883150419625565, | |
| "step": 3440 | |
| }, | |
| { | |
| "loss": 0.717, | |
| "grad_norm": 0.24803553521633148, | |
| "learning_rate": 3.083772178785016e-06, | |
| "entropy": 0.7196818366646767, | |
| "num_tokens": 76805913.0, | |
| "mean_token_accuracy": 0.8079381316900254, | |
| "epoch": 0.8908973531310523, | |
| "step": 3450 | |
| }, | |
| { | |
| "loss": 0.6765, | |
| "grad_norm": 0.2812926173210144, | |
| "learning_rate": 2.941453254132398e-06, | |
| "entropy": 0.6827230915427208, | |
| "num_tokens": 77042857.0, | |
| "mean_token_accuracy": 0.8151951909065247, | |
| "epoch": 0.8934796642995481, | |
| "step": 3460 | |
| }, | |
| { | |
| "loss": 0.7466, | |
| "grad_norm": 0.25068199634552, | |
| "learning_rate": 2.802396916747818e-06, | |
| "entropy": 0.7477584034204483, | |
| "num_tokens": 77274945.0, | |
| "mean_token_accuracy": 0.7997568786144257, | |
| "epoch": 0.8960619754680439, | |
| "step": 3470 | |
| }, | |
| { | |
| "loss": 0.7463, | |
| "grad_norm": 0.29893818497657776, | |
| "learning_rate": 2.666612807461e-06, | |
| "entropy": 0.7611301898956299, | |
| "num_tokens": 77507117.0, | |
| "mean_token_accuracy": 0.7970719575881958, | |
| "epoch": 0.8986442866365397, | |
| "step": 3480 | |
| }, | |
| { | |
| "loss": 0.734, | |
| "grad_norm": 0.25104856491088867, | |
| "learning_rate": 2.5341103402368116e-06, | |
| "entropy": 0.7325819343328476, | |
| "num_tokens": 77741888.0, | |
| "mean_token_accuracy": 0.802145466208458, | |
| "epoch": 0.9012265978050356, | |
| "step": 3490 | |
| }, | |
| { | |
| "loss": 0.7342, | |
| "grad_norm": 0.2604316174983978, | |
| "learning_rate": 2.404898701522612e-06, | |
| "entropy": 0.7358214914798736, | |
| "num_tokens": 77976461.0, | |
| "mean_token_accuracy": 0.8025443255901337, | |
| "epoch": 0.9038089089735313, | |
| "step": 3500 | |
| }, | |
| { | |
| "loss": 0.6718, | |
| "grad_norm": 0.27755334973335266, | |
| "learning_rate": 2.278986849611353e-06, | |
| "entropy": 0.6820023268461227, | |
| "num_tokens": 78199709.0, | |
| "mean_token_accuracy": 0.8153169691562653, | |
| "epoch": 0.9063912201420271, | |
| "step": 3510 | |
| }, | |
| { | |
| "loss": 0.7006, | |
| "grad_norm": 0.27558284997940063, | |
| "learning_rate": 2.1563835140204825e-06, | |
| "entropy": 0.6964967548847198, | |
| "num_tokens": 78427884.0, | |
| "mean_token_accuracy": 0.8086929380893707, | |
| "epoch": 0.9089735313105229, | |
| "step": 3520 | |
| }, | |
| { | |
| "loss": 0.6621, | |
| "grad_norm": 0.29580724239349365, | |
| "learning_rate": 2.037097194886728e-06, | |
| "entropy": 0.6729359984397888, | |
| "num_tokens": 78640676.0, | |
| "mean_token_accuracy": 0.8171943426132202, | |
| "epoch": 0.9115558424790188, | |
| "step": 3530 | |
| }, | |
| { | |
| "loss": 0.6216, | |
| "grad_norm": 0.37162065505981445, | |
| "learning_rate": 1.921136162376791e-06, | |
| "entropy": 0.6259115815162659, | |
| "num_tokens": 78854352.0, | |
| "mean_token_accuracy": 0.8265722841024399, | |
| "epoch": 0.9141381536475145, | |
| "step": 3540 | |
| }, | |
| { | |
| "loss": 0.7331, | |
| "grad_norm": 0.2595006227493286, | |
| "learning_rate": 1.8085084561139555e-06, | |
| "entropy": 0.7356774628162384, | |
| "num_tokens": 79083671.0, | |
| "mean_token_accuracy": 0.8027292221784592, | |
| "epoch": 0.9167204648160103, | |
| "step": 3550 | |
| }, | |
| { | |
| "loss": 0.7051, | |
| "grad_norm": 0.2967183589935303, | |
| "learning_rate": 1.6992218846207053e-06, | |
| "entropy": 0.7063945472240448, | |
| "num_tokens": 79299500.0, | |
| "mean_token_accuracy": 0.8102407902479172, | |
| "epoch": 0.9193027759845062, | |
| "step": 3560 | |
| }, | |
| { | |
| "loss": 0.6776, | |
| "grad_norm": 0.31715014576911926, | |
| "learning_rate": 1.593284024777364e-06, | |
| "entropy": 0.678543546795845, | |
| "num_tokens": 79516676.0, | |
| "mean_token_accuracy": 0.81387257874012, | |
| "epoch": 0.921885087153002, | |
| "step": 3570 | |
| }, | |
| { | |
| "loss": 0.6979, | |
| "grad_norm": 0.2775455713272095, | |
| "learning_rate": 1.4907022212967803e-06, | |
| "entropy": 0.6995210900902749, | |
| "num_tokens": 79753818.0, | |
| "mean_token_accuracy": 0.8093364298343658, | |
| "epoch": 0.9244673983214977, | |
| "step": 3580 | |
| }, | |
| { | |
| "loss": 0.6913, | |
| "grad_norm": 0.3358924686908722, | |
| "learning_rate": 1.3914835862151033e-06, | |
| "entropy": 0.6959970772266388, | |
| "num_tokens": 79987679.0, | |
| "mean_token_accuracy": 0.8107724577188492, | |
| "epoch": 0.9270497094899935, | |
| "step": 3590 | |
| }, | |
| { | |
| "loss": 0.721, | |
| "grad_norm": 0.25691378116607666, | |
| "learning_rate": 1.295634998398726e-06, | |
| "entropy": 0.7186200335621834, | |
| "num_tokens": 80219837.0, | |
| "mean_token_accuracy": 0.8068506330251694, | |
| "epoch": 0.9296320206584894, | |
| "step": 3600 | |
| }, | |
| { | |
| "loss": 0.7425, | |
| "grad_norm": 0.291149765253067, | |
| "learning_rate": 1.2031631030673618e-06, | |
| "entropy": 0.7435696452856064, | |
| "num_tokens": 80451637.0, | |
| "mean_token_accuracy": 0.8018748521804809, | |
| "epoch": 0.9322143318269851, | |
| "step": 3610 | |
| }, | |
| { | |
| "loss": 0.6562, | |
| "grad_norm": 0.28476300835609436, | |
| "learning_rate": 1.114074311333313e-06, | |
| "entropy": 0.6657967358827591, | |
| "num_tokens": 80672000.0, | |
| "mean_token_accuracy": 0.818168243765831, | |
| "epoch": 0.9347966429954809, | |
| "step": 3620 | |
| }, | |
| { | |
| "loss": 0.7104, | |
| "grad_norm": 0.3295108377933502, | |
| "learning_rate": 1.0283747997570092e-06, | |
| "entropy": 0.7145670384168625, | |
| "num_tokens": 80885606.0, | |
| "mean_token_accuracy": 0.807201874256134, | |
| "epoch": 0.9373789541639768, | |
| "step": 3630 | |
| }, | |
| { | |
| "loss": 0.7107, | |
| "grad_norm": 0.28491243720054626, | |
| "learning_rate": 9.460705099187783e-07, | |
| "entropy": 0.7149154737591743, | |
| "num_tokens": 81100462.0, | |
| "mean_token_accuracy": 0.8080836564302445, | |
| "epoch": 0.9399612653324726, | |
| "step": 3640 | |
| }, | |
| { | |
| "loss": 0.715, | |
| "grad_norm": 0.3006097376346588, | |
| "learning_rate": 8.671671480069032e-07, | |
| "entropy": 0.7211218193173409, | |
| "num_tokens": 81324223.0, | |
| "mean_token_accuracy": 0.8062219977378845, | |
| "epoch": 0.9425435765009683, | |
| "step": 3650 | |
| }, | |
| { | |
| "loss": 0.6908, | |
| "grad_norm": 0.3136330544948578, | |
| "learning_rate": 7.916701844220109e-07, | |
| "entropy": 0.6916179537773133, | |
| "num_tokens": 81546085.0, | |
| "mean_token_accuracy": 0.8120699375867844, | |
| "epoch": 0.9451258876694641, | |
| "step": 3660 | |
| }, | |
| { | |
| "loss": 0.7581, | |
| "grad_norm": 0.30285340547561646, | |
| "learning_rate": 7.195848533978311e-07, | |
| "entropy": 0.7561731487512589, | |
| "num_tokens": 81764441.0, | |
| "mean_token_accuracy": 0.7981429368257522, | |
| "epoch": 0.94770819883796, | |
| "step": 3670 | |
| }, | |
| { | |
| "loss": 0.7366, | |
| "grad_norm": 0.26118963956832886, | |
| "learning_rate": 6.509161526382701e-07, | |
| "entropy": 0.7330646455287934, | |
| "num_tokens": 81993927.0, | |
| "mean_token_accuracy": 0.8043831527233124, | |
| "epoch": 0.9502905100064558, | |
| "step": 3680 | |
| }, | |
| { | |
| "loss": 0.748, | |
| "grad_norm": 0.27240875363349915, | |
| "learning_rate": 5.856688429709434e-07, | |
| "entropy": 0.7462060987949372, | |
| "num_tokens": 82223112.0, | |
| "mean_token_accuracy": 0.8016072034835815, | |
| "epoch": 0.9528728211749515, | |
| "step": 3690 | |
| }, | |
| { | |
| "loss": 0.6935, | |
| "grad_norm": 0.2734779715538025, | |
| "learning_rate": 5.238474480170952e-07, | |
| "entropy": 0.6943931892514229, | |
| "num_tokens": 82455082.0, | |
| "mean_token_accuracy": 0.8110814154148102, | |
| "epoch": 0.9554551323434474, | |
| "step": 3700 | |
| }, | |
| { | |
| "loss": 0.7164, | |
| "grad_norm": 0.3203067481517792, | |
| "learning_rate": 4.6545625387798274e-07, | |
| "entropy": 0.7151076972484589, | |
| "num_tokens": 82671312.0, | |
| "mean_token_accuracy": 0.8075156539678574, | |
| "epoch": 0.9580374435119432, | |
| "step": 3710 | |
| }, | |
| { | |
| "loss": 0.6983, | |
| "grad_norm": 0.2791406810283661, | |
| "learning_rate": 4.104993088376974e-07, | |
| "entropy": 0.7000140130519867, | |
| "num_tokens": 82901834.0, | |
| "mean_token_accuracy": 0.811114838719368, | |
| "epoch": 0.960619754680439, | |
| "step": 3720 | |
| }, | |
| { | |
| "loss": 0.6859, | |
| "grad_norm": 0.3430984318256378, | |
| "learning_rate": 3.5898042308252224e-07, | |
| "entropy": 0.6891811639070511, | |
| "num_tokens": 83126676.0, | |
| "mean_token_accuracy": 0.8148478597402573, | |
| "epoch": 0.9632020658489348, | |
| "step": 3730 | |
| }, | |
| { | |
| "loss": 0.7634, | |
| "grad_norm": 0.2839677035808563, | |
| "learning_rate": 3.109031684367547e-07, | |
| "entropy": 0.7797995641827583, | |
| "num_tokens": 83330169.0, | |
| "mean_token_accuracy": 0.7926018327474594, | |
| "epoch": 0.9657843770174306, | |
| "step": 3740 | |
| }, | |
| { | |
| "loss": 0.6537, | |
| "grad_norm": 0.2943638563156128, | |
| "learning_rate": 2.662708781150769e-07, | |
| "entropy": 0.6571737483143807, | |
| "num_tokens": 83552644.0, | |
| "mean_token_accuracy": 0.8210361778736115, | |
| "epoch": 0.9683666881859264, | |
| "step": 3750 | |
| }, | |
| { | |
| "loss": 0.6883, | |
| "grad_norm": 0.30458876490592957, | |
| "learning_rate": 2.250866464914625e-07, | |
| "entropy": 0.6957482188940048, | |
| "num_tokens": 83768196.0, | |
| "mean_token_accuracy": 0.8107382476329803, | |
| "epoch": 0.9709489993544222, | |
| "step": 3760 | |
| }, | |
| { | |
| "loss": 0.6383, | |
| "grad_norm": 0.3314257264137268, | |
| "learning_rate": 1.8735332888463742e-07, | |
| "entropy": 0.6453122898936272, | |
| "num_tokens": 83990317.0, | |
| "mean_token_accuracy": 0.8240663319826126, | |
| "epoch": 0.973531310522918, | |
| "step": 3770 | |
| }, | |
| { | |
| "loss": 0.7142, | |
| "grad_norm": 0.30556467175483704, | |
| "learning_rate": 1.5307354136011586e-07, | |
| "entropy": 0.7219583809375762, | |
| "num_tokens": 84209863.0, | |
| "mean_token_accuracy": 0.8086627215147019, | |
| "epoch": 0.9761136216914138, | |
| "step": 3780 | |
| }, | |
| { | |
| "loss": 0.7143, | |
| "grad_norm": 0.29667413234710693, | |
| "learning_rate": 1.22249660548851e-07, | |
| "entropy": 0.7240412354469299, | |
| "num_tokens": 84447812.0, | |
| "mean_token_accuracy": 0.8065494656562805, | |
| "epoch": 0.9786959328599096, | |
| "step": 3790 | |
| }, | |
| { | |
| "loss": 0.7197, | |
| "grad_norm": 0.2865903377532959, | |
| "learning_rate": 9.48838234824223e-08, | |
| "entropy": 0.7270282983779908, | |
| "num_tokens": 84680310.0, | |
| "mean_token_accuracy": 0.8049559354782104, | |
| "epoch": 0.9812782440284055, | |
| "step": 3800 | |
| }, | |
| { | |
| "loss": 0.7106, | |
| "grad_norm": 0.2443910390138626, | |
| "learning_rate": 7.097792744492071e-08, | |
| "entropy": 0.7112312227487564, | |
| "num_tokens": 84909798.0, | |
| "mean_token_accuracy": 0.8073030918836593, | |
| "epoch": 0.9838605551969012, | |
| "step": 3810 | |
| }, | |
| { | |
| "loss": 0.7667, | |
| "grad_norm": 0.3134123384952545, | |
| "learning_rate": 5.053362984137611e-08, | |
| "entropy": 0.765004500746727, | |
| "num_tokens": 85131658.0, | |
| "mean_token_accuracy": 0.7956638902425766, | |
| "epoch": 0.986442866365397, | |
| "step": 3820 | |
| }, | |
| { | |
| "loss": 0.7177, | |
| "grad_norm": 0.3139767646789551, | |
| "learning_rate": 3.355234808284924e-08, | |
| "entropy": 0.7268019631505013, | |
| "num_tokens": 85351505.0, | |
| "mean_token_accuracy": 0.8048588812351227, | |
| "epoch": 0.9890251775338929, | |
| "step": 3830 | |
| }, | |
| { | |
| "loss": 0.7362, | |
| "grad_norm": 0.2681005895137787, | |
| "learning_rate": 2.0035259488182523e-08, | |
| "entropy": 0.74400754570961, | |
| "num_tokens": 85579956.0, | |
| "mean_token_accuracy": 0.8011926084756851, | |
| "epoch": 0.9916074887023887, | |
| "step": 3840 | |
| }, | |
| { | |
| "loss": 0.7149, | |
| "grad_norm": 0.2973273992538452, | |
| "learning_rate": 9.983301202365347e-09, | |
| "entropy": 0.7227866470813751, | |
| "num_tokens": 85807276.0, | |
| "mean_token_accuracy": 0.8073938339948654, | |
| "epoch": 0.9941897998708844, | |
| "step": 3850 | |
| }, | |
| { | |
| "loss": 0.7912, | |
| "grad_norm": 0.3043336272239685, | |
| "learning_rate": 3.3971701315638206e-09, | |
| "entropy": 0.7958255261182785, | |
| "num_tokens": 86031243.0, | |
| "mean_token_accuracy": 0.7904368698596954, | |
| "epoch": 0.9967721110393802, | |
| "step": 3860 | |
| }, | |
| { | |
| "loss": 0.7145, | |
| "grad_norm": 0.3112486004829407, | |
| "learning_rate": 2.7732289478166196e-10, | |
| "entropy": 0.716927382349968, | |
| "num_tokens": 86249568.0, | |
| "mean_token_accuracy": 0.8073446452617645, | |
| "epoch": 0.9993544222078761, | |
| "step": 3870 | |
| }, | |
| { | |
| "train_runtime": 49267.654, | |
| "train_samples_per_second": 2.515, | |
| "train_steps_per_second": 0.079, | |
| "total_flos": 5.541551362900722e+19, | |
| "train_loss": 0.74992161919958, | |
| "entropy": 0.6862897753715516, | |
| "num_tokens": 86293367.0, | |
| "mean_token_accuracy": 0.8136373043060303, | |
| "epoch": 1.0, | |
| "step": 3873 | |
| } | |
| ] |