Text Generation
Transformers
Safetensors
mistral
Generated from Trainer
trl
alignment-handbook
sft
conversational
text-generation-inference
Instructions to use fhalation/zephyr-7b-sft-full with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use fhalation/zephyr-7b-sft-full with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="fhalation/zephyr-7b-sft-full")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```
```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("fhalation/zephyr-7b-sft-full")
model = AutoModelForCausalLM.from_pretrained("fhalation/zephyr-7b-sft-full")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use fhalation/zephyr-7b-sft-full with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "fhalation/zephyr-7b-sft-full"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "fhalation/zephyr-7b-sft-full",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker
docker model run hf.co/fhalation/zephyr-7b-sft-full
- SGLang
How to use fhalation/zephyr-7b-sft-full with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "fhalation/zephyr-7b-sft-full" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "fhalation/zephyr-7b-sft-full",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "fhalation/zephyr-7b-sft-full" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "fhalation/zephyr-7b-sft-full",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
- Docker Model Runner
How to use fhalation/zephyr-7b-sft-full with Docker Model Runner:
docker model run hf.co/fhalation/zephyr-7b-sft-full
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 1797, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0027824151363383415, | |
| "grad_norm": 11.703799339759724, | |
| "learning_rate": 4.444444444444445e-07, | |
| "loss": 1.1659, | |
| "mean_token_accuracy": 0.6976747632026672, | |
| "num_tokens": 585246.0, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.005564830272676683, | |
| "grad_norm": 5.602071651449005, | |
| "learning_rate": 1.0000000000000002e-06, | |
| "loss": 1.0864, | |
| "mean_token_accuracy": 0.7083896398544312, | |
| "num_tokens": 1171311.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.008347245409015025, | |
| "grad_norm": 3.1420241423044755, | |
| "learning_rate": 1.5555555555555558e-06, | |
| "loss": 1.0324, | |
| "mean_token_accuracy": 0.7197834849357605, | |
| "num_tokens": 1757884.0, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.011129660545353366, | |
| "grad_norm": 2.934538436088403, | |
| "learning_rate": 2.1111111111111114e-06, | |
| "loss": 1.0212, | |
| "mean_token_accuracy": 0.7205601453781127, | |
| "num_tokens": 2351106.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.013912075681691708, | |
| "grad_norm": 2.7899043139645996, | |
| "learning_rate": 2.666666666666667e-06, | |
| "loss": 0.9897, | |
| "mean_token_accuracy": 0.7282361865043641, | |
| "num_tokens": 2942453.0, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.01669449081803005, | |
| "grad_norm": 2.618414182725373, | |
| "learning_rate": 3.2222222222222227e-06, | |
| "loss": 0.9967, | |
| "mean_token_accuracy": 0.7268516778945923, | |
| "num_tokens": 3524350.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.019476905954368393, | |
| "grad_norm": 3.312970663728368, | |
| "learning_rate": 3.777777777777778e-06, | |
| "loss": 1.0133, | |
| "mean_token_accuracy": 0.7244957327842713, | |
| "num_tokens": 4116171.0, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.022259321090706732, | |
| "grad_norm": 2.5606592497391945, | |
| "learning_rate": 4.333333333333334e-06, | |
| "loss": 1.0046, | |
| "mean_token_accuracy": 0.724289059638977, | |
| "num_tokens": 4703551.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.025041736227045076, | |
| "grad_norm": 3.539575821864669, | |
| "learning_rate": 4.888888888888889e-06, | |
| "loss": 1.0161, | |
| "mean_token_accuracy": 0.7216586589813232, | |
| "num_tokens": 5283285.0, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.027824151363383415, | |
| "grad_norm": 2.9309023782989936, | |
| "learning_rate": 5.444444444444445e-06, | |
| "loss": 1.0203, | |
| "mean_token_accuracy": 0.7215822339057922, | |
| "num_tokens": 5876874.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.03060656649972176, | |
| "grad_norm": 3.284413853809602, | |
| "learning_rate": 6e-06, | |
| "loss": 0.9989, | |
| "mean_token_accuracy": 0.7253866314888, | |
| "num_tokens": 6470253.0, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.0333889816360601, | |
| "grad_norm": 3.035514802811082, | |
| "learning_rate": 6.555555555555556e-06, | |
| "loss": 0.9791, | |
| "mean_token_accuracy": 0.7286934494972229, | |
| "num_tokens": 7056051.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.036171396772398445, | |
| "grad_norm": 2.8291474051822996, | |
| "learning_rate": 7.111111111111112e-06, | |
| "loss": 0.9859, | |
| "mean_token_accuracy": 0.7266968131065369, | |
| "num_tokens": 7640554.0, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.038953811908736785, | |
| "grad_norm": 2.76432791315742, | |
| "learning_rate": 7.666666666666667e-06, | |
| "loss": 1.0127, | |
| "mean_token_accuracy": 0.7222738623619079, | |
| "num_tokens": 8235176.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.041736227045075125, | |
| "grad_norm": 3.0121797507858985, | |
| "learning_rate": 8.222222222222222e-06, | |
| "loss": 1.033, | |
| "mean_token_accuracy": 0.7161361455917359, | |
| "num_tokens": 8823675.0, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.044518642181413465, | |
| "grad_norm": 2.6579939949703095, | |
| "learning_rate": 8.777777777777778e-06, | |
| "loss": 1.0106, | |
| "mean_token_accuracy": 0.722004747390747, | |
| "num_tokens": 9419065.0, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.04730105731775181, | |
| "grad_norm": 2.756043313002429, | |
| "learning_rate": 9.333333333333334e-06, | |
| "loss": 1.0284, | |
| "mean_token_accuracy": 0.7184515237808228, | |
| "num_tokens": 10018707.0, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.05008347245409015, | |
| "grad_norm": 2.7330649459439487, | |
| "learning_rate": 9.88888888888889e-06, | |
| "loss": 1.0381, | |
| "mean_token_accuracy": 0.716748857498169, | |
| "num_tokens": 10615769.0, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.05286588759042849, | |
| "grad_norm": 3.119124532731729, | |
| "learning_rate": 1.0444444444444445e-05, | |
| "loss": 1.0159, | |
| "mean_token_accuracy": 0.7196141958236695, | |
| "num_tokens": 11206791.0, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.05564830272676683, | |
| "grad_norm": 2.6869099855331524, | |
| "learning_rate": 1.1000000000000001e-05, | |
| "loss": 1.0259, | |
| "mean_token_accuracy": 0.7180516362190247, | |
| "num_tokens": 11794887.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.05843071786310518, | |
| "grad_norm": 2.6746654996398225, | |
| "learning_rate": 1.1555555555555556e-05, | |
| "loss": 1.0068, | |
| "mean_token_accuracy": 0.7229955673217774, | |
| "num_tokens": 12385737.0, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.06121313299944352, | |
| "grad_norm": 2.9016124547228124, | |
| "learning_rate": 1.211111111111111e-05, | |
| "loss": 1.0407, | |
| "mean_token_accuracy": 0.7161330699920654, | |
| "num_tokens": 12974642.0, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.06399554813578186, | |
| "grad_norm": 2.7590693104720887, | |
| "learning_rate": 1.2666666666666667e-05, | |
| "loss": 1.034, | |
| "mean_token_accuracy": 0.7153326869010925, | |
| "num_tokens": 13580517.0, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.0667779632721202, | |
| "grad_norm": 3.0191169781226734, | |
| "learning_rate": 1.3222222222222223e-05, | |
| "loss": 1.0218, | |
| "mean_token_accuracy": 0.7199317574501037, | |
| "num_tokens": 14169869.0, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.06956037840845854, | |
| "grad_norm": 2.673488197212258, | |
| "learning_rate": 1.377777777777778e-05, | |
| "loss": 1.0589, | |
| "mean_token_accuracy": 0.7112971425056458, | |
| "num_tokens": 14758276.0, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.07234279354479689, | |
| "grad_norm": 2.7094276528268795, | |
| "learning_rate": 1.4333333333333334e-05, | |
| "loss": 1.049, | |
| "mean_token_accuracy": 0.712811267375946, | |
| "num_tokens": 15345599.0, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.07512520868113523, | |
| "grad_norm": 2.6373785827567957, | |
| "learning_rate": 1.488888888888889e-05, | |
| "loss": 1.0225, | |
| "mean_token_accuracy": 0.7185760021209717, | |
| "num_tokens": 15932693.0, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.07790762381747357, | |
| "grad_norm": 2.9632223301755154, | |
| "learning_rate": 1.5444444444444446e-05, | |
| "loss": 1.0654, | |
| "mean_token_accuracy": 0.7110099554061889, | |
| "num_tokens": 16520098.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.08069003895381191, | |
| "grad_norm": 2.817380088138206, | |
| "learning_rate": 1.6000000000000003e-05, | |
| "loss": 1.0674, | |
| "mean_token_accuracy": 0.709651243686676, | |
| "num_tokens": 17110470.0, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.08347245409015025, | |
| "grad_norm": 2.6945752947019073, | |
| "learning_rate": 1.6555555555555556e-05, | |
| "loss": 1.054, | |
| "mean_token_accuracy": 0.7122701048851013, | |
| "num_tokens": 17696206.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.08625486922648859, | |
| "grad_norm": 2.397611832219796, | |
| "learning_rate": 1.7111111111111112e-05, | |
| "loss": 1.0735, | |
| "mean_token_accuracy": 0.709396231174469, | |
| "num_tokens": 18292078.0, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.08903728436282693, | |
| "grad_norm": 2.7655354856899517, | |
| "learning_rate": 1.7666666666666668e-05, | |
| "loss": 1.0809, | |
| "mean_token_accuracy": 0.7083318114280701, | |
| "num_tokens": 18888122.0, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.09181969949916527, | |
| "grad_norm": 2.758607000704, | |
| "learning_rate": 1.8222222222222224e-05, | |
| "loss": 1.0822, | |
| "mean_token_accuracy": 0.706460428237915, | |
| "num_tokens": 19484865.0, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.09460211463550362, | |
| "grad_norm": 2.570904335556245, | |
| "learning_rate": 1.877777777777778e-05, | |
| "loss": 1.0826, | |
| "mean_token_accuracy": 0.7078906774520874, | |
| "num_tokens": 20071044.0, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.09738452977184196, | |
| "grad_norm": 2.735490789384101, | |
| "learning_rate": 1.9333333333333333e-05, | |
| "loss": 1.0858, | |
| "mean_token_accuracy": 0.7050360441207886, | |
| "num_tokens": 20662778.0, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.1001669449081803, | |
| "grad_norm": 2.7739191020812655, | |
| "learning_rate": 1.988888888888889e-05, | |
| "loss": 1.0681, | |
| "mean_token_accuracy": 0.7093758583068848, | |
| "num_tokens": 21253812.0, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.10294936004451864, | |
| "grad_norm": 2.8028592827713363, | |
| "learning_rate": 1.9999698027421894e-05, | |
| "loss": 1.0702, | |
| "mean_token_accuracy": 0.7094247817993165, | |
| "num_tokens": 21843322.0, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.10573177518085698, | |
| "grad_norm": 2.595713944513709, | |
| "learning_rate": 1.9998471295079908e-05, | |
| "loss": 1.0458, | |
| "mean_token_accuracy": 0.7138230800628662, | |
| "num_tokens": 22433061.0, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.10851419031719532, | |
| "grad_norm": 2.7873986833719946, | |
| "learning_rate": 1.9996301045360874e-05, | |
| "loss": 1.0974, | |
| "mean_token_accuracy": 0.704916775226593, | |
| "num_tokens": 23021153.0, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.11129660545353366, | |
| "grad_norm": 2.439806161655261, | |
| "learning_rate": 1.9993187483062935e-05, | |
| "loss": 1.0771, | |
| "mean_token_accuracy": 0.7064628720283508, | |
| "num_tokens": 23609275.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.11407902058987202, | |
| "grad_norm": 2.4957981823187763, | |
| "learning_rate": 1.9989130902001025e-05, | |
| "loss": 1.0917, | |
| "mean_token_accuracy": 0.7053624391555786, | |
| "num_tokens": 24205073.0, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.11686143572621036, | |
| "grad_norm": 2.896823560689145, | |
| "learning_rate": 1.9984131684979134e-05, | |
| "loss": 1.1004, | |
| "mean_token_accuracy": 0.7049420475959778, | |
| "num_tokens": 24800240.0, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.1196438508625487, | |
| "grad_norm": 2.5192627022618623, | |
| "learning_rate": 1.997819030375419e-05, | |
| "loss": 1.0623, | |
| "mean_token_accuracy": 0.7119413614273071, | |
| "num_tokens": 25387549.0, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.12242626599888703, | |
| "grad_norm": 2.8132540932895695, | |
| "learning_rate": 1.9971307318991546e-05, | |
| "loss": 1.0915, | |
| "mean_token_accuracy": 0.7074636220932007, | |
| "num_tokens": 25965133.0, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.12520868113522537, | |
| "grad_norm": 35.333375177867765, | |
| "learning_rate": 1.996348338021207e-05, | |
| "loss": 1.114, | |
| "mean_token_accuracy": 0.700543737411499, | |
| "num_tokens": 26563977.0, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.12799109627156371, | |
| "grad_norm": 2.920196605955216, | |
| "learning_rate": 1.9954719225730847e-05, | |
| "loss": 1.1139, | |
| "mean_token_accuracy": 0.7011779904365539, | |
| "num_tokens": 27156932.0, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.13077351140790205, | |
| "grad_norm": 2.60574074107156, | |
| "learning_rate": 1.9945015682587512e-05, | |
| "loss": 1.096, | |
| "mean_token_accuracy": 0.7043320059776306, | |
| "num_tokens": 27754019.0, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.1335559265442404, | |
| "grad_norm": 2.865631892072952, | |
| "learning_rate": 1.9934373666468203e-05, | |
| "loss": 1.0804, | |
| "mean_token_accuracy": 0.706881308555603, | |
| "num_tokens": 28342275.0, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.13633834168057873, | |
| "grad_norm": 2.504324070572632, | |
| "learning_rate": 1.992279418161915e-05, | |
| "loss": 1.099, | |
| "mean_token_accuracy": 0.7036979913711547, | |
| "num_tokens": 28928534.0, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.13912075681691707, | |
| "grad_norm": 2.6138293066655045, | |
| "learning_rate": 1.991027832075192e-05, | |
| "loss": 1.0921, | |
| "mean_token_accuracy": 0.7047542929649353, | |
| "num_tokens": 29513990.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.1419031719532554, | |
| "grad_norm": 2.3711537074020037, | |
| "learning_rate": 1.989682726494028e-05, | |
| "loss": 1.0562, | |
| "mean_token_accuracy": 0.7139820337295533, | |
| "num_tokens": 30113881.0, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.14468558708959378, | |
| "grad_norm": 2.338233428322228, | |
| "learning_rate": 1.988244228350877e-05, | |
| "loss": 1.0811, | |
| "mean_token_accuracy": 0.7071714401245117, | |
| "num_tokens": 30700467.0, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.14746800222593212, | |
| "grad_norm": 2.5175646612189655, | |
| "learning_rate": 1.986712473391289e-05, | |
| "loss": 1.0979, | |
| "mean_token_accuracy": 0.7044062852859497, | |
| "num_tokens": 31292719.0, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.15025041736227046, | |
| "grad_norm": 2.40596554930329, | |
| "learning_rate": 1.9850876061611036e-05, | |
| "loss": 1.092, | |
| "mean_token_accuracy": 0.706499433517456, | |
| "num_tokens": 31883435.0, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.1530328324986088, | |
| "grad_norm": 2.4796442221955384, | |
| "learning_rate": 1.9833697799928074e-05, | |
| "loss": 1.0967, | |
| "mean_token_accuracy": 0.7027202010154724, | |
| "num_tokens": 32481693.0, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.15581524763494714, | |
| "grad_norm": 2.4780125377517663, | |
| "learning_rate": 1.9815591569910654e-05, | |
| "loss": 1.1121, | |
| "mean_token_accuracy": 0.7006395697593689, | |
| "num_tokens": 33071205.0, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.15859766277128548, | |
| "grad_norm": 2.3510385258823976, | |
| "learning_rate": 1.979655908017424e-05, | |
| "loss": 1.0861, | |
| "mean_token_accuracy": 0.7057282090187073, | |
| "num_tokens": 33671052.0, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.16138007790762382, | |
| "grad_norm": 2.5067301813131655, | |
| "learning_rate": 1.9776602126741867e-05, | |
| "loss": 1.0807, | |
| "mean_token_accuracy": 0.7070404767990113, | |
| "num_tokens": 34260518.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.16416249304396216, | |
| "grad_norm": 2.28351268206912, | |
| "learning_rate": 1.975572259287467e-05, | |
| "loss": 1.0803, | |
| "mean_token_accuracy": 0.7072898864746093, | |
| "num_tokens": 34848727.0, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.1669449081803005, | |
| "grad_norm": 2.7544633183153926, | |
| "learning_rate": 1.973392244889415e-05, | |
| "loss": 1.0854, | |
| "mean_token_accuracy": 0.7059407949447631, | |
| "num_tokens": 35437641.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.16972732331663884, | |
| "grad_norm": 2.3830000818389925, | |
| "learning_rate": 1.9711203751996267e-05, | |
| "loss": 1.0988, | |
| "mean_token_accuracy": 0.7048187136650086, | |
| "num_tokens": 36037300.0, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.17250973845297718, | |
| "grad_norm": 2.360850123530632, | |
| "learning_rate": 1.9687568646057277e-05, | |
| "loss": 1.0736, | |
| "mean_token_accuracy": 0.7092967867851258, | |
| "num_tokens": 36629457.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.17529215358931552, | |
| "grad_norm": 2.2707504412701582, | |
| "learning_rate": 1.966301936143146e-05, | |
| "loss": 1.0958, | |
| "mean_token_accuracy": 0.7042588949203491, | |
| "num_tokens": 37224522.0, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.17807456872565386, | |
| "grad_norm": 2.201925130311423, | |
| "learning_rate": 1.9637558214740618e-05, | |
| "loss": 1.0964, | |
| "mean_token_accuracy": 0.7053308248519897, | |
| "num_tokens": 37815851.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.1808569838619922, | |
| "grad_norm": 2.2178593995634697, | |
| "learning_rate": 1.9611187608655484e-05, | |
| "loss": 1.1105, | |
| "mean_token_accuracy": 0.7033011674880981, | |
| "num_tokens": 38396495.0, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.18363939899833054, | |
| "grad_norm": 2.2136118629553185, | |
| "learning_rate": 1.9583910031668984e-05, | |
| "loss": 1.0862, | |
| "mean_token_accuracy": 0.7051831126213074, | |
| "num_tokens": 38994173.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.1864218141346689, | |
| "grad_norm": 2.292573884422775, | |
| "learning_rate": 1.955572805786141e-05, | |
| "loss": 1.1233, | |
| "mean_token_accuracy": 0.7006201148033142, | |
| "num_tokens": 39577921.0, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.18920422927100725, | |
| "grad_norm": 2.2980324001779873, | |
| "learning_rate": 1.9526644346657508e-05, | |
| "loss": 1.1007, | |
| "mean_token_accuracy": 0.7046499371528625, | |
| "num_tokens": 40170313.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.19198664440734559, | |
| "grad_norm": 2.2611492444425916, | |
| "learning_rate": 1.9496661642575517e-05, | |
| "loss": 1.065, | |
| "mean_token_accuracy": 0.7105947017669678, | |
| "num_tokens": 40765429.0, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.19476905954368393, | |
| "grad_norm": 2.1512493790533185, | |
| "learning_rate": 1.946578277496821e-05, | |
| "loss": 1.0917, | |
| "mean_token_accuracy": 0.7073456883430481, | |
| "num_tokens": 41365863.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.19755147468002227, | |
| "grad_norm": 2.271847848176885, | |
| "learning_rate": 1.943401065775584e-05, | |
| "loss": 1.1011, | |
| "mean_token_accuracy": 0.7054303050041199, | |
| "num_tokens": 41957368.0, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.2003338898163606, | |
| "grad_norm": 2.222128508786771, | |
| "learning_rate": 1.940134828915123e-05, | |
| "loss": 1.1086, | |
| "mean_token_accuracy": 0.7034181118011474, | |
| "num_tokens": 42546312.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.20311630495269895, | |
| "grad_norm": 2.159135050960954, | |
| "learning_rate": 1.936779875137678e-05, | |
| "loss": 1.0821, | |
| "mean_token_accuracy": 0.707557737827301, | |
| "num_tokens": 43141094.0, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.20589872008903728, | |
| "grad_norm": 2.2055966042491546, | |
| "learning_rate": 1.9333365210373668e-05, | |
| "loss": 1.0902, | |
| "mean_token_accuracy": 0.705539345741272, | |
| "num_tokens": 43731225.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.20868113522537562, | |
| "grad_norm": 2.1858790427306785, | |
| "learning_rate": 1.9298050915503053e-05, | |
| "loss": 1.1066, | |
| "mean_token_accuracy": 0.7038124799728394, | |
| "num_tokens": 44311286.0, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.21146355036171396, | |
| "grad_norm": 2.204033703125247, | |
| "learning_rate": 1.926185919923946e-05, | |
| "loss": 1.0971, | |
| "mean_token_accuracy": 0.7054288148880005, | |
| "num_tokens": 44906755.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.2142459654980523, | |
| "grad_norm": 2.2598831388779694, | |
| "learning_rate": 1.9224793476856293e-05, | |
| "loss": 1.1201, | |
| "mean_token_accuracy": 0.700083589553833, | |
| "num_tokens": 45491095.0, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.21702838063439064, | |
| "grad_norm": 2.1902124071852977, | |
| "learning_rate": 1.9186857246103586e-05, | |
| "loss": 1.079, | |
| "mean_token_accuracy": 0.7079141974449158, | |
| "num_tokens": 46084794.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.21981079577072898, | |
| "grad_norm": 2.225166806223554, | |
| "learning_rate": 1.9148054086877884e-05, | |
| "loss": 1.0965, | |
| "mean_token_accuracy": 0.7044667720794677, | |
| "num_tokens": 46674587.0, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.22259321090706732, | |
| "grad_norm": 2.110266160830183, | |
| "learning_rate": 1.9108387660884456e-05, | |
| "loss": 1.1019, | |
| "mean_token_accuracy": 0.7042423367500306, | |
| "num_tokens": 47263613.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.22537562604340566, | |
| "grad_norm": 2.137008460009227, | |
| "learning_rate": 1.9067861711291744e-05, | |
| "loss": 1.0984, | |
| "mean_token_accuracy": 0.7045777201652527, | |
| "num_tokens": 47848405.0, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.22815804117974403, | |
| "grad_norm": 2.5090054849409573, | |
| "learning_rate": 1.9026480062378136e-05, | |
| "loss": 1.1232, | |
| "mean_token_accuracy": 0.7006029844284057, | |
| "num_tokens": 48440420.0, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.23094045631608237, | |
| "grad_norm": 2.5298900029195934, | |
| "learning_rate": 1.8984246619171075e-05, | |
| "loss": 1.0998, | |
| "mean_token_accuracy": 0.7040575265884399, | |
| "num_tokens": 49026577.0, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.2337228714524207, | |
| "grad_norm": 2.243788572378369, | |
| "learning_rate": 1.894116536707857e-05, | |
| "loss": 1.0931, | |
| "mean_token_accuracy": 0.7059786558151245, | |
| "num_tokens": 49618303.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.23650528658875905, | |
| "grad_norm": 2.2973971910882853, | |
| "learning_rate": 1.8897240371513098e-05, | |
| "loss": 1.1076, | |
| "mean_token_accuracy": 0.7032187581062317, | |
| "num_tokens": 50211716.0, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.2392877017250974, | |
| "grad_norm": 2.121159415163043, | |
| "learning_rate": 1.8852475777507983e-05, | |
| "loss": 1.0882, | |
| "mean_token_accuracy": 0.7079625129699707, | |
| "num_tokens": 50806268.0, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.24207011686143573, | |
| "grad_norm": 2.2653088399729593, | |
| "learning_rate": 1.8806875809326204e-05, | |
| "loss": 1.0988, | |
| "mean_token_accuracy": 0.7044902324676514, | |
| "num_tokens": 51395551.0, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.24485253199777407, | |
| "grad_norm": 2.0640694310319647, | |
| "learning_rate": 1.876044477006183e-05, | |
| "loss": 1.1057, | |
| "mean_token_accuracy": 0.7019346117973327, | |
| "num_tokens": 51988430.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.2476349471341124, | |
| "grad_norm": 2.1331480343408225, | |
| "learning_rate": 1.8713187041233896e-05, | |
| "loss": 1.0845, | |
| "mean_token_accuracy": 0.7060743689537048, | |
| "num_tokens": 52583147.0, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.25041736227045075, | |
| "grad_norm": 2.4013534842444186, | |
| "learning_rate": 1.866510708237297e-05, | |
| "loss": 1.0979, | |
| "mean_token_accuracy": 0.7047066450119018, | |
| "num_tokens": 53181352.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.2531997774067891, | |
| "grad_norm": 2.3023711179533226, | |
| "learning_rate": 1.861620943060031e-05, | |
| "loss": 1.1275, | |
| "mean_token_accuracy": 0.6983560442924499, | |
| "num_tokens": 53772836.0, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.25598219254312743, | |
| "grad_norm": 2.2577981667782208, | |
| "learning_rate": 1.856649870019972e-05, | |
| "loss": 1.0957, | |
| "mean_token_accuracy": 0.7056548476219178, | |
| "num_tokens": 54367700.0, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.2587646076794658, | |
| "grad_norm": 2.676938686558113, | |
| "learning_rate": 1.8515979582182112e-05, | |
| "loss": 1.0906, | |
| "mean_token_accuracy": 0.707176685333252, | |
| "num_tokens": 54960810.0, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.2615470228158041, | |
| "grad_norm": 2.4165806936926133, | |
| "learning_rate": 1.8464656843842837e-05, | |
| "loss": 1.0897, | |
| "mean_token_accuracy": 0.7070010900497437, | |
| "num_tokens": 55550003.0, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.2643294379521425, | |
| "grad_norm": 2.461649569325264, | |
| "learning_rate": 1.8412535328311813e-05, | |
| "loss": 1.1121, | |
| "mean_token_accuracy": 0.7028052568435669, | |
| "num_tokens": 56136218.0, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.2671118530884808, | |
| "grad_norm": 2.1804919862847365, | |
| "learning_rate": 1.8359619954096497e-05, | |
| "loss": 1.1076, | |
| "mean_token_accuracy": 0.7032665610313416, | |
| "num_tokens": 56726599.0, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.26989426822481916, | |
| "grad_norm": 2.68554258980135, | |
| "learning_rate": 1.8305915714617745e-05, | |
| "loss": 1.0993, | |
| "mean_token_accuracy": 0.7033315062522888, | |
| "num_tokens": 57321297.0, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.27267668336115747, | |
| "grad_norm": 2.3164265508749544, | |
| "learning_rate": 1.8251427677738596e-05, | |
| "loss": 1.067, | |
| "mean_token_accuracy": 0.710555636882782, | |
| "num_tokens": 57913003.0, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.27545909849749584, | |
| "grad_norm": 2.25329369986598, | |
| "learning_rate": 1.8196160985286052e-05, | |
| "loss": 1.0913, | |
| "mean_token_accuracy": 0.708107590675354, | |
| "num_tokens": 58499228.0, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.27824151363383415, | |
| "grad_norm": 2.2429809416951656, | |
| "learning_rate": 1.814012085256585e-05, | |
| "loss": 1.0993, | |
| "mean_token_accuracy": 0.7040925621986389, | |
| "num_tokens": 59090708.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.2810239287701725, | |
| "grad_norm": 2.2073249069907384, | |
| "learning_rate": 1.8083312567870315e-05, | |
| "loss": 1.0879, | |
| "mean_token_accuracy": 0.7081225514411926, | |
| "num_tokens": 59685930.0, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.2838063439065108, | |
| "grad_norm": 2.0857359362520214, | |
| "learning_rate": 1.8025741491979326e-05, | |
| "loss": 1.0616, | |
| "mean_token_accuracy": 0.7111712694168091, | |
| "num_tokens": 60280434.0, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.2865887590428492, | |
| "grad_norm": 2.163016097659582, | |
| "learning_rate": 1.7967413057654452e-05, | |
| "loss": 1.0775, | |
| "mean_token_accuracy": 0.7096009373664856, | |
| "num_tokens": 60868682.0, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.28937117417918756, | |
| "grad_norm": 2.3117437321963306, | |
| "learning_rate": 1.7908332769126255e-05, | |
| "loss": 1.1076, | |
| "mean_token_accuracy": 0.7027746677398682, | |
| "num_tokens": 61458691.0, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.2921535893155259, | |
| "grad_norm": 2.4449046203168856, | |
| "learning_rate": 1.784850620157491e-05, | |
| "loss": 1.0963, | |
| "mean_token_accuracy": 0.7077104687690735, | |
| "num_tokens": 62050298.0, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.29493600445186424, | |
| "grad_norm": 2.2894244206650574, | |
| "learning_rate": 1.7787939000604063e-05, | |
| "loss": 1.074, | |
| "mean_token_accuracy": 0.709146237373352, | |
| "num_tokens": 62641275.0, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.29771841958820255, | |
| "grad_norm": 2.235175132800985, | |
| "learning_rate": 1.7726636881708114e-05, | |
| "loss": 1.0921, | |
| "mean_token_accuracy": 0.7072658061981201, | |
| "num_tokens": 63230436.0, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.3005008347245409, | |
| "grad_norm": 2.1669106168397665, | |
| "learning_rate": 1.7664605629732832e-05, | |
| "loss": 1.0954, | |
| "mean_token_accuracy": 0.7046370029449462, | |
| "num_tokens": 63818119.0, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.30328324986087923, | |
| "grad_norm": 2.1441378931657478, | |
| "learning_rate": 1.7601851098329484e-05, | |
| "loss": 1.0671, | |
| "mean_token_accuracy": 0.710686981678009, | |
| "num_tokens": 64410016.0, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.3060656649972176, | |
| "grad_norm": 2.124149203635388, | |
| "learning_rate": 1.7538379209402442e-05, | |
| "loss": 1.0893, | |
| "mean_token_accuracy": 0.7079866886138916, | |
| "num_tokens": 65008878.0, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.3088480801335559, | |
| "grad_norm": 2.092451847928283, | |
| "learning_rate": 1.7474195952550355e-05, | |
| "loss": 1.0911, | |
| "mean_token_accuracy": 0.7058361053466797, | |
| "num_tokens": 65591920.0, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.3116304952698943, | |
| "grad_norm": 1.979401747526117, | |
| "learning_rate": 1.7409307384500932e-05, | |
| "loss": 1.0781, | |
| "mean_token_accuracy": 0.7093043208122254, | |
| "num_tokens": 66183326.0, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.3144129104062326, | |
| "grad_norm": 2.3013535796680133, | |
| "learning_rate": 1.7343719628539396e-05, | |
| "loss": 1.1062, | |
| "mean_token_accuracy": 0.7034829258918762, | |
| "num_tokens": 66770419.0, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.31719532554257096, | |
| "grad_norm": 2.232144210677623, | |
| "learning_rate": 1.7277438873930654e-05, | |
| "loss": 1.0888, | |
| "mean_token_accuracy": 0.7088476419448853, | |
| "num_tokens": 67356232.0, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.3199777406789093, | |
| "grad_norm": 2.2615310284916426, | |
| "learning_rate": 1.7210471375335225e-05, | |
| "loss": 1.0762, | |
| "mean_token_accuracy": 0.7096709370613098, | |
| "num_tokens": 67948261.0, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.32276015581524764, | |
| "grad_norm": 2.069623037881395, | |
| "learning_rate": 1.7142823452219036e-05, | |
| "loss": 1.0584, | |
| "mean_token_accuracy": 0.7133225679397583, | |
| "num_tokens": 68530816.0, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.32554257095158595, | |
| "grad_norm": 2.01949110828742, | |
| "learning_rate": 1.7074501488257062e-05, | |
| "loss": 1.0771, | |
| "mean_token_accuracy": 0.7082255363464356, | |
| "num_tokens": 69121402.0, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.3283249860879243, | |
| "grad_norm": 2.114712605110562, | |
| "learning_rate": 1.700551193073092e-05, | |
| "loss": 1.0434, | |
| "mean_token_accuracy": 0.7137895464897156, | |
| "num_tokens": 69707900.0, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.3311074012242627, | |
| "grad_norm": 2.1204083275143533, | |
| "learning_rate": 1.693586128992048e-05, | |
| "loss": 1.0753, | |
| "mean_token_accuracy": 0.7090141296386718, | |
| "num_tokens": 70297299.0, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.333889816360601, | |
| "grad_norm": 2.170713633905745, | |
| "learning_rate": 1.6865556138489497e-05, | |
| "loss": 1.0944, | |
| "mean_token_accuracy": 0.706296420097351, | |
| "num_tokens": 70886257.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.33667223149693937, | |
| "grad_norm": 2.1428343367458074, | |
| "learning_rate": 1.6794603110865396e-05, | |
| "loss": 1.0871, | |
| "mean_token_accuracy": 0.7076637268066406, | |
| "num_tokens": 71474356.0, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.3394546466332777, | |
| "grad_norm": 3.3075272432648886, | |
| "learning_rate": 1.672300890261317e-05, | |
| "loss": 1.044, | |
| "mean_token_accuracy": 0.7172706961631775, | |
| "num_tokens": 72059816.0, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.34223706176961605, | |
| "grad_norm": 2.1270286082703573, | |
| "learning_rate": 1.6650780269803587e-05, | |
| "loss": 1.0863, | |
| "mean_token_accuracy": 0.7074844360351562, | |
| "num_tokens": 72652774.0, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.34501947690595436, | |
| "grad_norm": 2.100968885315603, | |
| "learning_rate": 1.6577924028375622e-05, | |
| "loss": 1.0677, | |
| "mean_token_accuracy": 0.71006840467453, | |
| "num_tokens": 73239819.0, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.3478018920422927, | |
| "grad_norm": 2.008632532868866, | |
| "learning_rate": 1.6504447053493264e-05, | |
| "loss": 1.0645, | |
| "mean_token_accuracy": 0.7101643443107605, | |
| "num_tokens": 73831159.0, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.35058430717863104, | |
| "grad_norm": 2.0858240283477545, | |
| "learning_rate": 1.643035627889674e-05, | |
| "loss": 1.0717, | |
| "mean_token_accuracy": 0.7094730496406555, | |
| "num_tokens": 74422688.0, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.3533667223149694, | |
| "grad_norm": 2.1384864549062197, | |
| "learning_rate": 1.63556586962482e-05, | |
| "loss": 1.1, | |
| "mean_token_accuracy": 0.7050098419189453, | |
| "num_tokens": 75009215.0, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.3561491374513077, | |
| "grad_norm": 2.106464767907768, | |
| "learning_rate": 1.628036135447194e-05, | |
| "loss": 1.0894, | |
| "mean_token_accuracy": 0.707071328163147, | |
| "num_tokens": 75598228.0, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.3589315525876461, | |
| "grad_norm": 2.0551179685857144, | |
| "learning_rate": 1.6204471359089224e-05, | |
| "loss": 1.0785, | |
| "mean_token_accuracy": 0.7078182816505432, | |
| "num_tokens": 76186740.0, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.3617139677239844, | |
| "grad_norm": 2.1646737527032314, | |
| "learning_rate": 1.612799587154777e-05, | |
| "loss": 1.07, | |
| "mean_token_accuracy": 0.7111572623252869, | |
| "num_tokens": 76774832.0, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.36449638286032277, | |
| "grad_norm": 2.104548936492404, | |
| "learning_rate": 1.6050942108545938e-05, | |
| "loss": 1.0747, | |
| "mean_token_accuracy": 0.7105032086372376, | |
| "num_tokens": 77363315.0, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.3672787979966611, | |
| "grad_norm": 2.0651187265677216, | |
| "learning_rate": 1.5973317341351725e-05, | |
| "loss": 1.0697, | |
| "mean_token_accuracy": 0.7097868919372559, | |
| "num_tokens": 77951799.0, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.37006121313299944, | |
| "grad_norm": 1.9772941125582544, | |
| "learning_rate": 1.58951288951166e-05, | |
| "loss": 1.0703, | |
| "mean_token_accuracy": 0.7106229305267334, | |
| "num_tokens": 78551404.0, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.3728436282693378, | |
| "grad_norm": 2.081741954302233, | |
| "learning_rate": 1.5816384148184273e-05, | |
| "loss": 1.0564, | |
| "mean_token_accuracy": 0.7130509853363037, | |
| "num_tokens": 79148333.0, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.3756260434056761, | |
| "grad_norm": 2.116111905318221, | |
| "learning_rate": 1.57370905313944e-05, | |
| "loss": 1.0901, | |
| "mean_token_accuracy": 0.7071909785270691, | |
| "num_tokens": 79731743.0, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.3784084585420145, | |
| "grad_norm": 2.010006167198628, | |
| "learning_rate": 1.5657255527381395e-05, | |
| "loss": 1.0741, | |
| "mean_token_accuracy": 0.7091400980949402, | |
| "num_tokens": 80332028.0, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.3811908736783528, | |
| "grad_norm": 2.0107259580616956, | |
| "learning_rate": 1.5576886669868297e-05, | |
| "loss": 1.0492, | |
| "mean_token_accuracy": 0.7131890416145324, | |
| "num_tokens": 80923863.0, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.38397328881469117, | |
| "grad_norm": 1.9791050603322653, | |
| "learning_rate": 1.5495991542955855e-05, | |
| "loss": 1.0503, | |
| "mean_token_accuracy": 0.7160694479942322, | |
| "num_tokens": 81512560.0, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.3867557039510295, | |
| "grad_norm": 1.9490662087580195, | |
| "learning_rate": 1.541457778040684e-05, | |
| "loss": 1.0529, | |
| "mean_token_accuracy": 0.7135980725288391, | |
| "num_tokens": 82097379.0, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.38953811908736785, | |
| "grad_norm": 2.0179615858909377, | |
| "learning_rate": 1.5332653064925683e-05, | |
| "loss": 1.0519, | |
| "mean_token_accuracy": 0.7147277235984802, | |
| "num_tokens": 82685268.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.39232053422370616, | |
| "grad_norm": 2.0450453857514175, | |
| "learning_rate": 1.5250225127433485e-05, | |
| "loss": 1.043, | |
| "mean_token_accuracy": 0.7144908547401428, | |
| "num_tokens": 83277230.0, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 0.39510294936004453, | |
| "grad_norm": 1.9276469987768912, | |
| "learning_rate": 1.5167301746338466e-05, | |
| "loss": 1.0784, | |
| "mean_token_accuracy": 0.7108999609947204, | |
| "num_tokens": 83861406.0, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.39788536449638284, | |
| "grad_norm": 1.9433023381927899, | |
| "learning_rate": 1.5083890746801962e-05, | |
| "loss": 1.0597, | |
| "mean_token_accuracy": 0.7121692419052124, | |
| "num_tokens": 84459146.0, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 0.4006677796327212, | |
| "grad_norm": 1.9546874431863348, | |
| "learning_rate": 1.5000000000000002e-05, | |
| "loss": 1.0919, | |
| "mean_token_accuracy": 0.7079582571983337, | |
| "num_tokens": 85049969.0, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.4034501947690595, | |
| "grad_norm": 1.9563600752313475, | |
| "learning_rate": 1.491563742238051e-05, | |
| "loss": 1.0692, | |
| "mean_token_accuracy": 0.7110470652580261, | |
| "num_tokens": 85638302.0, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.4062326099053979, | |
| "grad_norm": 1.9290273436145757, | |
| "learning_rate": 1.483081097491628e-05, | |
| "loss": 1.0697, | |
| "mean_token_accuracy": 0.7116230726242065, | |
| "num_tokens": 86229643.0, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.4090150250417362, | |
| "grad_norm": 1.9858611430728594, | |
| "learning_rate": 1.4745528662353728e-05, | |
| "loss": 1.0483, | |
| "mean_token_accuracy": 0.7151864290237426, | |
| "num_tokens": 86822104.0, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 0.41179744017807457, | |
| "grad_norm": 2.04642483648034, | |
| "learning_rate": 1.4659798532457497e-05, | |
| "loss": 1.0775, | |
| "mean_token_accuracy": 0.7090552926063538, | |
| "num_tokens": 87413792.0, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.41457985531441294, | |
| "grad_norm": 1.9893794431614882, | |
| "learning_rate": 1.4573628675251051e-05, | |
| "loss": 1.05, | |
| "mean_token_accuracy": 0.7146545886993408, | |
| "num_tokens": 88001772.0, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 0.41736227045075125, | |
| "grad_norm": 1.9071569454284205, | |
| "learning_rate": 1.4487027222253216e-05, | |
| "loss": 1.071, | |
| "mean_token_accuracy": 0.7112080335617066, | |
| "num_tokens": 88586368.0, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.4201446855870896, | |
| "grad_norm": 2.1261499946440106, | |
| "learning_rate": 1.4400002345710871e-05, | |
| "loss": 1.053, | |
| "mean_token_accuracy": 0.7138799786567688, | |
| "num_tokens": 89169649.0, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 0.42292710072342793, | |
| "grad_norm": 2.0601232013853354, | |
| "learning_rate": 1.4312562257827742e-05, | |
| "loss": 1.0592, | |
| "mean_token_accuracy": 0.7137506484985352, | |
| "num_tokens": 89758883.0, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.4257095158597663, | |
| "grad_norm": 2.0053263574303126, | |
| "learning_rate": 1.4224715209989463e-05, | |
| "loss": 1.0762, | |
| "mean_token_accuracy": 0.7106667995452881, | |
| "num_tokens": 90343260.0, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 0.4284919309961046, | |
| "grad_norm": 2.040531463590581, | |
| "learning_rate": 1.4136469491984913e-05, | |
| "loss": 1.0532, | |
| "mean_token_accuracy": 0.7144197583198547, | |
| "num_tokens": 90931881.0, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.431274346132443, | |
| "grad_norm": 2.1902642918655353, | |
| "learning_rate": 1.4047833431223938e-05, | |
| "loss": 1.0688, | |
| "mean_token_accuracy": 0.7094583511352539, | |
| "num_tokens": 91515784.0, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.4340567612687813, | |
| "grad_norm": 2.128849928536235, | |
| "learning_rate": 1.3958815391951552e-05, | |
| "loss": 1.0675, | |
| "mean_token_accuracy": 0.7103721380233765, | |
| "num_tokens": 92113098.0, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.43683917640511966, | |
| "grad_norm": 1.9500214149444213, | |
| "learning_rate": 1.3869423774458594e-05, | |
| "loss": 1.0728, | |
| "mean_token_accuracy": 0.7097015857696534, | |
| "num_tokens": 92709566.0, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 0.43962159154145797, | |
| "grad_norm": 1.8764892174897658, | |
| "learning_rate": 1.3779667014289067e-05, | |
| "loss": 1.0431, | |
| "mean_token_accuracy": 0.7169391632080078, | |
| "num_tokens": 93292537.0, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.44240400667779634, | |
| "grad_norm": 2.10238424716131, | |
| "learning_rate": 1.3689553581444069e-05, | |
| "loss": 1.0145, | |
| "mean_token_accuracy": 0.7227911353111267, | |
| "num_tokens": 93878784.0, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 0.44518642181413465, | |
| "grad_norm": 2.0641117161806033, | |
| "learning_rate": 1.3599091979582537e-05, | |
| "loss": 1.0576, | |
| "mean_token_accuracy": 0.7129832863807678, | |
| "num_tokens": 94467072.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.447968836950473, | |
| "grad_norm": 2.0209262573363143, | |
| "learning_rate": 1.3508290745218789e-05, | |
| "loss": 1.0281, | |
| "mean_token_accuracy": 0.7192481160163879, | |
| "num_tokens": 95055139.0, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 0.4507512520868113, | |
| "grad_norm": 1.9960033545510802, | |
| "learning_rate": 1.341715844691695e-05, | |
| "loss": 1.0381, | |
| "mean_token_accuracy": 0.7170910716056824, | |
| "num_tokens": 95643923.0, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.4535336672231497, | |
| "grad_norm": 2.666908392602128, | |
| "learning_rate": 1.3325703684482383e-05, | |
| "loss": 1.0911, | |
| "mean_token_accuracy": 0.7066366791725158, | |
| "num_tokens": 96229319.0, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 0.45631608235948806, | |
| "grad_norm": 1.9412627867610472, | |
| "learning_rate": 1.3233935088150154e-05, | |
| "loss": 1.044, | |
| "mean_token_accuracy": 0.7168261289596558, | |
| "num_tokens": 96825493.0, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.4590984974958264, | |
| "grad_norm": 1.8919553931638313, | |
| "learning_rate": 1.3141861317770628e-05, | |
| "loss": 1.0856, | |
| "mean_token_accuracy": 0.708315372467041, | |
| "num_tokens": 97415636.0, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.46188091263216474, | |
| "grad_norm": 1.979892135995854, | |
| "learning_rate": 1.3049491061992274e-05, | |
| "loss": 1.0411, | |
| "mean_token_accuracy": 0.716647469997406, | |
| "num_tokens": 98008396.0, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.46466332776850305, | |
| "grad_norm": 1.941207079118909, | |
| "learning_rate": 1.2956833037441756e-05, | |
| "loss": 1.0489, | |
| "mean_token_accuracy": 0.7146740078926086, | |
| "num_tokens": 98593026.0, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 0.4674457429048414, | |
| "grad_norm": 2.184633389681683, | |
| "learning_rate": 1.2863895987901364e-05, | |
| "loss": 1.0746, | |
| "mean_token_accuracy": 0.7111501693725586, | |
| "num_tokens": 99185818.0, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.47022815804117973, | |
| "grad_norm": 1.9997670534792031, | |
| "learning_rate": 1.2770688683483914e-05, | |
| "loss": 1.0701, | |
| "mean_token_accuracy": 0.708341383934021, | |
| "num_tokens": 99774152.0, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 0.4730105731775181, | |
| "grad_norm": 2.0094775586736953, | |
| "learning_rate": 1.2677219919805137e-05, | |
| "loss": 1.0455, | |
| "mean_token_accuracy": 0.7151992082595825, | |
| "num_tokens": 100363649.0, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.4757929883138564, | |
| "grad_norm": 2.159247041588428, | |
| "learning_rate": 1.2583498517153662e-05, | |
| "loss": 1.0338, | |
| "mean_token_accuracy": 0.7189494609832764, | |
| "num_tokens": 100957067.0, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 0.4785754034501948, | |
| "grad_norm": 2.028288393918697, | |
| "learning_rate": 1.2489533319658703e-05, | |
| "loss": 1.0394, | |
| "mean_token_accuracy": 0.7162809491157531, | |
| "num_tokens": 101549170.0, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.4813578185865331, | |
| "grad_norm": 2.03306725822367, | |
| "learning_rate": 1.2395333194455444e-05, | |
| "loss": 1.0468, | |
| "mean_token_accuracy": 0.7151136279106141, | |
| "num_tokens": 102142380.0, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 0.48414023372287146, | |
| "grad_norm": 2.1682696530364374, | |
| "learning_rate": 1.2300907030848307e-05, | |
| "loss": 1.0554, | |
| "mean_token_accuracy": 0.7153695344924926, | |
| "num_tokens": 102734295.0, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.4869226488592098, | |
| "grad_norm": 2.019722631206529, | |
| "learning_rate": 1.2206263739472085e-05, | |
| "loss": 1.0397, | |
| "mean_token_accuracy": 0.7160439848899841, | |
| "num_tokens": 103319783.0, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.48970506399554814, | |
| "grad_norm": 1.9817132841610883, | |
| "learning_rate": 1.2111412251451085e-05, | |
| "loss": 1.0487, | |
| "mean_token_accuracy": 0.7163015246391297, | |
| "num_tokens": 103911953.0, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.49248747913188645, | |
| "grad_norm": 1.924001725795815, | |
| "learning_rate": 1.2016361517556334e-05, | |
| "loss": 1.0267, | |
| "mean_token_accuracy": 0.7179745554924011, | |
| "num_tokens": 104499490.0, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 0.4952698942682248, | |
| "grad_norm": 1.9061990735413328, | |
| "learning_rate": 1.1921120507360934e-05, | |
| "loss": 1.0194, | |
| "mean_token_accuracy": 0.721126937866211, | |
| "num_tokens": 105087086.0, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.4980523094045632, | |
| "grad_norm": 2.0609228249311635, | |
| "learning_rate": 1.182569820839362e-05, | |
| "loss": 1.0241, | |
| "mean_token_accuracy": 0.7190962195396423, | |
| "num_tokens": 105676890.0, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 0.5008347245409015, | |
| "grad_norm": 2.136985315971352, | |
| "learning_rate": 1.1730103625290658e-05, | |
| "loss": 1.0727, | |
| "mean_token_accuracy": 0.7091086864471435, | |
| "num_tokens": 106260405.0, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.5036171396772399, | |
| "grad_norm": 1.8617004459073545, | |
| "learning_rate": 1.1634345778946112e-05, | |
| "loss": 1.032, | |
| "mean_token_accuracy": 0.7186322927474975, | |
| "num_tokens": 106854042.0, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 0.5063995548135782, | |
| "grad_norm": 1.9034875739816928, | |
| "learning_rate": 1.1538433705660561e-05, | |
| "loss": 1.0323, | |
| "mean_token_accuracy": 0.7186863660812378, | |
| "num_tokens": 107444483.0, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.5091819699499165, | |
| "grad_norm": 2.0209462331889156, | |
| "learning_rate": 1.1442376456288402e-05, | |
| "loss": 1.0378, | |
| "mean_token_accuracy": 0.7178295731544495, | |
| "num_tokens": 108034521.0, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 0.5119643850862549, | |
| "grad_norm": 1.9075532608687007, | |
| "learning_rate": 1.1346183095383731e-05, | |
| "loss": 1.0475, | |
| "mean_token_accuracy": 0.7155048370361328, | |
| "num_tokens": 108621638.0, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.5147468002225932, | |
| "grad_norm": 2.1926421106061165, | |
| "learning_rate": 1.1249862700344969e-05, | |
| "loss": 1.0305, | |
| "mean_token_accuracy": 0.7172364115715026, | |
| "num_tokens": 109218688.0, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 0.5175292153589316, | |
| "grad_norm": 1.9946226985852828, | |
| "learning_rate": 1.1153424360558268e-05, | |
| "loss": 1.0339, | |
| "mean_token_accuracy": 0.716673743724823, | |
| "num_tokens": 109808908.0, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.5203116304952699, | |
| "grad_norm": 1.9329841963207612, | |
| "learning_rate": 1.1056877176539767e-05, | |
| "loss": 1.0291, | |
| "mean_token_accuracy": 0.7183609366416931, | |
| "num_tokens": 110396483.0, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 0.5230940456316082, | |
| "grad_norm": 2.013170356946493, | |
| "learning_rate": 1.0960230259076819e-05, | |
| "loss": 1.0456, | |
| "mean_token_accuracy": 0.7170237064361572, | |
| "num_tokens": 110985830.0, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.5258764607679466, | |
| "grad_norm": 1.8999726413056113, | |
| "learning_rate": 1.086349272836824e-05, | |
| "loss": 1.0313, | |
| "mean_token_accuracy": 0.7180803418159485, | |
| "num_tokens": 111581446.0, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 0.528658875904285, | |
| "grad_norm": 2.0409502249425335, | |
| "learning_rate": 1.0766673713163667e-05, | |
| "loss": 1.0261, | |
| "mean_token_accuracy": 0.717565405368805, | |
| "num_tokens": 112170288.0, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.5314412910406232, | |
| "grad_norm": 1.885899964182058, | |
| "learning_rate": 1.0669782349902122e-05, | |
| "loss": 1.0363, | |
| "mean_token_accuracy": 0.7165609478950501, | |
| "num_tokens": 112758548.0, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 0.5342237061769616, | |
| "grad_norm": 1.8421326453167242, | |
| "learning_rate": 1.0572827781849835e-05, | |
| "loss": 1.0248, | |
| "mean_token_accuracy": 0.7190156698226928, | |
| "num_tokens": 113351756.0, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.5370061213132999, | |
| "grad_norm": 1.9626347976920178, | |
| "learning_rate": 1.0475819158237426e-05, | |
| "loss": 1.0484, | |
| "mean_token_accuracy": 0.716618275642395, | |
| "num_tokens": 113929618.0, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 0.5397885364496383, | |
| "grad_norm": 1.9260027594244913, | |
| "learning_rate": 1.0378765633396526e-05, | |
| "loss": 1.0122, | |
| "mean_token_accuracy": 0.7219829797744751, | |
| "num_tokens": 114511171.0, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.5425709515859767, | |
| "grad_norm": 2.01788882274496, | |
| "learning_rate": 1.0281676365895939e-05, | |
| "loss": 1.0341, | |
| "mean_token_accuracy": 0.7173137307167053, | |
| "num_tokens": 115100329.0, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 0.5453533667223149, | |
| "grad_norm": 2.0538865520683554, | |
| "learning_rate": 1.0184560517677353e-05, | |
| "loss": 1.0588, | |
| "mean_token_accuracy": 0.715462589263916, | |
| "num_tokens": 115692616.0, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.5481357818586533, | |
| "grad_norm": 1.981183937899365, | |
| "learning_rate": 1.0087427253190775e-05, | |
| "loss": 1.0287, | |
| "mean_token_accuracy": 0.7187099099159241, | |
| "num_tokens": 116282977.0, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 0.5509181969949917, | |
| "grad_norm": 1.9187211997367468, | |
| "learning_rate": 9.990285738529733e-06, | |
| "loss": 1.0103, | |
| "mean_token_accuracy": 0.7224372506141663, | |
| "num_tokens": 116867356.0, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.55370061213133, | |
| "grad_norm": 1.876324102948862, | |
| "learning_rate": 9.89314514056627e-06, | |
| "loss": 0.9724, | |
| "mean_token_accuracy": 0.7300009608268738, | |
| "num_tokens": 117453088.0, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 0.5564830272676683, | |
| "grad_norm": 1.9623303147959466, | |
| "learning_rate": 9.79601462608595e-06, | |
| "loss": 1.0035, | |
| "mean_token_accuracy": 0.7247561693191529, | |
| "num_tokens": 118045362.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.5592654424040067, | |
| "grad_norm": 1.8686077447526968, | |
| "learning_rate": 9.698903360922773e-06, | |
| "loss": 0.9856, | |
| "mean_token_accuracy": 0.7283125519752502, | |
| "num_tokens": 118637830.0, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 0.562047857540345, | |
| "grad_norm": 1.8357018834213918, | |
| "learning_rate": 9.601820509094272e-06, | |
| "loss": 1.0289, | |
| "mean_token_accuracy": 0.7204028606414795, | |
| "num_tokens": 119229320.0, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.5648302726766834, | |
| "grad_norm": 2.0538909601653286, | |
| "learning_rate": 9.504775231936716e-06, | |
| "loss": 1.0498, | |
| "mean_token_accuracy": 0.7141813278198242, | |
| "num_tokens": 119821047.0, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 0.5676126878130217, | |
| "grad_norm": 1.836766812630116, | |
| "learning_rate": 9.407776687240591e-06, | |
| "loss": 0.9964, | |
| "mean_token_accuracy": 0.7254474043846131, | |
| "num_tokens": 120416538.0, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.57039510294936, | |
| "grad_norm": 1.9514390647117241, | |
| "learning_rate": 9.310834028386436e-06, | |
| "loss": 1.0173, | |
| "mean_token_accuracy": 0.7226798415184021, | |
| "num_tokens": 121007161.0, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 0.5731775180856984, | |
| "grad_norm": 1.9399928393210226, | |
| "learning_rate": 9.213956403481037e-06, | |
| "loss": 1.0142, | |
| "mean_token_accuracy": 0.7212600111961365, | |
| "num_tokens": 121598725.0, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.5759599332220368, | |
| "grad_norm": 1.9888989034840572, | |
| "learning_rate": 9.117152954494195e-06, | |
| "loss": 1.0328, | |
| "mean_token_accuracy": 0.7186324715614318, | |
| "num_tokens": 122188007.0, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 0.5787423483583751, | |
| "grad_norm": 1.9479652836008858, | |
| "learning_rate": 9.020432816395993e-06, | |
| "loss": 1.0293, | |
| "mean_token_accuracy": 0.7192287445068359, | |
| "num_tokens": 122775444.0, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.5815247634947134, | |
| "grad_norm": 1.868873876658722, | |
| "learning_rate": 8.92380511629481e-06, | |
| "loss": 1.0088, | |
| "mean_token_accuracy": 0.7221626877784729, | |
| "num_tokens": 123364954.0, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 0.5843071786310517, | |
| "grad_norm": 1.8460354821562948, | |
| "learning_rate": 8.827278972575984e-06, | |
| "loss": 1.0034, | |
| "mean_token_accuracy": 0.7228006601333619, | |
| "num_tokens": 123959095.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.5870895937673901, | |
| "grad_norm": 2.027141329127932, | |
| "learning_rate": 8.730863494041379e-06, | |
| "loss": 1.0222, | |
| "mean_token_accuracy": 0.7207066774368286, | |
| "num_tokens": 124551375.0, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 0.5898720089037285, | |
| "grad_norm": 1.9303209158495678, | |
| "learning_rate": 8.634567779049807e-06, | |
| "loss": 1.0136, | |
| "mean_token_accuracy": 0.7226389169692993, | |
| "num_tokens": 125148583.0, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.5926544240400667, | |
| "grad_norm": 1.821089900718507, | |
| "learning_rate": 8.538400914658456e-06, | |
| "loss": 1.0157, | |
| "mean_token_accuracy": 0.7223539471626281, | |
| "num_tokens": 125739350.0, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 0.5954368391764051, | |
| "grad_norm": 1.9402132874471187, | |
| "learning_rate": 8.442371975765368e-06, | |
| "loss": 1.0255, | |
| "mean_token_accuracy": 0.7197723150253296, | |
| "num_tokens": 126327360.0, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.5982192543127435, | |
| "grad_norm": 4.186389331333341, | |
| "learning_rate": 8.346490024253103e-06, | |
| "loss": 0.9985, | |
| "mean_token_accuracy": 0.7242988467216491, | |
| "num_tokens": 126919726.0, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 0.6010016694490818, | |
| "grad_norm": 1.8722288044640503, | |
| "learning_rate": 8.250764108133562e-06, | |
| "loss": 1.018, | |
| "mean_token_accuracy": 0.720951783657074, | |
| "num_tokens": 127503663.0, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.6037840845854201, | |
| "grad_norm": 1.9837555239288165, | |
| "learning_rate": 8.15520326069421e-06, | |
| "loss": 1.0133, | |
| "mean_token_accuracy": 0.7221065282821655, | |
| "num_tokens": 128092295.0, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 0.6065664997217585, | |
| "grad_norm": 1.918332101066207, | |
| "learning_rate": 8.05981649964559e-06, | |
| "loss": 1.0336, | |
| "mean_token_accuracy": 0.7192610502243042, | |
| "num_tokens": 128681634.0, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.6093489148580968, | |
| "grad_norm": 1.9154193774615031, | |
| "learning_rate": 7.964612826270399e-06, | |
| "loss": 0.9984, | |
| "mean_token_accuracy": 0.7253150701522827, | |
| "num_tokens": 129276945.0, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 0.6121313299944352, | |
| "grad_norm": 1.9833171612274754, | |
| "learning_rate": 7.86960122457404e-06, | |
| "loss": 1.0098, | |
| "mean_token_accuracy": 0.7225422620773315, | |
| "num_tokens": 129870803.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.6149137451307735, | |
| "grad_norm": 1.8771784670033254, | |
| "learning_rate": 7.774790660436857e-06, | |
| "loss": 1.0001, | |
| "mean_token_accuracy": 0.7242280602455139, | |
| "num_tokens": 130461085.0, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 0.6176961602671118, | |
| "grad_norm": 1.98853932669659, | |
| "learning_rate": 7.680190080768046e-06, | |
| "loss": 1.006, | |
| "mean_token_accuracy": 0.7234596967697143, | |
| "num_tokens": 131044253.0, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.6204785754034502, | |
| "grad_norm": 2.4167589390741946, | |
| "learning_rate": 7.585808412661379e-06, | |
| "loss": 1.0199, | |
| "mean_token_accuracy": 0.7218466520309448, | |
| "num_tokens": 131634142.0, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 0.6232609905397886, | |
| "grad_norm": 1.9611913130221166, | |
| "learning_rate": 7.4916545625527745e-06, | |
| "loss": 1.011, | |
| "mean_token_accuracy": 0.7240106225013733, | |
| "num_tokens": 132217401.0, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.6260434056761269, | |
| "grad_norm": 1.8664547397127202, | |
| "learning_rate": 7.397737415379853e-06, | |
| "loss": 1.0042, | |
| "mean_token_accuracy": 0.7248145937919617, | |
| "num_tokens": 132804730.0, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 0.6288258208124652, | |
| "grad_norm": 1.8854684297024977, | |
| "learning_rate": 7.304065833743475e-06, | |
| "loss": 1.0112, | |
| "mean_token_accuracy": 0.7220677971839905, | |
| "num_tokens": 133395672.0, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.6316082359488036, | |
| "grad_norm": 1.986610522875934, | |
| "learning_rate": 7.210648657071433e-06, | |
| "loss": 1.0152, | |
| "mean_token_accuracy": 0.7226180791854858, | |
| "num_tokens": 133987957.0, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 0.6343906510851419, | |
| "grad_norm": 1.889771442380467, | |
| "learning_rate": 7.117494700784292e-06, | |
| "loss": 0.9915, | |
| "mean_token_accuracy": 0.7284766793251037, | |
| "num_tokens": 134580628.0, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.6371730662214803, | |
| "grad_norm": 1.795583208948344, | |
| "learning_rate": 7.024612755463529e-06, | |
| "loss": 1.0106, | |
| "mean_token_accuracy": 0.7225217223167419, | |
| "num_tokens": 135175991.0, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 0.6399554813578185, | |
| "grad_norm": 1.9767456426052468, | |
| "learning_rate": 6.9320115860219705e-06, | |
| "loss": 1.005, | |
| "mean_token_accuracy": 0.724352490901947, | |
| "num_tokens": 135760748.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.6427378964941569, | |
| "grad_norm": 1.9522889123008382, | |
| "learning_rate": 6.839699930876727e-06, | |
| "loss": 1.0128, | |
| "mean_token_accuracy": 0.7235522747039795, | |
| "num_tokens": 136348202.0, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 0.6455203116304953, | |
| "grad_norm": 1.8485871557042115, | |
| "learning_rate": 6.747686501124531e-06, | |
| "loss": 1.0202, | |
| "mean_token_accuracy": 0.7193972945213318, | |
| "num_tokens": 136939858.0, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.6483027267668336, | |
| "grad_norm": 1.784737267536378, | |
| "learning_rate": 6.655979979719744e-06, | |
| "loss": 0.9938, | |
| "mean_token_accuracy": 0.7244254350662231, | |
| "num_tokens": 137528803.0, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 0.6510851419031719, | |
| "grad_norm": 1.8881790480821496, | |
| "learning_rate": 6.5645890206549566e-06, | |
| "loss": 0.974, | |
| "mean_token_accuracy": 0.7322948575019836, | |
| "num_tokens": 138114576.0, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.6538675570395103, | |
| "grad_norm": 1.941045031850279, | |
| "learning_rate": 6.473522248144359e-06, | |
| "loss": 0.9798, | |
| "mean_token_accuracy": 0.7303597807884217, | |
| "num_tokens": 138701429.0, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 0.6566499721758486, | |
| "grad_norm": 1.9164315660360134, | |
| "learning_rate": 6.382788255809893e-06, | |
| "loss": 1.0005, | |
| "mean_token_accuracy": 0.7247542023658753, | |
| "num_tokens": 139296441.0, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.659432387312187, | |
| "grad_norm": 1.9252747772897012, | |
| "learning_rate": 6.292395605870314e-06, | |
| "loss": 0.9935, | |
| "mean_token_accuracy": 0.727207088470459, | |
| "num_tokens": 139884765.0, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 0.6622148024485254, | |
| "grad_norm": 2.48708952288793, | |
| "learning_rate": 6.202352828333211e-06, | |
| "loss": 0.997, | |
| "mean_token_accuracy": 0.7260267257690429, | |
| "num_tokens": 140474793.0, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.6649972175848636, | |
| "grad_norm": 1.8703391327630177, | |
| "learning_rate": 6.112668420190042e-06, | |
| "loss": 0.9826, | |
| "mean_token_accuracy": 0.7283554911613465, | |
| "num_tokens": 141064904.0, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 0.667779632721202, | |
| "grad_norm": 1.9501052152673526, | |
| "learning_rate": 6.023350844614344e-06, | |
| "loss": 0.9763, | |
| "mean_token_accuracy": 0.7310232162475586, | |
| "num_tokens": 141649410.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.6705620478575404, | |
| "grad_norm": 1.9782655835974774, | |
| "learning_rate": 5.9344085301630425e-06, | |
| "loss": 1.003, | |
| "mean_token_accuracy": 0.723707640171051, | |
| "num_tokens": 142239282.0, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 0.6733444629938787, | |
| "grad_norm": 1.9354136592872768, | |
| "learning_rate": 5.845849869981137e-06, | |
| "loss": 1.0027, | |
| "mean_token_accuracy": 0.7262308835983277, | |
| "num_tokens": 142827280.0, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.676126878130217, | |
| "grad_norm": 1.8791345693920576, | |
| "learning_rate": 5.757683221009625e-06, | |
| "loss": 0.9975, | |
| "mean_token_accuracy": 0.7248466491699219, | |
| "num_tokens": 143422429.0, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 0.6789092932665554, | |
| "grad_norm": 1.8963640356605256, | |
| "learning_rate": 5.669916903196931e-06, | |
| "loss": 1.0014, | |
| "mean_token_accuracy": 0.7251011848449707, | |
| "num_tokens": 144009015.0, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.6816917084028937, | |
| "grad_norm": 1.7618216747620736, | |
| "learning_rate": 5.58255919871374e-06, | |
| "loss": 0.9848, | |
| "mean_token_accuracy": 0.7293275952339172, | |
| "num_tokens": 144602634.0, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 0.6844741235392321, | |
| "grad_norm": 1.8457721051471847, | |
| "learning_rate": 5.495618351171484e-06, | |
| "loss": 0.9919, | |
| "mean_token_accuracy": 0.7272073984146118, | |
| "num_tokens": 145196052.0, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.6872565386755703, | |
| "grad_norm": 1.9450479369664486, | |
| "learning_rate": 5.409102564844393e-06, | |
| "loss": 0.9938, | |
| "mean_token_accuracy": 0.7261118292808533, | |
| "num_tokens": 145794135.0, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 0.6900389538119087, | |
| "grad_norm": 1.8511319642398822, | |
| "learning_rate": 5.323020003895307e-06, | |
| "loss": 0.9484, | |
| "mean_token_accuracy": 0.7359282970428467, | |
| "num_tokens": 146384348.0, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.6928213689482471, | |
| "grad_norm": 1.7695297382973636, | |
| "learning_rate": 5.237378791605249e-06, | |
| "loss": 0.9638, | |
| "mean_token_accuracy": 0.7326830267906189, | |
| "num_tokens": 146981119.0, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 0.6956037840845855, | |
| "grad_norm": 1.9109931194680208, | |
| "learning_rate": 5.152187009606864e-06, | |
| "loss": 0.9878, | |
| "mean_token_accuracy": 0.7266369104385376, | |
| "num_tokens": 147573298.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.6983861992209237, | |
| "grad_norm": 1.8838124813014612, | |
| "learning_rate": 5.067452697121773e-06, | |
| "loss": 1.0136, | |
| "mean_token_accuracy": 0.7227142214775085, | |
| "num_tokens": 148166017.0, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 0.7011686143572621, | |
| "grad_norm": 1.8764858816220478, | |
| "learning_rate": 4.98318385020197e-06, | |
| "loss": 0.991, | |
| "mean_token_accuracy": 0.7256438136100769, | |
| "num_tokens": 148758203.0, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.7039510294936004, | |
| "grad_norm": 1.7836760728163532, | |
| "learning_rate": 4.8993884209752364e-06, | |
| "loss": 0.9776, | |
| "mean_token_accuracy": 0.728616988658905, | |
| "num_tokens": 149343772.0, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 0.7067334446299388, | |
| "grad_norm": 1.859071558512736, | |
| "learning_rate": 4.81607431689475e-06, | |
| "loss": 0.9859, | |
| "mean_token_accuracy": 0.7275610089302063, | |
| "num_tokens": 149934534.0, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.7095158597662772, | |
| "grad_norm": 1.830584410956007, | |
| "learning_rate": 4.7332493999928785e-06, | |
| "loss": 0.9997, | |
| "mean_token_accuracy": 0.7258034944534302, | |
| "num_tokens": 150528868.0, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 0.7122982749026154, | |
| "grad_norm": 1.7872234444186852, | |
| "learning_rate": 4.6509214861392785e-06, | |
| "loss": 0.9904, | |
| "mean_token_accuracy": 0.7283051371574402, | |
| "num_tokens": 151128370.0, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.7150806900389538, | |
| "grad_norm": 1.8707290023032752, | |
| "learning_rate": 4.569098344303319e-06, | |
| "loss": 0.9715, | |
| "mean_token_accuracy": 0.7312512874603272, | |
| "num_tokens": 151722014.0, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 0.7178631051752922, | |
| "grad_norm": 1.847939158663493, | |
| "learning_rate": 4.487787695820991e-06, | |
| "loss": 0.973, | |
| "mean_token_accuracy": 0.7308701038360595, | |
| "num_tokens": 152312667.0, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.7206455203116305, | |
| "grad_norm": 1.7516029476618973, | |
| "learning_rate": 4.406997213666236e-06, | |
| "loss": 0.9661, | |
| "mean_token_accuracy": 0.731387734413147, | |
| "num_tokens": 152899175.0, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 0.7234279354479688, | |
| "grad_norm": 1.8151365859715758, | |
| "learning_rate": 4.326734521726905e-06, | |
| "loss": 0.9563, | |
| "mean_token_accuracy": 0.7346587657928467, | |
| "num_tokens": 153488259.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.7262103505843072, | |
| "grad_norm": 1.7688011856056145, | |
| "learning_rate": 4.24700719408531e-06, | |
| "loss": 0.975, | |
| "mean_token_accuracy": 0.7301976919174195, | |
| "num_tokens": 154076796.0, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 0.7289927657206455, | |
| "grad_norm": 1.9580888383075816, | |
| "learning_rate": 4.167822754303493e-06, | |
| "loss": 0.9738, | |
| "mean_token_accuracy": 0.7310252785682678, | |
| "num_tokens": 154664728.0, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.7317751808569839, | |
| "grad_norm": 1.999126181586082, | |
| "learning_rate": 4.0891886747132356e-06, | |
| "loss": 0.9824, | |
| "mean_token_accuracy": 0.7299495816230774, | |
| "num_tokens": 155254919.0, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 0.7345575959933222, | |
| "grad_norm": 1.921167819518032, | |
| "learning_rate": 4.011112375710958e-06, | |
| "loss": 1.0045, | |
| "mean_token_accuracy": 0.7263089060783386, | |
| "num_tokens": 155842686.0, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.7373400111296605, | |
| "grad_norm": 1.801354649641667, | |
| "learning_rate": 3.933601225057446e-06, | |
| "loss": 0.9541, | |
| "mean_token_accuracy": 0.7346353769302368, | |
| "num_tokens": 156428772.0, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 0.7401224262659989, | |
| "grad_norm": 1.8122280326962623, | |
| "learning_rate": 3.85666253718263e-06, | |
| "loss": 0.9565, | |
| "mean_token_accuracy": 0.7328558325767517, | |
| "num_tokens": 157015422.0, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.7429048414023373, | |
| "grad_norm": 1.839585045567707, | |
| "learning_rate": 3.7803035724953007e-06, | |
| "loss": 0.9652, | |
| "mean_token_accuracy": 0.7333778142929077, | |
| "num_tokens": 157603116.0, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 0.7456872565386756, | |
| "grad_norm": 1.8056794531482636, | |
| "learning_rate": 3.704531536698012e-06, | |
| "loss": 0.9576, | |
| "mean_token_accuracy": 0.7345310568809509, | |
| "num_tokens": 158186881.0, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.7484696716750139, | |
| "grad_norm": 1.9143591141126923, | |
| "learning_rate": 3.6293535801070735e-06, | |
| "loss": 0.9709, | |
| "mean_token_accuracy": 0.7322964310646057, | |
| "num_tokens": 158782774.0, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 0.7512520868113522, | |
| "grad_norm": 1.777185833225788, | |
| "learning_rate": 3.5547767969778355e-06, | |
| "loss": 0.9892, | |
| "mean_token_accuracy": 0.7279403567314148, | |
| "num_tokens": 159372802.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.7540345019476906, | |
| "grad_norm": 1.9535914643310681, | |
| "learning_rate": 3.4808082248352058e-06, | |
| "loss": 0.9802, | |
| "mean_token_accuracy": 0.7304705739021301, | |
| "num_tokens": 159960156.0, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 0.756816917084029, | |
| "grad_norm": 1.8788792436096216, | |
| "learning_rate": 3.40745484380956e-06, | |
| "loss": 0.9821, | |
| "mean_token_accuracy": 0.7290140271186829, | |
| "num_tokens": 160545408.0, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.7595993322203672, | |
| "grad_norm": 1.927958774997016, | |
| "learning_rate": 3.3347235759780483e-06, | |
| "loss": 0.9752, | |
| "mean_token_accuracy": 0.731472396850586, | |
| "num_tokens": 161134387.0, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 0.7623817473567056, | |
| "grad_norm": 1.916500311041559, | |
| "learning_rate": 3.262621284711376e-06, | |
| "loss": 0.9846, | |
| "mean_token_accuracy": 0.729660439491272, | |
| "num_tokens": 161724072.0, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.765164162493044, | |
| "grad_norm": 1.7923444423002466, | |
| "learning_rate": 3.191154774026156e-06, | |
| "loss": 0.9655, | |
| "mean_token_accuracy": 0.7318884611129761, | |
| "num_tokens": 162310073.0, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 0.7679465776293823, | |
| "grad_norm": 1.7760349668951476, | |
| "learning_rate": 3.1203307879428146e-06, | |
| "loss": 0.9522, | |
| "mean_token_accuracy": 0.7352138042449952, | |
| "num_tokens": 162900817.0, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.7707289927657206, | |
| "grad_norm": 1.9573185565221094, | |
| "learning_rate": 3.0501560098492056e-06, | |
| "loss": 0.9476, | |
| "mean_token_accuracy": 0.7361976623535156, | |
| "num_tokens": 163488071.0, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 0.773511407902059, | |
| "grad_norm": 1.8472978388798753, | |
| "learning_rate": 2.9806370618699142e-06, | |
| "loss": 0.9599, | |
| "mean_token_accuracy": 0.7325201988220215, | |
| "num_tokens": 164076411.0, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.7762938230383973, | |
| "grad_norm": 1.9039958120991338, | |
| "learning_rate": 2.911780504241354e-06, | |
| "loss": 0.955, | |
| "mean_token_accuracy": 0.7342100620269776, | |
| "num_tokens": 164665515.0, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 0.7790762381747357, | |
| "grad_norm": 1.8467129585219402, | |
| "learning_rate": 2.8435928346926945e-06, | |
| "loss": 0.959, | |
| "mean_token_accuracy": 0.7346114397048951, | |
| "num_tokens": 165255953.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.781858653311074, | |
| "grad_norm": 2.0658334017918474, | |
| "learning_rate": 2.776080487832715e-06, | |
| "loss": 0.961, | |
| "mean_token_accuracy": 0.7332920074462891, | |
| "num_tokens": 165838622.0, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 0.7846410684474123, | |
| "grad_norm": 1.8352945141096326, | |
| "learning_rate": 2.70924983454257e-06, | |
| "loss": 0.9963, | |
| "mean_token_accuracy": 0.7269340515136719, | |
| "num_tokens": 166431603.0, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.7874234835837507, | |
| "grad_norm": 1.7741661715418184, | |
| "learning_rate": 2.6431071813746277e-06, | |
| "loss": 0.9548, | |
| "mean_token_accuracy": 0.7333246469497681, | |
| "num_tokens": 167020566.0, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 0.7902058987200891, | |
| "grad_norm": 1.7577119163466035, | |
| "learning_rate": 2.5776587699573007e-06, | |
| "loss": 0.9557, | |
| "mean_token_accuracy": 0.7359763622283936, | |
| "num_tokens": 167611779.0, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.7929883138564274, | |
| "grad_norm": 1.8883403863945059, | |
| "learning_rate": 2.512910776406089e-06, | |
| "loss": 0.9714, | |
| "mean_token_accuracy": 0.7312511920928955, | |
| "num_tokens": 168191251.0, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 0.7957707289927657, | |
| "grad_norm": 1.8710738581171973, | |
| "learning_rate": 2.4488693107407335e-06, | |
| "loss": 0.9731, | |
| "mean_token_accuracy": 0.7300806879997254, | |
| "num_tokens": 168782303.0, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.798553144129104, | |
| "grad_norm": 1.810175185890339, | |
| "learning_rate": 2.3855404163086558e-06, | |
| "loss": 0.9595, | |
| "mean_token_accuracy": 0.7339372992515564, | |
| "num_tokens": 169372790.0, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 0.8013355592654424, | |
| "grad_norm": 1.8716987813342199, | |
| "learning_rate": 2.322930069214664e-06, | |
| "loss": 0.9422, | |
| "mean_token_accuracy": 0.7376594424247742, | |
| "num_tokens": 169958372.0, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.8041179744017808, | |
| "grad_norm": 1.8055779362732807, | |
| "learning_rate": 2.2610441777570104e-06, | |
| "loss": 0.9713, | |
| "mean_token_accuracy": 0.7313427925109863, | |
| "num_tokens": 170547568.0, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 0.806900389538119, | |
| "grad_norm": 1.8191938807882237, | |
| "learning_rate": 2.1998885818698434e-06, | |
| "loss": 0.9395, | |
| "mean_token_accuracy": 0.7381924271583558, | |
| "num_tokens": 171132579.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.8096828046744574, | |
| "grad_norm": 1.889036188183445, | |
| "learning_rate": 2.1394690525721275e-06, | |
| "loss": 0.9744, | |
| "mean_token_accuracy": 0.7313727378845215, | |
| "num_tokens": 171722385.0, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 0.8124652198107958, | |
| "grad_norm": 1.8701292804037986, | |
| "learning_rate": 2.079791291423039e-06, | |
| "loss": 0.9786, | |
| "mean_token_accuracy": 0.729002046585083, | |
| "num_tokens": 172315922.0, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.8152476349471341, | |
| "grad_norm": 1.8760160382405797, | |
| "learning_rate": 2.0208609299839465e-06, | |
| "loss": 0.9683, | |
| "mean_token_accuracy": 0.7306602478027344, | |
| "num_tokens": 172910228.0, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 0.8180300500834724, | |
| "grad_norm": 2.129314039319898, | |
| "learning_rate": 1.962683529286973e-06, | |
| "loss": 0.9634, | |
| "mean_token_accuracy": 0.7342103958129883, | |
| "num_tokens": 173492796.0, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.8208124652198108, | |
| "grad_norm": 1.786710036467001, | |
| "learning_rate": 1.9052645793102277e-06, | |
| "loss": 0.9646, | |
| "mean_token_accuracy": 0.7335922002792359, | |
| "num_tokens": 174076770.0, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 0.8235948803561491, | |
| "grad_norm": 1.895632789397937, | |
| "learning_rate": 1.8486094984597268e-06, | |
| "loss": 1.0103, | |
| "mean_token_accuracy": 0.723214328289032, | |
| "num_tokens": 174666564.0, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.8263772954924875, | |
| "grad_norm": 1.8125039443565851, | |
| "learning_rate": 1.7927236330581e-06, | |
| "loss": 0.9504, | |
| "mean_token_accuracy": 0.7362190008163452, | |
| "num_tokens": 175250981.0, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 0.8291597106288259, | |
| "grad_norm": 1.722786992549355, | |
| "learning_rate": 1.7376122568400533e-06, | |
| "loss": 0.9499, | |
| "mean_token_accuracy": 0.7359644174575806, | |
| "num_tokens": 175846510.0, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.8319421257651641, | |
| "grad_norm": 1.841603530171618, | |
| "learning_rate": 1.6832805704547272e-06, | |
| "loss": 0.9551, | |
| "mean_token_accuracy": 0.7342963933944702, | |
| "num_tokens": 176432352.0, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 0.8347245409015025, | |
| "grad_norm": 1.9002675861540066, | |
| "learning_rate": 1.6297337009749249e-06, | |
| "loss": 0.9446, | |
| "mean_token_accuracy": 0.7374125838279724, | |
| "num_tokens": 177024825.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.8375069560378409, | |
| "grad_norm": 1.809765975130373, | |
| "learning_rate": 1.5769767014132885e-06, | |
| "loss": 0.9544, | |
| "mean_token_accuracy": 0.7355196237564087, | |
| "num_tokens": 177612725.0, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 0.8402893711741792, | |
| "grad_norm": 1.75809727968389, | |
| "learning_rate": 1.5250145502454594e-06, | |
| "loss": 0.9548, | |
| "mean_token_accuracy": 0.7356468796730041, | |
| "num_tokens": 178207999.0, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.8430717863105175, | |
| "grad_norm": 1.9477117802608452, | |
| "learning_rate": 1.473852150940297e-06, | |
| "loss": 0.9501, | |
| "mean_token_accuracy": 0.7353867173194886, | |
| "num_tokens": 178792546.0, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 0.8458542014468559, | |
| "grad_norm": 1.8675907748201204, | |
| "learning_rate": 1.4234943314971328e-06, | |
| "loss": 0.9472, | |
| "mean_token_accuracy": 0.7378309011459351, | |
| "num_tokens": 179380874.0, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.8486366165831942, | |
| "grad_norm": 1.9839427192393002, | |
| "learning_rate": 1.373945843990192e-06, | |
| "loss": 0.9686, | |
| "mean_token_accuracy": 0.7325679302215576, | |
| "num_tokens": 179970205.0, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 0.8514190317195326, | |
| "grad_norm": 1.8714168212459494, | |
| "learning_rate": 1.3252113641201537e-06, | |
| "loss": 0.9532, | |
| "mean_token_accuracy": 0.7361051917076111, | |
| "num_tokens": 180566757.0, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.8542014468558708, | |
| "grad_norm": 2.088368062803997, | |
| "learning_rate": 1.2772954907729074e-06, | |
| "loss": 0.9185, | |
| "mean_token_accuracy": 0.7416197896003723, | |
| "num_tokens": 181156035.0, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 0.8569838619922092, | |
| "grad_norm": 1.9320065667167445, | |
| "learning_rate": 1.2302027455855969e-06, | |
| "loss": 0.9452, | |
| "mean_token_accuracy": 0.736557149887085, | |
| "num_tokens": 181740790.0, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.8597662771285476, | |
| "grad_norm": 1.8430967747183953, | |
| "learning_rate": 1.1839375725199098e-06, | |
| "loss": 0.9541, | |
| "mean_token_accuracy": 0.7358271360397339, | |
| "num_tokens": 182328713.0, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 0.862548692264886, | |
| "grad_norm": 1.9004590415789702, | |
| "learning_rate": 1.1385043374427341e-06, | |
| "loss": 0.9663, | |
| "mean_token_accuracy": 0.731933867931366, | |
| "num_tokens": 182919104.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.8653311074012242, | |
| "grad_norm": 1.738836286774547, | |
| "learning_rate": 1.0939073277141598e-06, | |
| "loss": 0.9462, | |
| "mean_token_accuracy": 0.737112843990326, | |
| "num_tokens": 183507931.0, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 0.8681135225375626, | |
| "grad_norm": 1.929598189537243, | |
| "learning_rate": 1.0501507517829012e-06, | |
| "loss": 0.9662, | |
| "mean_token_accuracy": 0.7332514524459839, | |
| "num_tokens": 184093155.0, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.8708959376739009, | |
| "grad_norm": 1.8032404204598453, | |
| "learning_rate": 1.0072387387891535e-06, | |
| "loss": 0.941, | |
| "mean_token_accuracy": 0.7367923140525818, | |
| "num_tokens": 184680915.0, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 0.8736783528102393, | |
| "grad_norm": 1.8432734129518489, | |
| "learning_rate": 9.65175338174954e-07, | |
| "loss": 0.9615, | |
| "mean_token_accuracy": 0.7350999474525451, | |
| "num_tokens": 185272303.0, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.8764607679465777, | |
| "grad_norm": 1.9082038654252018, | |
| "learning_rate": 9.239645193020386e-07, | |
| "loss": 0.969, | |
| "mean_token_accuracy": 0.7324134349822998, | |
| "num_tokens": 185865698.0, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 0.8792431830829159, | |
| "grad_norm": 1.8191308709884229, | |
| "learning_rate": 8.836101710772826e-07, | |
| "loss": 0.9429, | |
| "mean_token_accuracy": 0.7369024634361268, | |
| "num_tokens": 186455776.0, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.8820255982192543, | |
| "grad_norm": 1.8602027757002002, | |
| "learning_rate": 8.441161015857092e-07, | |
| "loss": 0.9621, | |
| "mean_token_accuracy": 0.7330835700035095, | |
| "num_tokens": 187049436.0, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 0.8848080133555927, | |
| "grad_norm": 1.8629150091039206, | |
| "learning_rate": 8.054860377311368e-07, | |
| "loss": 0.9632, | |
| "mean_token_accuracy": 0.7352335929870606, | |
| "num_tokens": 187643221.0, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.887590428491931, | |
| "grad_norm": 1.7199187404593725, | |
| "learning_rate": 7.677236248844855e-07, | |
| "loss": 0.9208, | |
| "mean_token_accuracy": 0.7412317156791687, | |
| "num_tokens": 188241695.0, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 0.8903728436282693, | |
| "grad_norm": 1.9691877660185322, | |
| "learning_rate": 7.308324265397837e-07, | |
| "loss": 0.9454, | |
| "mean_token_accuracy": 0.7370688557624817, | |
| "num_tokens": 188836037.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.8931552587646077, | |
| "grad_norm": 1.9408377445794653, | |
| "learning_rate": 6.948159239778829e-07, | |
| "loss": 0.9529, | |
| "mean_token_accuracy": 0.7338770508766175, | |
| "num_tokens": 189432028.0, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 0.895937673900946, | |
| "grad_norm": 1.7165506466090106, | |
| "learning_rate": 6.596775159379543e-07, | |
| "loss": 0.9539, | |
| "mean_token_accuracy": 0.7329376816749573, | |
| "num_tokens": 190027436.0, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.8987200890372844, | |
| "grad_norm": 1.8489886082758764, | |
| "learning_rate": 6.254205182967566e-07, | |
| "loss": 0.9827, | |
| "mean_token_accuracy": 0.7286684274673462, | |
| "num_tokens": 190619847.0, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 0.9015025041736227, | |
| "grad_norm": 1.8671715601161172, | |
| "learning_rate": 5.920481637557318e-07, | |
| "loss": 0.9519, | |
| "mean_token_accuracy": 0.7349419355392456, | |
| "num_tokens": 191212681.0, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.904284919309961, | |
| "grad_norm": 1.9345594767714265, | |
| "learning_rate": 5.59563601535943e-07, | |
| "loss": 0.9273, | |
| "mean_token_accuracy": 0.7412004351615906, | |
| "num_tokens": 191805794.0, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 0.9070673344462994, | |
| "grad_norm": 1.8551965637564125, | |
| "learning_rate": 5.279698970809011e-07, | |
| "loss": 0.9414, | |
| "mean_token_accuracy": 0.7379236817359924, | |
| "num_tokens": 192402500.0, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.9098497495826378, | |
| "grad_norm": 1.7354354682366766, | |
| "learning_rate": 4.972700317672829e-07, | |
| "loss": 0.9497, | |
| "mean_token_accuracy": 0.7355363965034485, | |
| "num_tokens": 192989888.0, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 0.9126321647189761, | |
| "grad_norm": 1.853770597272773, | |
| "learning_rate": 4.674669026236045e-07, | |
| "loss": 0.9457, | |
| "mean_token_accuracy": 0.7373492479324341, | |
| "num_tokens": 193579779.0, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.9154145798553144, | |
| "grad_norm": 1.8911816701117188, | |
| "learning_rate": 4.385633220568186e-07, | |
| "loss": 0.9575, | |
| "mean_token_accuracy": 0.7329700469970704, | |
| "num_tokens": 194172425.0, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 0.9181969949916527, | |
| "grad_norm": 1.810081275671363, | |
| "learning_rate": 4.1056201758693957e-07, | |
| "loss": 0.9497, | |
| "mean_token_accuracy": 0.7358971953392028, | |
| "num_tokens": 194760384.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.9209794101279911, | |
| "grad_norm": 1.8068796359631913, | |
| "learning_rate": 3.834656315896379e-07, | |
| "loss": 0.9349, | |
| "mean_token_accuracy": 0.7383142828941345, | |
| "num_tokens": 195354279.0, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 0.9237618252643295, | |
| "grad_norm": 1.7877479581883122, | |
| "learning_rate": 3.572767210469086e-07, | |
| "loss": 0.9418, | |
| "mean_token_accuracy": 0.7375067234039306, | |
| "num_tokens": 195940416.0, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.9265442404006677, | |
| "grad_norm": 1.8061020929645715, | |
| "learning_rate": 3.319977573057642e-07, | |
| "loss": 0.9361, | |
| "mean_token_accuracy": 0.7379802227020263, | |
| "num_tokens": 196528933.0, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 0.9293266555370061, | |
| "grad_norm": 1.7940687494892404, | |
| "learning_rate": 3.0763112584503264e-07, | |
| "loss": 0.9441, | |
| "mean_token_accuracy": 0.7372017502784729, | |
| "num_tokens": 197115460.0, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.9321090706733445, | |
| "grad_norm": 1.7726872041826656, | |
| "learning_rate": 2.841791260502402e-07, | |
| "loss": 0.9618, | |
| "mean_token_accuracy": 0.7329637885093689, | |
| "num_tokens": 197705685.0, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 0.9348914858096828, | |
| "grad_norm": 2.1180868450006214, | |
| "learning_rate": 2.6164397099663676e-07, | |
| "loss": 0.9656, | |
| "mean_token_accuracy": 0.7338227391242981, | |
| "num_tokens": 198291195.0, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.9376739009460211, | |
| "grad_norm": 1.875515474041982, | |
| "learning_rate": 2.4002778724034447e-07, | |
| "loss": 0.9543, | |
| "mean_token_accuracy": 0.7351009726524353, | |
| "num_tokens": 198877612.0, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 0.9404563160823595, | |
| "grad_norm": 1.8136225504642058, | |
| "learning_rate": 2.1933261461769772e-07, | |
| "loss": 0.9181, | |
| "mean_token_accuracy": 0.7414175033569336, | |
| "num_tokens": 199453234.0, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.9432387312186978, | |
| "grad_norm": 1.7835014898755626, | |
| "learning_rate": 1.9956040605273784e-07, | |
| "loss": 0.9749, | |
| "mean_token_accuracy": 0.730805778503418, | |
| "num_tokens": 200046406.0, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 0.9460211463550362, | |
| "grad_norm": 1.7517370910691499, | |
| "learning_rate": 1.8071302737293294e-07, | |
| "loss": 0.9323, | |
| "mean_token_accuracy": 0.7374993085861206, | |
| "num_tokens": 200635906.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.9488035614913745, | |
| "grad_norm": 1.8527935092393206, | |
| "learning_rate": 1.6279225713310088e-07, | |
| "loss": 0.9295, | |
| "mean_token_accuracy": 0.7385197877883911, | |
| "num_tokens": 201229877.0, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 0.9515859766277128, | |
| "grad_norm": 1.8136557164407798, | |
| "learning_rate": 1.4579978644757463e-07, | |
| "loss": 0.9471, | |
| "mean_token_accuracy": 0.7368571162223816, | |
| "num_tokens": 201819996.0, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.9543683917640512, | |
| "grad_norm": 1.9202219251983068, | |
| "learning_rate": 1.297372188306234e-07, | |
| "loss": 0.9695, | |
| "mean_token_accuracy": 0.7313903212547302, | |
| "num_tokens": 202413015.0, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 0.9571508069003896, | |
| "grad_norm": 1.9683140510809376, | |
| "learning_rate": 1.1460607004512681e-07, | |
| "loss": 0.9575, | |
| "mean_token_accuracy": 0.734274709224701, | |
| "num_tokens": 203004551.0, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.9599332220367279, | |
| "grad_norm": 1.9014384654161847, | |
| "learning_rate": 1.004077679595472e-07, | |
| "loss": 0.9535, | |
| "mean_token_accuracy": 0.7355141043663025, | |
| "num_tokens": 203598341.0, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 0.9627156371730662, | |
| "grad_norm": 1.9286046696281232, | |
| "learning_rate": 8.714365241318079e-08, | |
| "loss": 0.9554, | |
| "mean_token_accuracy": 0.7344950199127197, | |
| "num_tokens": 204184977.0, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.9654980523094046, | |
| "grad_norm": 1.848637459602703, | |
| "learning_rate": 7.481497508972313e-08, | |
| "loss": 0.9495, | |
| "mean_token_accuracy": 0.7371500611305237, | |
| "num_tokens": 204771217.0, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 0.9682804674457429, | |
| "grad_norm": 1.8523905866159716, | |
| "learning_rate": 6.342289939915369e-08, | |
| "loss": 0.9586, | |
| "mean_token_accuracy": 0.7342531204223632, | |
| "num_tokens": 205355942.0, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.9710628825820813, | |
| "grad_norm": 1.7373507952790055, | |
| "learning_rate": 5.2968500367951425e-08, | |
| "loss": 0.9239, | |
| "mean_token_accuracy": 0.7412109613418579, | |
| "num_tokens": 205948315.0, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 0.9738452977184195, | |
| "grad_norm": 1.7596019981079423, | |
| "learning_rate": 4.345276453764258e-08, | |
| "loss": 0.9212, | |
| "mean_token_accuracy": 0.7406978607177734, | |
| "num_tokens": 206543533.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.9766277128547579, | |
| "grad_norm": 1.9410970916478685, | |
| "learning_rate": 3.487658987171294e-08, | |
| "loss": 0.9673, | |
| "mean_token_accuracy": 0.732921814918518, | |
| "num_tokens": 207131317.0, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 0.9794101279910963, | |
| "grad_norm": 1.8954966093705055, | |
| "learning_rate": 2.724078567086119e-08, | |
| "loss": 0.961, | |
| "mean_token_accuracy": 0.733088493347168, | |
| "num_tokens": 207715056.0, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.9821925431274346, | |
| "grad_norm": 1.9018642175991762, | |
| "learning_rate": 2.054607249663665e-08, | |
| "loss": 0.9794, | |
| "mean_token_accuracy": 0.7320362687110901, | |
| "num_tokens": 208303816.0, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 0.9849749582637729, | |
| "grad_norm": 1.7650786536070042, | |
| "learning_rate": 1.4793082103435885e-08, | |
| "loss": 0.9314, | |
| "mean_token_accuracy": 0.739205515384674, | |
| "num_tokens": 208893950.0, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.9877573734001113, | |
| "grad_norm": 1.7266587239672113, | |
| "learning_rate": 9.982357378891528e-09, | |
| "loss": 0.9607, | |
| "mean_token_accuracy": 0.7332687497138977, | |
| "num_tokens": 209482604.0, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 0.9905397885364496, | |
| "grad_norm": 1.8691016007630499, | |
| "learning_rate": 6.114352292639902e-09, | |
| "loss": 0.9559, | |
| "mean_token_accuracy": 0.734725546836853, | |
| "num_tokens": 210067242.0, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.993322203672788, | |
| "grad_norm": 1.6438880148449324, | |
| "learning_rate": 3.1894318534819725e-09, | |
| "loss": 0.9707, | |
| "mean_token_accuracy": 0.7310842633247375, | |
| "num_tokens": 210660656.0, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 0.9961046188091264, | |
| "grad_norm": 1.9330784206590557, | |
| "learning_rate": 1.2078720749364447e-09, | |
| "loss": 0.9491, | |
| "mean_token_accuracy": 0.7360571622848511, | |
| "num_tokens": 211243204.0, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.9988870339454646, | |
| "grad_norm": 1.8314428771696998, | |
| "learning_rate": 1.69859949198381e-10, | |
| "loss": 0.9316, | |
| "mean_token_accuracy": 0.7387963652610778, | |
| "num_tokens": 211840329.0, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.9461386799812317, | |
| "eval_mean_token_accuracy": 0.7361341528594494, | |
| "eval_num_tokens": 212075689.0, | |
| "eval_runtime": 4.8216, | |
| "eval_samples_per_second": 207.398, | |
| "eval_steps_per_second": 3.318, | |
| "step": 1797 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 1797, | |
| "total_flos": 376255241256960.0, | |
| "train_loss": 1.0248491220097444, | |
| "train_runtime": 4390.0943, | |
| "train_samples_per_second": 52.385, | |
| "train_steps_per_second": 0.409 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1797, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 376255241256960.0, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |