Instructions to use minpeter/pretrain with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use minpeter/pretrain with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="minpeter/pretrain")

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("minpeter/pretrain")
model = AutoModelForCausalLM.from_pretrained("minpeter/pretrain")
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use minpeter/pretrain with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "minpeter/pretrain"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "minpeter/pretrain",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
Use Docker
docker model run hf.co/minpeter/pretrain
- SGLang
How to use minpeter/pretrain with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "minpeter/pretrain" \
  --host 0.0.0.0 \
  --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "minpeter/pretrain",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "minpeter/pretrain" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "minpeter/pretrain",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
- Docker Model Runner
How to use minpeter/pretrain with Docker Model Runner:
docker model run hf.co/minpeter/pretrain
| { | |
| "best_global_step": 2000, | |
| "best_metric": 9.218317031860352, | |
| "best_model_checkpoint": "./artifacts/models/base-250725-test/checkpoint-2000", | |
| "epoch": 0.06870834556550091, | |
| "eval_steps": 1000, | |
| "global_step": 22000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 7.807766541534195e-05, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 1.499063085571518e-06, | |
| "loss": 10.8863, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.0001561553308306839, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 3.0605871330418487e-06, | |
| "loss": 10.8814, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.00023423299624602585, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 4.62211118051218e-06, | |
| "loss": 10.883, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.0003123106616613678, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 6.183635227982511e-06, | |
| "loss": 10.8828, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.00039038832707670977, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 7.745159275452842e-06, | |
| "loss": 10.8834, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.0004684659924920517, | |
| "grad_norm": 2.125, | |
| "learning_rate": 9.306683322923173e-06, | |
| "loss": 10.8796, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.0005465436579073936, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 1.0868207370393504e-05, | |
| "loss": 10.8798, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.0006246213233227356, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 1.2429731417863835e-05, | |
| "loss": 10.8764, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.0007026989887380775, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 1.3991255465334166e-05, | |
| "loss": 10.8779, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.0007807766541534195, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 1.5552779512804497e-05, | |
| "loss": 10.8715, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.0008588543195687615, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 1.7114303560274827e-05, | |
| "loss": 10.8676, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.0009369319849841034, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 1.867582760774516e-05, | |
| "loss": 10.8621, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.0010150096503994453, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 2.0237351655215492e-05, | |
| "loss": 10.8566, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.0010930873158147873, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 2.179887570268582e-05, | |
| "loss": 10.8515, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.0011711649812301292, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 2.3360399750156154e-05, | |
| "loss": 10.8445, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.0012492426466454711, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 2.4921923797626483e-05, | |
| "loss": 10.8383, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.001327320312060813, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 2.6483447845096816e-05, | |
| "loss": 10.8244, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.001405397977476155, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 2.804497189256715e-05, | |
| "loss": 10.8193, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.001483475642891497, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 2.9606495940037475e-05, | |
| "loss": 10.7992, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.001561553308306839, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 3.116801998750781e-05, | |
| "loss": 10.7987, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.001639630973722181, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 3.272954403497814e-05, | |
| "loss": 10.7803, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.001717708639137523, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 3.429106808244847e-05, | |
| "loss": 10.7653, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.0017957863045528649, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 3.58525921299188e-05, | |
| "loss": 10.745, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.0018738639699682068, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 3.741411617738913e-05, | |
| "loss": 10.7327, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.0019519416353835487, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 3.897564022485946e-05, | |
| "loss": 10.7159, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.0020300193007988907, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 4.053716427232979e-05, | |
| "loss": 10.6931, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.0021080969662142326, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 4.2098688319800126e-05, | |
| "loss": 10.6688, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.0021861746316295745, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 4.3660212367270456e-05, | |
| "loss": 10.6408, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.0022642522970449165, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 4.522173641474079e-05, | |
| "loss": 10.63, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.0023423299624602584, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 4.678326046221112e-05, | |
| "loss": 10.6057, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.0024204076278756003, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 4.834478450968145e-05, | |
| "loss": 10.5781, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.0024984852932909423, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 4.990630855715178e-05, | |
| "loss": 10.5501, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.002576562958706284, | |
| "grad_norm": 3.171875, | |
| "learning_rate": 5.1467832604622116e-05, | |
| "loss": 10.5076, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.002654640624121626, | |
| "grad_norm": 3.078125, | |
| "learning_rate": 5.3029356652092445e-05, | |
| "loss": 10.477, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.002732718289536968, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 5.4590880699562774e-05, | |
| "loss": 10.4528, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.00281079595495231, | |
| "grad_norm": 2.90625, | |
| "learning_rate": 5.6152404747033104e-05, | |
| "loss": 10.4192, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.002888873620367652, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 5.771392879450343e-05, | |
| "loss": 10.3672, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 0.002966951285782994, | |
| "grad_norm": 3.078125, | |
| "learning_rate": 5.927545284197377e-05, | |
| "loss": 10.3219, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.003045028951198336, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 6.08369768894441e-05, | |
| "loss": 10.3154, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 0.003123106616613678, | |
| "grad_norm": 3.125, | |
| "learning_rate": 6.239850093691443e-05, | |
| "loss": 10.2594, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.003123106616613678, | |
| "eval_loss": 10.227066993713379, | |
| "eval_runtime": 102.2402, | |
| "eval_samples_per_second": 50.89, | |
| "eval_steps_per_second": 3.189, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.00320118428202902, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 6.396002498438476e-05, | |
| "loss": 10.2088, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 0.003279261947444362, | |
| "grad_norm": 2.921875, | |
| "learning_rate": 6.552154903185509e-05, | |
| "loss": 10.1806, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.003357339612859704, | |
| "grad_norm": 3.015625, | |
| "learning_rate": 6.708307307932544e-05, | |
| "loss": 10.1161, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 0.003435417278275046, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 6.864459712679575e-05, | |
| "loss": 10.0794, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.003513494943690388, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 7.020612117426608e-05, | |
| "loss": 10.0233, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 0.0035915726091057297, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 7.176764522173641e-05, | |
| "loss": 9.9989, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.0036696502745210717, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 7.332916926920674e-05, | |
| "loss": 9.9246, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 0.0037477279399364136, | |
| "grad_norm": 2.0, | |
| "learning_rate": 7.489069331667708e-05, | |
| "loss": 9.8812, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.0038258056053517555, | |
| "grad_norm": 1.875, | |
| "learning_rate": 7.645221736414741e-05, | |
| "loss": 9.8312, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 0.0039038832707670975, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 7.801374141161774e-05, | |
| "loss": 9.8123, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.003981960936182439, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 7.957526545908807e-05, | |
| "loss": 9.7681, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 0.004060038601597781, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 8.113678950655841e-05, | |
| "loss": 9.7618, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.004138116267013123, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 8.269831355402874e-05, | |
| "loss": 9.6876, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 0.004216193932428465, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 8.425983760149906e-05, | |
| "loss": 9.689, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.004294271597843807, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 8.582136164896939e-05, | |
| "loss": 9.6471, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 0.004372349263259149, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 8.738288569643972e-05, | |
| "loss": 9.6163, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.004450426928674491, | |
| "grad_norm": 1.3515625, | |
| "learning_rate": 8.894440974391006e-05, | |
| "loss": 9.5749, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 0.004528504594089833, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 9.050593379138039e-05, | |
| "loss": 9.5571, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.004606582259505175, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 9.206745783885072e-05, | |
| "loss": 9.539, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 0.004684659924920517, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 9.362898188632105e-05, | |
| "loss": 9.5223, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.004762737590335859, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 9.519050593379139e-05, | |
| "loss": 9.4832, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 0.004840815255751201, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 9.675202998126172e-05, | |
| "loss": 9.4502, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.004918892921166543, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 9.831355402873205e-05, | |
| "loss": 9.4381, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 0.0049969705865818845, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 9.987507807620237e-05, | |
| "loss": 9.4403, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.0050750482519972264, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 0.0001014366021236727, | |
| "loss": 9.4185, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 0.005153125917412568, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 0.00010299812617114304, | |
| "loss": 9.4044, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.00523120358282791, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 0.00010455965021861337, | |
| "loss": 9.3678, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 0.005309281248243252, | |
| "grad_norm": 1.4609375, | |
| "learning_rate": 0.0001061211742660837, | |
| "loss": 9.3262, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.005387358913658594, | |
| "grad_norm": 1.4296875, | |
| "learning_rate": 0.00010768269831355403, | |
| "loss": 9.365, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 0.005465436579073936, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 0.00010924422236102437, | |
| "loss": 9.319, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.005543514244489278, | |
| "grad_norm": 1.4296875, | |
| "learning_rate": 0.0001108057464084947, | |
| "loss": 9.3204, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 0.00562159190990462, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 0.00011236727045596503, | |
| "loss": 9.2825, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.005699669575319962, | |
| "grad_norm": 1.390625, | |
| "learning_rate": 0.00011392879450343536, | |
| "loss": 9.2735, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 0.005777747240735304, | |
| "grad_norm": 1.359375, | |
| "learning_rate": 0.0001154903185509057, | |
| "loss": 9.264, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.005855824906150646, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 0.00011705184259837602, | |
| "loss": 9.263, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 0.005933902571565988, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 0.00011861336664584634, | |
| "loss": 9.2572, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.0060119802369813305, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 0.00012017489069331667, | |
| "loss": 9.2461, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 0.006090057902396672, | |
| "grad_norm": 1.53125, | |
| "learning_rate": 0.000121736414740787, | |
| "loss": 9.2345, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.006168135567812014, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 0.00012329793878825736, | |
| "loss": 9.1947, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 0.006246213233227356, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 0.00012485946283572768, | |
| "loss": 9.2535, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.006246213233227356, | |
| "eval_loss": 9.218317031860352, | |
| "eval_runtime": 102.1917, | |
| "eval_samples_per_second": 50.914, | |
| "eval_steps_per_second": 3.19, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.006324290898642698, | |
| "grad_norm": 1.625, | |
| "learning_rate": 0.000126420986883198, | |
| "loss": 9.2323, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 0.00640236856405804, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 0.00012798251093066833, | |
| "loss": 9.2067, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.006480446229473382, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 0.00012954403497813865, | |
| "loss": 9.2166, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 0.006558523894888724, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 0.000131105559025609, | |
| "loss": 9.2106, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.006636601560304066, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 0.0001326670830730793, | |
| "loss": 9.1922, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 0.006714679225719408, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 0.00013422860712054965, | |
| "loss": 9.1984, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.00679275689113475, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 0.00013579013116802, | |
| "loss": 9.1911, | |
| "step": 2175 | |
| }, | |
| { | |
| "epoch": 0.006870834556550092, | |
| "grad_norm": 2.53125, | |
| "learning_rate": 0.0001373516552154903, | |
| "loss": 9.1912, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.006948912221965434, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 0.00013891317926296065, | |
| "loss": 9.1964, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 0.007026989887380776, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 0.000140474703310431, | |
| "loss": 9.2259, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.0071050675527961175, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 0.0001420362273579013, | |
| "loss": 9.1996, | |
| "step": 2275 | |
| }, | |
| { | |
| "epoch": 0.0071831452182114595, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 0.00014359775140537165, | |
| "loss": 9.2401, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.007261222883626801, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 0.00014515927545284197, | |
| "loss": 9.2228, | |
| "step": 2325 | |
| }, | |
| { | |
| "epoch": 0.007339300549042143, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 0.0001467207995003123, | |
| "loss": 9.1847, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.007417378214457485, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 0.00014828232354778266, | |
| "loss": 9.1981, | |
| "step": 2375 | |
| }, | |
| { | |
| "epoch": 0.007495455879872827, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 0.00014984384759525297, | |
| "loss": 9.1894, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.007573533545288169, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 0.00015140537164272331, | |
| "loss": 9.2327, | |
| "step": 2425 | |
| }, | |
| { | |
| "epoch": 0.007651611210703511, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 0.00015296689569019363, | |
| "loss": 9.2134, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.007729688876118853, | |
| "grad_norm": 2.375, | |
| "learning_rate": 0.00015452841973766397, | |
| "loss": 9.2026, | |
| "step": 2475 | |
| }, | |
| { | |
| "epoch": 0.007807766541534195, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 0.00015608994378513432, | |
| "loss": 9.194, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.007885844206949537, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 0.0001576514678326046, | |
| "loss": 9.2258, | |
| "step": 2525 | |
| }, | |
| { | |
| "epoch": 0.007963921872364879, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 0.00015921299188007495, | |
| "loss": 9.2412, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.00804199953778022, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 0.0001607745159275453, | |
| "loss": 9.2441, | |
| "step": 2575 | |
| }, | |
| { | |
| "epoch": 0.008120077203195563, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 0.0001623360399750156, | |
| "loss": 9.25, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.008198154868610905, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 0.00016389756402248595, | |
| "loss": 9.2576, | |
| "step": 2625 | |
| }, | |
| { | |
| "epoch": 0.008276232534026247, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 0.00016545908806995626, | |
| "loss": 9.2549, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.008354310199441588, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 0.0001670206121174266, | |
| "loss": 9.2837, | |
| "step": 2675 | |
| }, | |
| { | |
| "epoch": 0.00843238786485693, | |
| "grad_norm": 2.96875, | |
| "learning_rate": 0.00016858213616489695, | |
| "loss": 9.2405, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.008510465530272272, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 0.00017014366021236727, | |
| "loss": 9.2927, | |
| "step": 2725 | |
| }, | |
| { | |
| "epoch": 0.008588543195687614, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 0.0001717051842598376, | |
| "loss": 9.2427, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.008666620861102956, | |
| "grad_norm": 3.0, | |
| "learning_rate": 0.00017326670830730792, | |
| "loss": 9.3099, | |
| "step": 2775 | |
| }, | |
| { | |
| "epoch": 0.008744698526518298, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 0.00017482823235477827, | |
| "loss": 9.2786, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.00882277619193364, | |
| "grad_norm": 3.875, | |
| "learning_rate": 0.0001763897564022486, | |
| "loss": 9.2615, | |
| "step": 2825 | |
| }, | |
| { | |
| "epoch": 0.008900853857348982, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 0.00017795128044971893, | |
| "loss": 9.3233, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.008978931522764324, | |
| "grad_norm": 3.359375, | |
| "learning_rate": 0.00017951280449718927, | |
| "loss": 9.2634, | |
| "step": 2875 | |
| }, | |
| { | |
| "epoch": 0.009057009188179666, | |
| "grad_norm": 3.171875, | |
| "learning_rate": 0.00018107432854465959, | |
| "loss": 9.3164, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.009135086853595008, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 0.00018263585259212993, | |
| "loss": 9.3274, | |
| "step": 2925 | |
| }, | |
| { | |
| "epoch": 0.00921316451901035, | |
| "grad_norm": 4.0, | |
| "learning_rate": 0.00018419737663960027, | |
| "loss": 9.3091, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.009291242184425692, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 0.0001857589006870706, | |
| "loss": 9.317, | |
| "step": 2975 | |
| }, | |
| { | |
| "epoch": 0.009369319849841034, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 0.00018732042473454093, | |
| "loss": 9.3963, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.009369319849841034, | |
| "eval_loss": 9.343441009521484, | |
| "eval_runtime": 102.2757, | |
| "eval_samples_per_second": 50.872, | |
| "eval_steps_per_second": 3.187, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.009447397515256375, | |
| "grad_norm": 3.671875, | |
| "learning_rate": 0.00018888194878201127, | |
| "loss": 9.3633, | |
| "step": 3025 | |
| }, | |
| { | |
| "epoch": 0.009525475180671717, | |
| "grad_norm": 3.125, | |
| "learning_rate": 0.00019044347282948156, | |
| "loss": 9.335, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.00960355284608706, | |
| "grad_norm": 3.140625, | |
| "learning_rate": 0.0001920049968769519, | |
| "loss": 9.3406, | |
| "step": 3075 | |
| }, | |
| { | |
| "epoch": 0.009681630511502401, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 0.00019356652092442222, | |
| "loss": 9.3829, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.009759708176917743, | |
| "grad_norm": 3.90625, | |
| "learning_rate": 0.00019512804497189256, | |
| "loss": 9.3734, | |
| "step": 3125 | |
| }, | |
| { | |
| "epoch": 0.009837785842333085, | |
| "grad_norm": 3.671875, | |
| "learning_rate": 0.0001966895690193629, | |
| "loss": 9.3608, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.009915863507748427, | |
| "grad_norm": 3.359375, | |
| "learning_rate": 0.00019825109306683322, | |
| "loss": 9.3771, | |
| "step": 3175 | |
| }, | |
| { | |
| "epoch": 0.009993941173163769, | |
| "grad_norm": 3.984375, | |
| "learning_rate": 0.00019981261711430356, | |
| "loss": 9.4023, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.010072018838579111, | |
| "grad_norm": 3.875, | |
| "learning_rate": 0.00020137414116177388, | |
| "loss": 9.4179, | |
| "step": 3225 | |
| }, | |
| { | |
| "epoch": 0.010150096503994453, | |
| "grad_norm": 3.765625, | |
| "learning_rate": 0.00020293566520924422, | |
| "loss": 9.407, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.010228174169409795, | |
| "grad_norm": 5.0, | |
| "learning_rate": 0.00020449718925671457, | |
| "loss": 9.4284, | |
| "step": 3275 | |
| }, | |
| { | |
| "epoch": 0.010306251834825137, | |
| "grad_norm": 3.84375, | |
| "learning_rate": 0.00020605871330418488, | |
| "loss": 9.4289, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.010384329500240479, | |
| "grad_norm": 3.90625, | |
| "learning_rate": 0.00020762023735165522, | |
| "loss": 9.409, | |
| "step": 3325 | |
| }, | |
| { | |
| "epoch": 0.01046240716565582, | |
| "grad_norm": 3.609375, | |
| "learning_rate": 0.00020918176139912557, | |
| "loss": 9.4343, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.010540484831071163, | |
| "grad_norm": 4.15625, | |
| "learning_rate": 0.00021074328544659588, | |
| "loss": 9.4513, | |
| "step": 3375 | |
| }, | |
| { | |
| "epoch": 0.010618562496486504, | |
| "grad_norm": 3.609375, | |
| "learning_rate": 0.00021230480949406623, | |
| "loss": 9.458, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.010696640161901846, | |
| "grad_norm": 4.4375, | |
| "learning_rate": 0.00021386633354153654, | |
| "loss": 9.5056, | |
| "step": 3425 | |
| }, | |
| { | |
| "epoch": 0.010774717827317188, | |
| "grad_norm": 3.859375, | |
| "learning_rate": 0.00021542785758900688, | |
| "loss": 9.4958, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.01085279549273253, | |
| "grad_norm": 3.96875, | |
| "learning_rate": 0.00021698938163647723, | |
| "loss": 9.5275, | |
| "step": 3475 | |
| }, | |
| { | |
| "epoch": 0.010930873158147872, | |
| "grad_norm": 4.46875, | |
| "learning_rate": 0.00021855090568394754, | |
| "loss": 9.4947, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.011008950823563214, | |
| "grad_norm": 4.8125, | |
| "learning_rate": 0.00022011242973141789, | |
| "loss": 9.5044, | |
| "step": 3525 | |
| }, | |
| { | |
| "epoch": 0.011087028488978556, | |
| "grad_norm": 4.5, | |
| "learning_rate": 0.00022167395377888817, | |
| "loss": 9.5233, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.011165106154393898, | |
| "grad_norm": 4.1875, | |
| "learning_rate": 0.00022323547782635852, | |
| "loss": 9.5455, | |
| "step": 3575 | |
| }, | |
| { | |
| "epoch": 0.01124318381980924, | |
| "grad_norm": 4.46875, | |
| "learning_rate": 0.00022479700187382886, | |
| "loss": 9.5962, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.011321261485224582, | |
| "grad_norm": 4.46875, | |
| "learning_rate": 0.00022635852592129918, | |
| "loss": 9.5476, | |
| "step": 3625 | |
| }, | |
| { | |
| "epoch": 0.011399339150639924, | |
| "grad_norm": 4.9375, | |
| "learning_rate": 0.00022792004996876952, | |
| "loss": 9.5762, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.011477416816055266, | |
| "grad_norm": 4.84375, | |
| "learning_rate": 0.00022948157401623983, | |
| "loss": 9.6066, | |
| "step": 3675 | |
| }, | |
| { | |
| "epoch": 0.011555494481470608, | |
| "grad_norm": 4.8125, | |
| "learning_rate": 0.00023104309806371018, | |
| "loss": 9.6445, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.01163357214688595, | |
| "grad_norm": 4.6875, | |
| "learning_rate": 0.00023260462211118052, | |
| "loss": 9.6033, | |
| "step": 3725 | |
| }, | |
| { | |
| "epoch": 0.011711649812301292, | |
| "grad_norm": 5.0, | |
| "learning_rate": 0.00023416614615865084, | |
| "loss": 9.6635, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.011789727477716633, | |
| "grad_norm": 4.46875, | |
| "learning_rate": 0.00023572767020612118, | |
| "loss": 9.6236, | |
| "step": 3775 | |
| }, | |
| { | |
| "epoch": 0.011867805143131975, | |
| "grad_norm": 5.28125, | |
| "learning_rate": 0.00023728919425359152, | |
| "loss": 9.6867, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.011945882808547317, | |
| "grad_norm": 4.5, | |
| "learning_rate": 0.00023885071830106184, | |
| "loss": 9.6757, | |
| "step": 3825 | |
| }, | |
| { | |
| "epoch": 0.012023960473962661, | |
| "grad_norm": 5.53125, | |
| "learning_rate": 0.00024041224234853218, | |
| "loss": 9.7058, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.012102038139378003, | |
| "grad_norm": 5.71875, | |
| "learning_rate": 0.0002419737663960025, | |
| "loss": 9.7057, | |
| "step": 3875 | |
| }, | |
| { | |
| "epoch": 0.012180115804793345, | |
| "grad_norm": 6.0, | |
| "learning_rate": 0.00024353529044347284, | |
| "loss": 9.7199, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.012258193470208687, | |
| "grad_norm": 5.78125, | |
| "learning_rate": 0.00024509681449094316, | |
| "loss": 9.7453, | |
| "step": 3925 | |
| }, | |
| { | |
| "epoch": 0.012336271135624029, | |
| "grad_norm": 4.96875, | |
| "learning_rate": 0.00024665833853841347, | |
| "loss": 9.7496, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.01241434880103937, | |
| "grad_norm": 4.34375, | |
| "learning_rate": 0.00024821986258588384, | |
| "loss": 9.7802, | |
| "step": 3975 | |
| }, | |
| { | |
| "epoch": 0.012492426466454713, | |
| "grad_norm": 5.5, | |
| "learning_rate": 0.00024978138663335416, | |
| "loss": 9.814, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.012492426466454713, | |
| "eval_loss": 9.791647911071777, | |
| "eval_runtime": 102.2247, | |
| "eval_samples_per_second": 50.898, | |
| "eval_steps_per_second": 3.189, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.012570504131870055, | |
| "grad_norm": 6.0625, | |
| "learning_rate": 0.00025134291068082447, | |
| "loss": 9.785, | |
| "step": 4025 | |
| }, | |
| { | |
| "epoch": 0.012648581797285396, | |
| "grad_norm": 4.875, | |
| "learning_rate": 0.00025290443472829484, | |
| "loss": 9.8043, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.012726659462700738, | |
| "grad_norm": 6.03125, | |
| "learning_rate": 0.00025446595877576516, | |
| "loss": 9.8206, | |
| "step": 4075 | |
| }, | |
| { | |
| "epoch": 0.01280473712811608, | |
| "grad_norm": 5.15625, | |
| "learning_rate": 0.0002560274828232355, | |
| "loss": 9.8267, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.012882814793531422, | |
| "grad_norm": 5.5625, | |
| "learning_rate": 0.0002575890068707058, | |
| "loss": 9.8286, | |
| "step": 4125 | |
| }, | |
| { | |
| "epoch": 0.012960892458946764, | |
| "grad_norm": 5.09375, | |
| "learning_rate": 0.00025915053091817616, | |
| "loss": 9.8346, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.013038970124362106, | |
| "grad_norm": 6.53125, | |
| "learning_rate": 0.0002607120549656465, | |
| "loss": 9.8788, | |
| "step": 4175 | |
| }, | |
| { | |
| "epoch": 0.013117047789777448, | |
| "grad_norm": 5.84375, | |
| "learning_rate": 0.00026227357901311685, | |
| "loss": 9.8549, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.01319512545519279, | |
| "grad_norm": 5.3125, | |
| "learning_rate": 0.0002638351030605871, | |
| "loss": 9.8965, | |
| "step": 4225 | |
| }, | |
| { | |
| "epoch": 0.013273203120608132, | |
| "grad_norm": 6.625, | |
| "learning_rate": 0.0002653966271080575, | |
| "loss": 9.936, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.013351280786023474, | |
| "grad_norm": 5.6875, | |
| "learning_rate": 0.0002669581511555278, | |
| "loss": 9.9048, | |
| "step": 4275 | |
| }, | |
| { | |
| "epoch": 0.013429358451438816, | |
| "grad_norm": 5.78125, | |
| "learning_rate": 0.00026851967520299816, | |
| "loss": 9.9304, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.013507436116854158, | |
| "grad_norm": 5.90625, | |
| "learning_rate": 0.0002700811992504685, | |
| "loss": 9.9668, | |
| "step": 4325 | |
| }, | |
| { | |
| "epoch": 0.0135855137822695, | |
| "grad_norm": 5.90625, | |
| "learning_rate": 0.0002716427232979388, | |
| "loss": 9.957, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.013663591447684842, | |
| "grad_norm": 6.4375, | |
| "learning_rate": 0.0002732042473454091, | |
| "loss": 9.9809, | |
| "step": 4375 | |
| }, | |
| { | |
| "epoch": 0.013741669113100183, | |
| "grad_norm": 6.96875, | |
| "learning_rate": 0.0002747657713928794, | |
| "loss": 9.9868, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.013819746778515525, | |
| "grad_norm": 5.71875, | |
| "learning_rate": 0.0002763272954403498, | |
| "loss": 10.0066, | |
| "step": 4425 | |
| }, | |
| { | |
| "epoch": 0.013897824443930867, | |
| "grad_norm": 5.46875, | |
| "learning_rate": 0.0002778888194878201, | |
| "loss": 9.9905, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.01397590210934621, | |
| "grad_norm": 5.59375, | |
| "learning_rate": 0.0002794503435352905, | |
| "loss": 10.0002, | |
| "step": 4475 | |
| }, | |
| { | |
| "epoch": 0.014053979774761551, | |
| "grad_norm": 7.5625, | |
| "learning_rate": 0.00028101186758276074, | |
| "loss": 10.0869, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.014132057440176893, | |
| "grad_norm": 5.75, | |
| "learning_rate": 0.0002825733916302311, | |
| "loss": 10.0828, | |
| "step": 4525 | |
| }, | |
| { | |
| "epoch": 0.014210135105592235, | |
| "grad_norm": 6.4375, | |
| "learning_rate": 0.00028413491567770143, | |
| "loss": 10.1158, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.014288212771007577, | |
| "grad_norm": 6.59375, | |
| "learning_rate": 0.0002856964397251718, | |
| "loss": 10.1618, | |
| "step": 4575 | |
| }, | |
| { | |
| "epoch": 0.014366290436422919, | |
| "grad_norm": 6.59375, | |
| "learning_rate": 0.0002872579637726421, | |
| "loss": 10.1651, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.014444368101838261, | |
| "grad_norm": 6.9375, | |
| "learning_rate": 0.00028881948782011243, | |
| "loss": 10.1786, | |
| "step": 4625 | |
| }, | |
| { | |
| "epoch": 0.014522445767253603, | |
| "grad_norm": 8.375, | |
| "learning_rate": 0.00029038101186758275, | |
| "loss": 10.1674, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.014600523432668945, | |
| "grad_norm": 7.625, | |
| "learning_rate": 0.0002919425359150531, | |
| "loss": 10.1869, | |
| "step": 4675 | |
| }, | |
| { | |
| "epoch": 0.014678601098084287, | |
| "grad_norm": 6.21875, | |
| "learning_rate": 0.00029350405996252343, | |
| "loss": 10.2085, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.014756678763499629, | |
| "grad_norm": 5.5625, | |
| "learning_rate": 0.0002950655840099938, | |
| "loss": 10.2231, | |
| "step": 4725 | |
| }, | |
| { | |
| "epoch": 0.01483475642891497, | |
| "grad_norm": 7.03125, | |
| "learning_rate": 0.00029662710805746406, | |
| "loss": 10.2671, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.014912834094330312, | |
| "grad_norm": 7.5625, | |
| "learning_rate": 0.00029818863210493443, | |
| "loss": 10.3177, | |
| "step": 4775 | |
| }, | |
| { | |
| "epoch": 0.014990911759745654, | |
| "grad_norm": 7.0, | |
| "learning_rate": 0.00029975015615240475, | |
| "loss": 10.3046, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.015068989425160996, | |
| "grad_norm": 8.125, | |
| "learning_rate": 0.0003013116801998751, | |
| "loss": 10.3212, | |
| "step": 4825 | |
| }, | |
| { | |
| "epoch": 0.015147067090576338, | |
| "grad_norm": 6.75, | |
| "learning_rate": 0.00030287320424734543, | |
| "loss": 10.2822, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.01522514475599168, | |
| "grad_norm": 7.03125, | |
| "learning_rate": 0.0003044347282948157, | |
| "loss": 10.3131, | |
| "step": 4875 | |
| }, | |
| { | |
| "epoch": 0.015303222421407022, | |
| "grad_norm": 7.1875, | |
| "learning_rate": 0.00030599625234228607, | |
| "loss": 10.3112, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.015381300086822364, | |
| "grad_norm": 8.3125, | |
| "learning_rate": 0.0003075577763897564, | |
| "loss": 10.3796, | |
| "step": 4925 | |
| }, | |
| { | |
| "epoch": 0.015459377752237706, | |
| "grad_norm": 6.8125, | |
| "learning_rate": 0.00030911930043722675, | |
| "loss": 10.4225, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.015537455417653048, | |
| "grad_norm": 6.28125, | |
| "learning_rate": 0.00031068082448469707, | |
| "loss": 10.435, | |
| "step": 4975 | |
| }, | |
| { | |
| "epoch": 0.01561553308306839, | |
| "grad_norm": 7.0625, | |
| "learning_rate": 0.0003122423485321674, | |
| "loss": 10.4029, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.01561553308306839, | |
| "eval_loss": 10.434911727905273, | |
| "eval_runtime": 102.2426, | |
| "eval_samples_per_second": 50.889, | |
| "eval_steps_per_second": 3.188, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.01569361074848373, | |
| "grad_norm": 7.21875, | |
| "learning_rate": 0.0003138038725796377, | |
| "loss": 10.4435, | |
| "step": 5025 | |
| }, | |
| { | |
| "epoch": 0.015771688413899074, | |
| "grad_norm": 6.53125, | |
| "learning_rate": 0.00031536539662710807, | |
| "loss": 10.4452, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.015849766079314414, | |
| "grad_norm": 6.40625, | |
| "learning_rate": 0.0003169269206745784, | |
| "loss": 10.4211, | |
| "step": 5075 | |
| }, | |
| { | |
| "epoch": 0.015927843744729758, | |
| "grad_norm": 8.25, | |
| "learning_rate": 0.00031848844472204876, | |
| "loss": 10.5069, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.0160059214101451, | |
| "grad_norm": 8.625, | |
| "learning_rate": 0.00032004996876951907, | |
| "loss": 10.5109, | |
| "step": 5125 | |
| }, | |
| { | |
| "epoch": 0.01608399907556044, | |
| "grad_norm": 7.6875, | |
| "learning_rate": 0.0003216114928169894, | |
| "loss": 10.5435, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.016162076740975785, | |
| "grad_norm": 7.46875, | |
| "learning_rate": 0.0003231730168644597, | |
| "loss": 10.5252, | |
| "step": 5175 | |
| }, | |
| { | |
| "epoch": 0.016240154406391125, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 0.00032473454091193007, | |
| "loss": 10.5791, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.01631823207180647, | |
| "grad_norm": 8.5625, | |
| "learning_rate": 0.0003262960649594004, | |
| "loss": 10.5973, | |
| "step": 5225 | |
| }, | |
| { | |
| "epoch": 0.01639630973722181, | |
| "grad_norm": 8.375, | |
| "learning_rate": 0.00032785758900687076, | |
| "loss": 10.6523, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.016474387402637153, | |
| "grad_norm": 7.375, | |
| "learning_rate": 0.000329419113054341, | |
| "loss": 10.5952, | |
| "step": 5275 | |
| }, | |
| { | |
| "epoch": 0.016552465068052493, | |
| "grad_norm": 6.40625, | |
| "learning_rate": 0.0003309806371018114, | |
| "loss": 10.6144, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.016630542733467837, | |
| "grad_norm": 7.34375, | |
| "learning_rate": 0.0003325421611492817, | |
| "loss": 10.6408, | |
| "step": 5325 | |
| }, | |
| { | |
| "epoch": 0.016708620398883177, | |
| "grad_norm": 9.125, | |
| "learning_rate": 0.0003341036851967521, | |
| "loss": 10.687, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.01678669806429852, | |
| "grad_norm": 8.375, | |
| "learning_rate": 0.0003356652092442224, | |
| "loss": 10.6634, | |
| "step": 5375 | |
| }, | |
| { | |
| "epoch": 0.01686477572971386, | |
| "grad_norm": 7.65625, | |
| "learning_rate": 0.00033722673329169265, | |
| "loss": 10.7616, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.016942853395129204, | |
| "grad_norm": 9.0, | |
| "learning_rate": 0.000338788257339163, | |
| "loss": 10.7149, | |
| "step": 5425 | |
| }, | |
| { | |
| "epoch": 0.017020931060544545, | |
| "grad_norm": 9.5, | |
| "learning_rate": 0.00034034978138663334, | |
| "loss": 10.7277, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.01709900872595989, | |
| "grad_norm": 8.625, | |
| "learning_rate": 0.0003419113054341037, | |
| "loss": 10.7305, | |
| "step": 5475 | |
| }, | |
| { | |
| "epoch": 0.01717708639137523, | |
| "grad_norm": 8.25, | |
| "learning_rate": 0.000343472829481574, | |
| "loss": 10.7721, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.017255164056790572, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 0.00034503435352904434, | |
| "loss": 10.832, | |
| "step": 5525 | |
| }, | |
| { | |
| "epoch": 0.017333241722205912, | |
| "grad_norm": 10.0, | |
| "learning_rate": 0.00034659587757651466, | |
| "loss": 10.8325, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.017411319387621256, | |
| "grad_norm": 9.125, | |
| "learning_rate": 0.000348157401623985, | |
| "loss": 10.8516, | |
| "step": 5575 | |
| }, | |
| { | |
| "epoch": 0.017489397053036596, | |
| "grad_norm": 9.125, | |
| "learning_rate": 0.00034971892567145534, | |
| "loss": 10.9156, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.01756747471845194, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 0.0003512804497189257, | |
| "loss": 10.9185, | |
| "step": 5625 | |
| }, | |
| { | |
| "epoch": 0.01764555238386728, | |
| "grad_norm": 10.4375, | |
| "learning_rate": 0.000352841973766396, | |
| "loss": 10.9967, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.017723630049282624, | |
| "grad_norm": 12.9375, | |
| "learning_rate": 0.00035440349781386634, | |
| "loss": 11.0233, | |
| "step": 5675 | |
| }, | |
| { | |
| "epoch": 0.017801707714697964, | |
| "grad_norm": 10.0, | |
| "learning_rate": 0.00035596502186133666, | |
| "loss": 11.0479, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.017879785380113308, | |
| "grad_norm": 9.6875, | |
| "learning_rate": 0.00035752654590880703, | |
| "loss": 10.9976, | |
| "step": 5725 | |
| }, | |
| { | |
| "epoch": 0.017957863045528648, | |
| "grad_norm": 8.5625, | |
| "learning_rate": 0.00035908806995627734, | |
| "loss": 11.0774, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.01803594071094399, | |
| "grad_norm": 10.125, | |
| "learning_rate": 0.00036064959400374766, | |
| "loss": 11.1011, | |
| "step": 5775 | |
| }, | |
| { | |
| "epoch": 0.01811401837635933, | |
| "grad_norm": 8.3125, | |
| "learning_rate": 0.000362211118051218, | |
| "loss": 11.087, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.018192096041774675, | |
| "grad_norm": 9.875, | |
| "learning_rate": 0.00036377264209868835, | |
| "loss": 11.1436, | |
| "step": 5825 | |
| }, | |
| { | |
| "epoch": 0.018270173707190016, | |
| "grad_norm": 8.75, | |
| "learning_rate": 0.00036533416614615866, | |
| "loss": 11.1463, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.01834825137260536, | |
| "grad_norm": 9.75, | |
| "learning_rate": 0.00036689569019362903, | |
| "loss": 11.1615, | |
| "step": 5875 | |
| }, | |
| { | |
| "epoch": 0.0184263290380207, | |
| "grad_norm": 10.4375, | |
| "learning_rate": 0.00036845721424109935, | |
| "loss": 11.1622, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.018504406703436043, | |
| "grad_norm": 9.1875, | |
| "learning_rate": 0.0003700187382885696, | |
| "loss": 11.2235, | |
| "step": 5925 | |
| }, | |
| { | |
| "epoch": 0.018582484368851383, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 0.00037158026233604, | |
| "loss": 11.2721, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 0.018660562034266727, | |
| "grad_norm": 8.625, | |
| "learning_rate": 0.0003731417863835103, | |
| "loss": 11.2218, | |
| "step": 5975 | |
| }, | |
| { | |
| "epoch": 0.018738639699682067, | |
| "grad_norm": 8.5, | |
| "learning_rate": 0.00037470331043098067, | |
| "loss": 11.2897, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.018738639699682067, | |
| "eval_loss": 11.259696960449219, | |
| "eval_runtime": 102.0975, | |
| "eval_samples_per_second": 50.961, | |
| "eval_steps_per_second": 3.193, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.01881671736509741, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 0.000376264834478451, | |
| "loss": 11.2667, | |
| "step": 6025 | |
| }, | |
| { | |
| "epoch": 0.01889479503051275, | |
| "grad_norm": 9.4375, | |
| "learning_rate": 0.0003778263585259213, | |
| "loss": 11.2334, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 0.018972872695928095, | |
| "grad_norm": 8.6875, | |
| "learning_rate": 0.0003793878825733916, | |
| "loss": 11.2445, | |
| "step": 6075 | |
| }, | |
| { | |
| "epoch": 0.019050950361343435, | |
| "grad_norm": 9.0, | |
| "learning_rate": 0.000380949406620862, | |
| "loss": 11.2638, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.01912902802675878, | |
| "grad_norm": 9.3125, | |
| "learning_rate": 0.0003825109306683323, | |
| "loss": 11.2733, | |
| "step": 6125 | |
| }, | |
| { | |
| "epoch": 0.01920710569217412, | |
| "grad_norm": 8.875, | |
| "learning_rate": 0.00038407245471580267, | |
| "loss": 11.327, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 0.019285183357589462, | |
| "grad_norm": 9.625, | |
| "learning_rate": 0.00038563397876327293, | |
| "loss": 11.3521, | |
| "step": 6175 | |
| }, | |
| { | |
| "epoch": 0.019363261023004803, | |
| "grad_norm": 9.1875, | |
| "learning_rate": 0.0003871955028107433, | |
| "loss": 11.3203, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.019441338688420146, | |
| "grad_norm": 10.25, | |
| "learning_rate": 0.0003887570268582136, | |
| "loss": 11.4162, | |
| "step": 6225 | |
| }, | |
| { | |
| "epoch": 0.019519416353835486, | |
| "grad_norm": 11.0, | |
| "learning_rate": 0.000390318550905684, | |
| "loss": 11.4063, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 0.01959749401925083, | |
| "grad_norm": 9.9375, | |
| "learning_rate": 0.0003918800749531543, | |
| "loss": 11.5209, | |
| "step": 6275 | |
| }, | |
| { | |
| "epoch": 0.01967557168466617, | |
| "grad_norm": 9.25, | |
| "learning_rate": 0.0003934415990006246, | |
| "loss": 11.5018, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.019753649350081514, | |
| "grad_norm": 12.25, | |
| "learning_rate": 0.00039500312304809493, | |
| "loss": 11.5067, | |
| "step": 6325 | |
| }, | |
| { | |
| "epoch": 0.019831727015496854, | |
| "grad_norm": 10.8125, | |
| "learning_rate": 0.0003965646470955653, | |
| "loss": 11.5646, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 0.019909804680912198, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 0.0003981261711430356, | |
| "loss": 11.5575, | |
| "step": 6375 | |
| }, | |
| { | |
| "epoch": 0.019987882346327538, | |
| "grad_norm": 10.5, | |
| "learning_rate": 0.000399687695190506, | |
| "loss": 11.634, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.02006596001174288, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 0.00040124921923797625, | |
| "loss": 11.7034, | |
| "step": 6425 | |
| }, | |
| { | |
| "epoch": 0.020144037677158222, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 0.00040281074328544657, | |
| "loss": 11.6978, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 0.020222115342573566, | |
| "grad_norm": 12.125, | |
| "learning_rate": 0.00040437226733291694, | |
| "loss": 11.7471, | |
| "step": 6475 | |
| }, | |
| { | |
| "epoch": 0.020300193007988906, | |
| "grad_norm": 10.8125, | |
| "learning_rate": 0.00040593379138038725, | |
| "loss": 11.861, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.02037827067340425, | |
| "grad_norm": 9.8125, | |
| "learning_rate": 0.0004074953154278576, | |
| "loss": 11.7507, | |
| "step": 6525 | |
| }, | |
| { | |
| "epoch": 0.02045634833881959, | |
| "grad_norm": 12.0, | |
| "learning_rate": 0.0004090568394753279, | |
| "loss": 11.8528, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 0.020534426004234933, | |
| "grad_norm": 11.4375, | |
| "learning_rate": 0.00041061836352279825, | |
| "loss": 11.9091, | |
| "step": 6575 | |
| }, | |
| { | |
| "epoch": 0.020612503669650274, | |
| "grad_norm": 11.1875, | |
| "learning_rate": 0.00041217988757026857, | |
| "loss": 11.8911, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.020690581335065617, | |
| "grad_norm": 10.75, | |
| "learning_rate": 0.00041374141161773894, | |
| "loss": 11.9488, | |
| "step": 6625 | |
| }, | |
| { | |
| "epoch": 0.020768659000480957, | |
| "grad_norm": 10.0, | |
| "learning_rate": 0.00041530293566520925, | |
| "loss": 12.016, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 0.0208467366658963, | |
| "grad_norm": 9.75, | |
| "learning_rate": 0.0004168644597126796, | |
| "loss": 11.9987, | |
| "step": 6675 | |
| }, | |
| { | |
| "epoch": 0.02092481433131164, | |
| "grad_norm": 9.875, | |
| "learning_rate": 0.0004184259837601499, | |
| "loss": 12.0069, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.021002891996726985, | |
| "grad_norm": 10.9375, | |
| "learning_rate": 0.00041998750780762026, | |
| "loss": 11.9872, | |
| "step": 6725 | |
| }, | |
| { | |
| "epoch": 0.021080969662142325, | |
| "grad_norm": 10.9375, | |
| "learning_rate": 0.00042154903185509057, | |
| "loss": 12.0362, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 0.02115904732755767, | |
| "grad_norm": 10.25, | |
| "learning_rate": 0.00042311055590256094, | |
| "loss": 12.1232, | |
| "step": 6775 | |
| }, | |
| { | |
| "epoch": 0.02123712499297301, | |
| "grad_norm": 10.375, | |
| "learning_rate": 0.00042467207995003126, | |
| "loss": 12.1279, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.021315202658388353, | |
| "grad_norm": 12.5, | |
| "learning_rate": 0.0004262336039975016, | |
| "loss": 12.1096, | |
| "step": 6825 | |
| }, | |
| { | |
| "epoch": 0.021393280323803693, | |
| "grad_norm": 12.125, | |
| "learning_rate": 0.0004277951280449719, | |
| "loss": 12.1568, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 0.021471357989219036, | |
| "grad_norm": 10.375, | |
| "learning_rate": 0.00042935665209244226, | |
| "loss": 12.191, | |
| "step": 6875 | |
| }, | |
| { | |
| "epoch": 0.021549435654634377, | |
| "grad_norm": 12.125, | |
| "learning_rate": 0.0004309181761399126, | |
| "loss": 12.3206, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.02162751332004972, | |
| "grad_norm": 10.9375, | |
| "learning_rate": 0.00043247970018738294, | |
| "loss": 12.2622, | |
| "step": 6925 | |
| }, | |
| { | |
| "epoch": 0.02170559098546506, | |
| "grad_norm": 11.8125, | |
| "learning_rate": 0.0004340412242348532, | |
| "loss": 12.2397, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 0.021783668650880404, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 0.0004356027482823235, | |
| "loss": 12.3172, | |
| "step": 6975 | |
| }, | |
| { | |
| "epoch": 0.021861746316295744, | |
| "grad_norm": 11.5625, | |
| "learning_rate": 0.0004371642723297939, | |
| "loss": 12.3313, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.021861746316295744, | |
| "eval_loss": 12.409297943115234, | |
| "eval_runtime": 102.1563, | |
| "eval_samples_per_second": 50.932, | |
| "eval_steps_per_second": 3.191, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.021939823981711088, | |
| "grad_norm": 12.0625, | |
| "learning_rate": 0.0004387257963772642, | |
| "loss": 12.4476, | |
| "step": 7025 | |
| }, | |
| { | |
| "epoch": 0.02201790164712643, | |
| "grad_norm": 12.4375, | |
| "learning_rate": 0.0004402873204247346, | |
| "loss": 12.4722, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 0.022095979312541772, | |
| "grad_norm": 13.5625, | |
| "learning_rate": 0.00044184884447220484, | |
| "loss": 12.4586, | |
| "step": 7075 | |
| }, | |
| { | |
| "epoch": 0.022174056977957112, | |
| "grad_norm": 11.75, | |
| "learning_rate": 0.0004434103685196752, | |
| "loss": 12.4961, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.022252134643372456, | |
| "grad_norm": 10.375, | |
| "learning_rate": 0.0004449718925671455, | |
| "loss": 12.5456, | |
| "step": 7125 | |
| }, | |
| { | |
| "epoch": 0.022330212308787796, | |
| "grad_norm": 16.5, | |
| "learning_rate": 0.0004465334166146159, | |
| "loss": 12.5605, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 0.02240828997420314, | |
| "grad_norm": 13.9375, | |
| "learning_rate": 0.0004480949406620862, | |
| "loss": 12.5607, | |
| "step": 7175 | |
| }, | |
| { | |
| "epoch": 0.02248636763961848, | |
| "grad_norm": 12.25, | |
| "learning_rate": 0.0004496564647095565, | |
| "loss": 12.6064, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.022564445305033824, | |
| "grad_norm": 13.875, | |
| "learning_rate": 0.00045121798875702684, | |
| "loss": 12.6238, | |
| "step": 7225 | |
| }, | |
| { | |
| "epoch": 0.022642522970449164, | |
| "grad_norm": 12.1875, | |
| "learning_rate": 0.0004527795128044972, | |
| "loss": 12.6783, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 0.022720600635864507, | |
| "grad_norm": 12.0, | |
| "learning_rate": 0.00045434103685196753, | |
| "loss": 12.6747, | |
| "step": 7275 | |
| }, | |
| { | |
| "epoch": 0.022798678301279848, | |
| "grad_norm": 12.9375, | |
| "learning_rate": 0.0004559025608994379, | |
| "loss": 12.7325, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.02287675596669519, | |
| "grad_norm": 10.75, | |
| "learning_rate": 0.00045746408494690816, | |
| "loss": 12.8587, | |
| "step": 7325 | |
| }, | |
| { | |
| "epoch": 0.02295483363211053, | |
| "grad_norm": 14.5, | |
| "learning_rate": 0.00045902560899437853, | |
| "loss": 12.8184, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 0.023032911297525875, | |
| "grad_norm": 12.3125, | |
| "learning_rate": 0.00046058713304184885, | |
| "loss": 12.8454, | |
| "step": 7375 | |
| }, | |
| { | |
| "epoch": 0.023110988962941215, | |
| "grad_norm": 10.75, | |
| "learning_rate": 0.0004621486570893192, | |
| "loss": 12.8707, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.02318906662835656, | |
| "grad_norm": 13.5625, | |
| "learning_rate": 0.00046371018113678953, | |
| "loss": 12.9231, | |
| "step": 7425 | |
| }, | |
| { | |
| "epoch": 0.0232671442937719, | |
| "grad_norm": 12.125, | |
| "learning_rate": 0.0004652717051842599, | |
| "loss": 12.9452, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 0.023345221959187243, | |
| "grad_norm": 17.0, | |
| "learning_rate": 0.00046683322923173016, | |
| "loss": 13.0146, | |
| "step": 7475 | |
| }, | |
| { | |
| "epoch": 0.023423299624602583, | |
| "grad_norm": 13.4375, | |
| "learning_rate": 0.0004683947532792005, | |
| "loss": 12.9707, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.023501377290017927, | |
| "grad_norm": 13.6875, | |
| "learning_rate": 0.00046995627732667085, | |
| "loss": 12.9913, | |
| "step": 7525 | |
| }, | |
| { | |
| "epoch": 0.023579454955433267, | |
| "grad_norm": 21.125, | |
| "learning_rate": 0.00047151780137414116, | |
| "loss": 13.0038, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 0.02365753262084861, | |
| "grad_norm": 14.25, | |
| "learning_rate": 0.00047307932542161153, | |
| "loss": 13.0291, | |
| "step": 7575 | |
| }, | |
| { | |
| "epoch": 0.02373561028626395, | |
| "grad_norm": 13.5, | |
| "learning_rate": 0.0004746408494690818, | |
| "loss": 12.9994, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.023813687951679294, | |
| "grad_norm": 12.3125, | |
| "learning_rate": 0.00047620237351655217, | |
| "loss": 13.0487, | |
| "step": 7625 | |
| }, | |
| { | |
| "epoch": 0.023891765617094635, | |
| "grad_norm": 12.875, | |
| "learning_rate": 0.0004777638975640225, | |
| "loss": 13.0586, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 0.02396984328250998, | |
| "grad_norm": 12.25, | |
| "learning_rate": 0.00047932542161149285, | |
| "loss": 13.0199, | |
| "step": 7675 | |
| }, | |
| { | |
| "epoch": 0.024047920947925322, | |
| "grad_norm": 12.5625, | |
| "learning_rate": 0.00048088694565896317, | |
| "loss": 13.0435, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.024125998613340662, | |
| "grad_norm": 14.0, | |
| "learning_rate": 0.0004824484697064335, | |
| "loss": 13.1292, | |
| "step": 7725 | |
| }, | |
| { | |
| "epoch": 0.024204076278756006, | |
| "grad_norm": 17.625, | |
| "learning_rate": 0.0004840099937539038, | |
| "loss": 13.1447, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 0.024282153944171346, | |
| "grad_norm": 14.1875, | |
| "learning_rate": 0.00048557151780137417, | |
| "loss": 13.1884, | |
| "step": 7775 | |
| }, | |
| { | |
| "epoch": 0.02436023160958669, | |
| "grad_norm": 14.1875, | |
| "learning_rate": 0.0004871330418488445, | |
| "loss": 13.2918, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.02443830927500203, | |
| "grad_norm": 12.75, | |
| "learning_rate": 0.0004886945658963149, | |
| "loss": 13.2436, | |
| "step": 7825 | |
| }, | |
| { | |
| "epoch": 0.024516386940417374, | |
| "grad_norm": 12.875, | |
| "learning_rate": 0.0004902560899437852, | |
| "loss": 13.2654, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 0.024594464605832714, | |
| "grad_norm": 13.8125, | |
| "learning_rate": 0.0004918176139912555, | |
| "loss": 13.3083, | |
| "step": 7875 | |
| }, | |
| { | |
| "epoch": 0.024672542271248057, | |
| "grad_norm": 15.5, | |
| "learning_rate": 0.0004933791380387258, | |
| "loss": 13.3748, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.024750619936663398, | |
| "grad_norm": 13.0625, | |
| "learning_rate": 0.0004949406620861961, | |
| "loss": 13.3709, | |
| "step": 7925 | |
| }, | |
| { | |
| "epoch": 0.02482869760207874, | |
| "grad_norm": 14.375, | |
| "learning_rate": 0.0004965021861336665, | |
| "loss": 13.4115, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 0.02490677526749408, | |
| "grad_norm": 14.5625, | |
| "learning_rate": 0.0004980637101811367, | |
| "loss": 13.5171, | |
| "step": 7975 | |
| }, | |
| { | |
| "epoch": 0.024984852932909425, | |
| "grad_norm": 15.25, | |
| "learning_rate": 0.0004996252342286071, | |
| "loss": 13.4743, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.024984852932909425, | |
| "eval_loss": 13.542752265930176, | |
| "eval_runtime": 102.3748, | |
| "eval_samples_per_second": 50.823, | |
| "eval_steps_per_second": 3.184, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.025062930598324765, | |
| "grad_norm": 14.25, | |
| "learning_rate": 0.0005011867582760775, | |
| "loss": 13.5245, | |
| "step": 8025 | |
| }, | |
| { | |
| "epoch": 0.02514100826374011, | |
| "grad_norm": 15.375, | |
| "learning_rate": 0.0005027482823235477, | |
| "loss": 13.5847, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 0.02521908592915545, | |
| "grad_norm": 15.5, | |
| "learning_rate": 0.0005043098063710181, | |
| "loss": 13.5861, | |
| "step": 8075 | |
| }, | |
| { | |
| "epoch": 0.025297163594570793, | |
| "grad_norm": 14.0, | |
| "learning_rate": 0.0005058713304184884, | |
| "loss": 13.6261, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.025375241259986133, | |
| "grad_norm": 12.875, | |
| "learning_rate": 0.0005074328544659588, | |
| "loss": 13.6362, | |
| "step": 8125 | |
| }, | |
| { | |
| "epoch": 0.025453318925401477, | |
| "grad_norm": 15.4375, | |
| "learning_rate": 0.0005089943785134291, | |
| "loss": 13.7079, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 0.025531396590816817, | |
| "grad_norm": 16.375, | |
| "learning_rate": 0.0005105559025608995, | |
| "loss": 13.7344, | |
| "step": 8175 | |
| }, | |
| { | |
| "epoch": 0.02560947425623216, | |
| "grad_norm": 13.6875, | |
| "learning_rate": 0.0005121174266083698, | |
| "loss": 13.8572, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.0256875519216475, | |
| "grad_norm": 15.4375, | |
| "learning_rate": 0.0005136789506558401, | |
| "loss": 13.9229, | |
| "step": 8225 | |
| }, | |
| { | |
| "epoch": 0.025765629587062844, | |
| "grad_norm": 14.125, | |
| "learning_rate": 0.0005152404747033104, | |
| "loss": 13.976, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 0.025843707252478185, | |
| "grad_norm": 13.6875, | |
| "learning_rate": 0.0005168019987507809, | |
| "loss": 13.9796, | |
| "step": 8275 | |
| }, | |
| { | |
| "epoch": 0.02592178491789353, | |
| "grad_norm": 14.5625, | |
| "learning_rate": 0.0005183635227982511, | |
| "loss": 14.0409, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.02599986258330887, | |
| "grad_norm": 14.0, | |
| "learning_rate": 0.0005199250468457214, | |
| "loss": 13.9807, | |
| "step": 8325 | |
| }, | |
| { | |
| "epoch": 0.026077940248724212, | |
| "grad_norm": 15.9375, | |
| "learning_rate": 0.0005214865708931917, | |
| "loss": 14.1036, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 0.026156017914139552, | |
| "grad_norm": 13.9375, | |
| "learning_rate": 0.0005230480949406621, | |
| "loss": 14.1808, | |
| "step": 8375 | |
| }, | |
| { | |
| "epoch": 0.026234095579554896, | |
| "grad_norm": 14.75, | |
| "learning_rate": 0.0005246096189881324, | |
| "loss": 14.0815, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.026312173244970236, | |
| "grad_norm": 17.375, | |
| "learning_rate": 0.0005261711430356028, | |
| "loss": 14.2371, | |
| "step": 8425 | |
| }, | |
| { | |
| "epoch": 0.02639025091038558, | |
| "grad_norm": 19.0, | |
| "learning_rate": 0.0005277326670830731, | |
| "loss": 14.3598, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 0.02646832857580092, | |
| "grad_norm": 15.5, | |
| "learning_rate": 0.0005292941911305435, | |
| "loss": 14.395, | |
| "step": 8475 | |
| }, | |
| { | |
| "epoch": 0.026546406241216264, | |
| "grad_norm": 16.0, | |
| "learning_rate": 0.0005308557151780138, | |
| "loss": 14.4232, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.026624483906631604, | |
| "grad_norm": 16.5, | |
| "learning_rate": 0.0005324172392254841, | |
| "loss": 14.498, | |
| "step": 8525 | |
| }, | |
| { | |
| "epoch": 0.026702561572046948, | |
| "grad_norm": 15.875, | |
| "learning_rate": 0.0005339787632729543, | |
| "loss": 14.5161, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 0.026780639237462288, | |
| "grad_norm": 16.625, | |
| "learning_rate": 0.0005355402873204247, | |
| "loss": 14.5657, | |
| "step": 8575 | |
| }, | |
| { | |
| "epoch": 0.02685871690287763, | |
| "grad_norm": 16.25, | |
| "learning_rate": 0.0005371018113678951, | |
| "loss": 14.5677, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.02693679456829297, | |
| "grad_norm": 16.25, | |
| "learning_rate": 0.0005386633354153654, | |
| "loss": 14.7238, | |
| "step": 8625 | |
| }, | |
| { | |
| "epoch": 0.027014872233708315, | |
| "grad_norm": 19.75, | |
| "learning_rate": 0.0005402248594628357, | |
| "loss": 14.7521, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 0.027092949899123656, | |
| "grad_norm": 17.5, | |
| "learning_rate": 0.000541786383510306, | |
| "loss": 14.81, | |
| "step": 8675 | |
| }, | |
| { | |
| "epoch": 0.027171027564539, | |
| "grad_norm": 17.25, | |
| "learning_rate": 0.0005433479075577764, | |
| "loss": 14.8193, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.02724910522995434, | |
| "grad_norm": 19.875, | |
| "learning_rate": 0.0005449094316052468, | |
| "loss": 14.7857, | |
| "step": 8725 | |
| }, | |
| { | |
| "epoch": 0.027327182895369683, | |
| "grad_norm": 16.625, | |
| "learning_rate": 0.0005464709556527171, | |
| "loss": 14.8485, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 0.027405260560785023, | |
| "grad_norm": 17.75, | |
| "learning_rate": 0.0005480324797001874, | |
| "loss": 14.8765, | |
| "step": 8775 | |
| }, | |
| { | |
| "epoch": 0.027483338226200367, | |
| "grad_norm": 16.625, | |
| "learning_rate": 0.0005495940037476578, | |
| "loss": 14.8669, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.027561415891615707, | |
| "grad_norm": 18.375, | |
| "learning_rate": 0.000551155527795128, | |
| "loss": 14.9237, | |
| "step": 8825 | |
| }, | |
| { | |
| "epoch": 0.02763949355703105, | |
| "grad_norm": 18.75, | |
| "learning_rate": 0.0005527170518425983, | |
| "loss": 15.0786, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 0.02771757122244639, | |
| "grad_norm": 18.625, | |
| "learning_rate": 0.0005542785758900687, | |
| "loss": 14.9841, | |
| "step": 8875 | |
| }, | |
| { | |
| "epoch": 0.027795648887861735, | |
| "grad_norm": 16.375, | |
| "learning_rate": 0.0005558400999375391, | |
| "loss": 15.1071, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.027873726553277075, | |
| "grad_norm": 18.0, | |
| "learning_rate": 0.0005574016239850094, | |
| "loss": 15.0875, | |
| "step": 8925 | |
| }, | |
| { | |
| "epoch": 0.02795180421869242, | |
| "grad_norm": 16.75, | |
| "learning_rate": 0.0005589631480324797, | |
| "loss": 15.1422, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 0.02802988188410776, | |
| "grad_norm": 16.125, | |
| "learning_rate": 0.00056052467207995, | |
| "loss": 15.1989, | |
| "step": 8975 | |
| }, | |
| { | |
| "epoch": 0.028107959549523102, | |
| "grad_norm": 17.625, | |
| "learning_rate": 0.0005620861961274205, | |
| "loss": 15.1135, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.028107959549523102, | |
| "eval_loss": 15.24026107788086, | |
| "eval_runtime": 102.3475, | |
| "eval_samples_per_second": 50.837, | |
| "eval_steps_per_second": 3.185, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.028186037214938443, | |
| "grad_norm": 15.4375, | |
| "learning_rate": 0.0005636477201748908, | |
| "loss": 15.2318, | |
| "step": 9025 | |
| }, | |
| { | |
| "epoch": 0.028264114880353786, | |
| "grad_norm": 16.125, | |
| "learning_rate": 0.0005652092442223611, | |
| "loss": 15.1658, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 0.028342192545769126, | |
| "grad_norm": 16.5, | |
| "learning_rate": 0.0005667707682698313, | |
| "loss": 15.2762, | |
| "step": 9075 | |
| }, | |
| { | |
| "epoch": 0.02842027021118447, | |
| "grad_norm": 15.375, | |
| "learning_rate": 0.0005683322923173016, | |
| "loss": 15.1555, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.02849834787659981, | |
| "grad_norm": 16.75, | |
| "learning_rate": 0.000569893816364772, | |
| "loss": 15.1879, | |
| "step": 9125 | |
| }, | |
| { | |
| "epoch": 0.028576425542015154, | |
| "grad_norm": 15.75, | |
| "learning_rate": 0.0005714553404122423, | |
| "loss": 15.1291, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 0.028654503207430494, | |
| "grad_norm": 16.375, | |
| "learning_rate": 0.0005730168644597127, | |
| "loss": 15.2669, | |
| "step": 9175 | |
| }, | |
| { | |
| "epoch": 0.028732580872845838, | |
| "grad_norm": 15.625, | |
| "learning_rate": 0.000574578388507183, | |
| "loss": 15.2461, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.028810658538261178, | |
| "grad_norm": 16.25, | |
| "learning_rate": 0.0005761399125546534, | |
| "loss": 15.1608, | |
| "step": 9225 | |
| }, | |
| { | |
| "epoch": 0.028888736203676522, | |
| "grad_norm": 16.625, | |
| "learning_rate": 0.0005777014366021237, | |
| "loss": 15.2685, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 0.028966813869091862, | |
| "grad_norm": 17.375, | |
| "learning_rate": 0.000579262960649594, | |
| "loss": 15.3043, | |
| "step": 9275 | |
| }, | |
| { | |
| "epoch": 0.029044891534507206, | |
| "grad_norm": 16.625, | |
| "learning_rate": 0.0005808244846970644, | |
| "loss": 15.251, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 0.029122969199922546, | |
| "grad_norm": 17.875, | |
| "learning_rate": 0.0005823860087445347, | |
| "loss": 15.3825, | |
| "step": 9325 | |
| }, | |
| { | |
| "epoch": 0.02920104686533789, | |
| "grad_norm": 17.625, | |
| "learning_rate": 0.000583947532792005, | |
| "loss": 15.2931, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 0.02927912453075323, | |
| "grad_norm": 17.375, | |
| "learning_rate": 0.0005855090568394753, | |
| "loss": 15.3777, | |
| "step": 9375 | |
| }, | |
| { | |
| "epoch": 0.029357202196168573, | |
| "grad_norm": 17.5, | |
| "learning_rate": 0.0005870705808869456, | |
| "loss": 15.369, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 0.029435279861583914, | |
| "grad_norm": 17.75, | |
| "learning_rate": 0.000588632104934416, | |
| "loss": 15.2992, | |
| "step": 9425 | |
| }, | |
| { | |
| "epoch": 0.029513357526999257, | |
| "grad_norm": 15.625, | |
| "learning_rate": 0.0005901936289818864, | |
| "loss": 15.3261, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 0.029591435192414597, | |
| "grad_norm": 19.375, | |
| "learning_rate": 0.0005917551530293567, | |
| "loss": 15.2748, | |
| "step": 9475 | |
| }, | |
| { | |
| "epoch": 0.02966951285782994, | |
| "grad_norm": 17.875, | |
| "learning_rate": 0.000593316677076827, | |
| "loss": 15.2702, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.02974759052324528, | |
| "grad_norm": 20.625, | |
| "learning_rate": 0.0005948782011242974, | |
| "loss": 15.3905, | |
| "step": 9525 | |
| }, | |
| { | |
| "epoch": 0.029825668188660625, | |
| "grad_norm": 16.5, | |
| "learning_rate": 0.0005964397251717677, | |
| "loss": 15.3166, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 0.029903745854075965, | |
| "grad_norm": 17.125, | |
| "learning_rate": 0.000598001249219238, | |
| "loss": 15.3973, | |
| "step": 9575 | |
| }, | |
| { | |
| "epoch": 0.02998182351949131, | |
| "grad_norm": 15.4375, | |
| "learning_rate": 0.0005995627732667083, | |
| "loss": 15.3621, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.03005990118490665, | |
| "grad_norm": 17.375, | |
| "learning_rate": 0.0006011242973141786, | |
| "loss": 15.532, | |
| "step": 9625 | |
| }, | |
| { | |
| "epoch": 0.030137978850321993, | |
| "grad_norm": 21.125, | |
| "learning_rate": 0.000602685821361649, | |
| "loss": 15.6577, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 0.030216056515737333, | |
| "grad_norm": 16.5, | |
| "learning_rate": 0.0006042473454091193, | |
| "loss": 15.7684, | |
| "step": 9675 | |
| }, | |
| { | |
| "epoch": 0.030294134181152677, | |
| "grad_norm": 17.125, | |
| "learning_rate": 0.0006058088694565896, | |
| "loss": 15.8286, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 0.030372211846568017, | |
| "grad_norm": 17.625, | |
| "learning_rate": 0.0006073703935040599, | |
| "loss": 15.9935, | |
| "step": 9725 | |
| }, | |
| { | |
| "epoch": 0.03045028951198336, | |
| "grad_norm": 20.25, | |
| "learning_rate": 0.0006089319175515304, | |
| "loss": 16.0198, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 0.0305283671773987, | |
| "grad_norm": 17.875, | |
| "learning_rate": 0.0006104934415990007, | |
| "loss": 16.0932, | |
| "step": 9775 | |
| }, | |
| { | |
| "epoch": 0.030606444842814044, | |
| "grad_norm": 21.75, | |
| "learning_rate": 0.000612054965646471, | |
| "loss": 16.0459, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 0.030684522508229384, | |
| "grad_norm": 20.25, | |
| "learning_rate": 0.0006136164896939413, | |
| "loss": 16.0705, | |
| "step": 9825 | |
| }, | |
| { | |
| "epoch": 0.030762600173644728, | |
| "grad_norm": 18.625, | |
| "learning_rate": 0.0006151780137414116, | |
| "loss": 16.0394, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 0.03084067783906007, | |
| "grad_norm": 16.75, | |
| "learning_rate": 0.0006167395377888819, | |
| "loss": 16.0319, | |
| "step": 9875 | |
| }, | |
| { | |
| "epoch": 0.030918755504475412, | |
| "grad_norm": 16.375, | |
| "learning_rate": 0.0006183010618363523, | |
| "loss": 16.0178, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.030996833169890752, | |
| "grad_norm": 26.375, | |
| "learning_rate": 0.0006198625858838226, | |
| "loss": 16.1248, | |
| "step": 9925 | |
| }, | |
| { | |
| "epoch": 0.031074910835306096, | |
| "grad_norm": 18.375, | |
| "learning_rate": 0.000621424109931293, | |
| "loss": 16.1081, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 0.031152988500721436, | |
| "grad_norm": 18.25, | |
| "learning_rate": 0.0006229856339787633, | |
| "loss": 16.195, | |
| "step": 9975 | |
| }, | |
| { | |
| "epoch": 0.03123106616613678, | |
| "grad_norm": 19.375, | |
| "learning_rate": 0.0006245471580262336, | |
| "loss": 16.2205, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.03123106616613678, | |
| "eval_loss": 16.29703140258789, | |
| "eval_runtime": 102.3113, | |
| "eval_samples_per_second": 50.855, | |
| "eval_steps_per_second": 3.186, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.03130914383155212, | |
| "grad_norm": 17.375, | |
| "learning_rate": 0.0006261086820737039, | |
| "loss": 16.2662, | |
| "step": 10025 | |
| }, | |
| { | |
| "epoch": 0.03138722149696746, | |
| "grad_norm": 25.625, | |
| "learning_rate": 0.0006276702061211744, | |
| "loss": 16.2528, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 0.031465299162382804, | |
| "grad_norm": 19.5, | |
| "learning_rate": 0.0006292317301686447, | |
| "loss": 16.3618, | |
| "step": 10075 | |
| }, | |
| { | |
| "epoch": 0.03154337682779815, | |
| "grad_norm": 19.25, | |
| "learning_rate": 0.0006307932542161149, | |
| "loss": 16.4928, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 0.03162145449321349, | |
| "grad_norm": 20.125, | |
| "learning_rate": 0.0006323547782635852, | |
| "loss": 16.4938, | |
| "step": 10125 | |
| }, | |
| { | |
| "epoch": 0.03169953215862883, | |
| "grad_norm": 22.5, | |
| "learning_rate": 0.0006339163023110555, | |
| "loss": 16.5037, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 0.03177760982404417, | |
| "grad_norm": 22.0, | |
| "learning_rate": 0.000635477826358526, | |
| "loss": 16.5408, | |
| "step": 10175 | |
| }, | |
| { | |
| "epoch": 0.031855687489459515, | |
| "grad_norm": 22.875, | |
| "learning_rate": 0.0006370393504059963, | |
| "loss": 16.6573, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 0.03193376515487486, | |
| "grad_norm": 20.625, | |
| "learning_rate": 0.0006386008744534666, | |
| "loss": 16.608, | |
| "step": 10225 | |
| }, | |
| { | |
| "epoch": 0.0320118428202902, | |
| "grad_norm": 18.375, | |
| "learning_rate": 0.0006401623985009369, | |
| "loss": 16.6253, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 0.03208992048570554, | |
| "grad_norm": 22.75, | |
| "learning_rate": 0.0006417239225484073, | |
| "loss": 16.7264, | |
| "step": 10275 | |
| }, | |
| { | |
| "epoch": 0.03216799815112088, | |
| "grad_norm": 22.75, | |
| "learning_rate": 0.0006432854465958776, | |
| "loss": 16.7937, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 0.03224607581653623, | |
| "grad_norm": 18.375, | |
| "learning_rate": 0.000644846970643348, | |
| "loss": 16.8422, | |
| "step": 10325 | |
| }, | |
| { | |
| "epoch": 0.03232415348195157, | |
| "grad_norm": 19.625, | |
| "learning_rate": 0.0006464084946908183, | |
| "loss": 16.8992, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 0.03240223114736691, | |
| "grad_norm": 18.375, | |
| "learning_rate": 0.0006479700187382886, | |
| "loss": 16.7641, | |
| "step": 10375 | |
| }, | |
| { | |
| "epoch": 0.03248030881278225, | |
| "grad_norm": 19.25, | |
| "learning_rate": 0.0006495315427857589, | |
| "loss": 16.9004, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 0.032558386478197594, | |
| "grad_norm": 20.0, | |
| "learning_rate": 0.0006510930668332292, | |
| "loss": 16.8661, | |
| "step": 10425 | |
| }, | |
| { | |
| "epoch": 0.03263646414361294, | |
| "grad_norm": 18.5, | |
| "learning_rate": 0.0006526545908806995, | |
| "loss": 16.717, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 0.032714541809028275, | |
| "grad_norm": 21.25, | |
| "learning_rate": 0.00065421611492817, | |
| "loss": 16.7414, | |
| "step": 10475 | |
| }, | |
| { | |
| "epoch": 0.03279261947444362, | |
| "grad_norm": 19.0, | |
| "learning_rate": 0.0006557776389756403, | |
| "loss": 16.751, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.03287069713985896, | |
| "grad_norm": 19.625, | |
| "learning_rate": 0.0006573391630231106, | |
| "loss": 16.761, | |
| "step": 10525 | |
| }, | |
| { | |
| "epoch": 0.032948774805274306, | |
| "grad_norm": 18.625, | |
| "learning_rate": 0.0006589006870705809, | |
| "loss": 16.7134, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 0.03302685247068964, | |
| "grad_norm": 19.5, | |
| "learning_rate": 0.0006604622111180513, | |
| "loss": 16.8445, | |
| "step": 10575 | |
| }, | |
| { | |
| "epoch": 0.033104930136104986, | |
| "grad_norm": 18.125, | |
| "learning_rate": 0.0006620237351655216, | |
| "loss": 16.923, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 0.03318300780152033, | |
| "grad_norm": 21.75, | |
| "learning_rate": 0.0006635852592129918, | |
| "loss": 17.0445, | |
| "step": 10625 | |
| }, | |
| { | |
| "epoch": 0.03326108546693567, | |
| "grad_norm": 20.5, | |
| "learning_rate": 0.0006651467832604622, | |
| "loss": 17.1396, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 0.03333916313235101, | |
| "grad_norm": 20.5, | |
| "learning_rate": 0.0006667083073079325, | |
| "loss": 17.0307, | |
| "step": 10675 | |
| }, | |
| { | |
| "epoch": 0.033417240797766354, | |
| "grad_norm": 18.875, | |
| "learning_rate": 0.0006682698313554029, | |
| "loss": 17.0284, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 0.0334953184631817, | |
| "grad_norm": 18.75, | |
| "learning_rate": 0.0006698313554028732, | |
| "loss": 17.1508, | |
| "step": 10725 | |
| }, | |
| { | |
| "epoch": 0.03357339612859704, | |
| "grad_norm": 24.625, | |
| "learning_rate": 0.0006713928794503435, | |
| "loss": 17.1699, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 0.03365147379401238, | |
| "grad_norm": 29.0, | |
| "learning_rate": 0.0006729544034978139, | |
| "loss": 17.2827, | |
| "step": 10775 | |
| }, | |
| { | |
| "epoch": 0.03372955145942772, | |
| "grad_norm": 22.25, | |
| "learning_rate": 0.0006745159275452843, | |
| "loss": 17.4336, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 0.033807629124843065, | |
| "grad_norm": 19.75, | |
| "learning_rate": 0.0006760774515927546, | |
| "loss": 17.2731, | |
| "step": 10825 | |
| }, | |
| { | |
| "epoch": 0.03388570679025841, | |
| "grad_norm": 20.875, | |
| "learning_rate": 0.0006776389756402249, | |
| "loss": 17.3691, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 0.033963784455673746, | |
| "grad_norm": 20.875, | |
| "learning_rate": 0.0006792004996876951, | |
| "loss": 17.5287, | |
| "step": 10875 | |
| }, | |
| { | |
| "epoch": 0.03404186212108909, | |
| "grad_norm": 19.25, | |
| "learning_rate": 0.0006807620237351655, | |
| "loss": 17.5784, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 0.03411993978650443, | |
| "grad_norm": 26.625, | |
| "learning_rate": 0.0006823235477826359, | |
| "loss": 17.5465, | |
| "step": 10925 | |
| }, | |
| { | |
| "epoch": 0.03419801745191978, | |
| "grad_norm": 21.75, | |
| "learning_rate": 0.0006838850718301062, | |
| "loss": 17.5995, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 0.03427609511733511, | |
| "grad_norm": 20.625, | |
| "learning_rate": 0.0006854465958775765, | |
| "loss": 17.8194, | |
| "step": 10975 | |
| }, | |
| { | |
| "epoch": 0.03435417278275046, | |
| "grad_norm": 21.125, | |
| "learning_rate": 0.0006870081199250469, | |
| "loss": 17.8228, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.03435417278275046, | |
| "eval_loss": 17.8679141998291, | |
| "eval_runtime": 102.2521, | |
| "eval_samples_per_second": 50.884, | |
| "eval_steps_per_second": 3.188, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.0344322504481658, | |
| "grad_norm": 20.25, | |
| "learning_rate": 0.0006885696439725172, | |
| "loss": 17.8434, | |
| "step": 11025 | |
| }, | |
| { | |
| "epoch": 0.034510328113581144, | |
| "grad_norm": 21.25, | |
| "learning_rate": 0.0006901311680199875, | |
| "loss": 17.8783, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 0.03458840577899648, | |
| "grad_norm": 24.25, | |
| "learning_rate": 0.0006916926920674579, | |
| "loss": 18.0649, | |
| "step": 11075 | |
| }, | |
| { | |
| "epoch": 0.034666483444411825, | |
| "grad_norm": 20.625, | |
| "learning_rate": 0.0006932542161149283, | |
| "loss": 18.0142, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 0.03474456110982717, | |
| "grad_norm": 21.25, | |
| "learning_rate": 0.0006948157401623986, | |
| "loss": 18.1228, | |
| "step": 11125 | |
| }, | |
| { | |
| "epoch": 0.03482263877524251, | |
| "grad_norm": 22.625, | |
| "learning_rate": 0.0006963772642098688, | |
| "loss": 18.2188, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 0.03490071644065785, | |
| "grad_norm": 24.625, | |
| "learning_rate": 0.0006979387882573391, | |
| "loss": 18.5369, | |
| "step": 11175 | |
| }, | |
| { | |
| "epoch": 0.03497879410607319, | |
| "grad_norm": 23.625, | |
| "learning_rate": 0.0006995003123048094, | |
| "loss": 18.6513, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 0.035056871771488536, | |
| "grad_norm": 23.125, | |
| "learning_rate": 0.0007010618363522799, | |
| "loss": 18.6154, | |
| "step": 11225 | |
| }, | |
| { | |
| "epoch": 0.03513494943690388, | |
| "grad_norm": 23.75, | |
| "learning_rate": 0.0007026233603997502, | |
| "loss": 18.5765, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 0.035213027102319217, | |
| "grad_norm": 24.25, | |
| "learning_rate": 0.0007041848844472205, | |
| "loss": 18.6452, | |
| "step": 11275 | |
| }, | |
| { | |
| "epoch": 0.03529110476773456, | |
| "grad_norm": 23.5, | |
| "learning_rate": 0.0007057464084946908, | |
| "loss": 18.5797, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 0.035369182433149904, | |
| "grad_norm": 25.0, | |
| "learning_rate": 0.0007073079325421612, | |
| "loss": 18.5652, | |
| "step": 11325 | |
| }, | |
| { | |
| "epoch": 0.03544726009856525, | |
| "grad_norm": 29.625, | |
| "learning_rate": 0.0007088694565896315, | |
| "loss": 18.623, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 0.035525337763980584, | |
| "grad_norm": 25.875, | |
| "learning_rate": 0.0007104309806371019, | |
| "loss": 18.6827, | |
| "step": 11375 | |
| }, | |
| { | |
| "epoch": 0.03560341542939593, | |
| "grad_norm": 25.0, | |
| "learning_rate": 0.0007119925046845721, | |
| "loss": 18.8077, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 0.03568149309481127, | |
| "grad_norm": 26.75, | |
| "learning_rate": 0.0007135540287320425, | |
| "loss": 18.7854, | |
| "step": 11425 | |
| }, | |
| { | |
| "epoch": 0.035759570760226615, | |
| "grad_norm": 26.125, | |
| "learning_rate": 0.0007151155527795128, | |
| "loss": 18.8622, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 0.03583764842564195, | |
| "grad_norm": 24.125, | |
| "learning_rate": 0.0007166770768269831, | |
| "loss": 18.8723, | |
| "step": 11475 | |
| }, | |
| { | |
| "epoch": 0.035915726091057296, | |
| "grad_norm": 23.75, | |
| "learning_rate": 0.0007182386008744534, | |
| "loss": 18.8899, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.03599380375647264, | |
| "grad_norm": 24.375, | |
| "learning_rate": 0.0007198001249219239, | |
| "loss": 18.9028, | |
| "step": 11525 | |
| }, | |
| { | |
| "epoch": 0.03607188142188798, | |
| "grad_norm": 24.5, | |
| "learning_rate": 0.0007213616489693942, | |
| "loss": 18.91, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 0.03614995908730332, | |
| "grad_norm": 24.625, | |
| "learning_rate": 0.0007229231730168645, | |
| "loss": 18.8747, | |
| "step": 11575 | |
| }, | |
| { | |
| "epoch": 0.03622803675271866, | |
| "grad_norm": 27.375, | |
| "learning_rate": 0.0007244846970643348, | |
| "loss": 18.7471, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 0.03630611441813401, | |
| "grad_norm": 21.375, | |
| "learning_rate": 0.0007260462211118051, | |
| "loss": 18.8455, | |
| "step": 11625 | |
| }, | |
| { | |
| "epoch": 0.03638419208354935, | |
| "grad_norm": 23.375, | |
| "learning_rate": 0.0007276077451592754, | |
| "loss": 18.8257, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 0.03646226974896469, | |
| "grad_norm": 19.875, | |
| "learning_rate": 0.0007291692692067458, | |
| "loss": 18.8365, | |
| "step": 11675 | |
| }, | |
| { | |
| "epoch": 0.03654034741438003, | |
| "grad_norm": 24.375, | |
| "learning_rate": 0.0007307307932542161, | |
| "loss": 18.847, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 0.036618425079795375, | |
| "grad_norm": 23.5, | |
| "learning_rate": 0.0007322923173016864, | |
| "loss": 18.9123, | |
| "step": 11725 | |
| }, | |
| { | |
| "epoch": 0.03669650274521072, | |
| "grad_norm": 22.0, | |
| "learning_rate": 0.0007338538413491568, | |
| "loss": 19.0867, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 0.036774580410626055, | |
| "grad_norm": 21.75, | |
| "learning_rate": 0.0007354153653966271, | |
| "loss": 19.0067, | |
| "step": 11775 | |
| }, | |
| { | |
| "epoch": 0.0368526580760414, | |
| "grad_norm": 22.5, | |
| "learning_rate": 0.0007369768894440974, | |
| "loss": 19.1682, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 0.03693073574145674, | |
| "grad_norm": 23.625, | |
| "learning_rate": 0.0007385384134915678, | |
| "loss": 19.3881, | |
| "step": 11825 | |
| }, | |
| { | |
| "epoch": 0.037008813406872086, | |
| "grad_norm": 22.5, | |
| "learning_rate": 0.0007400999375390382, | |
| "loss": 19.3211, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 0.03708689107228742, | |
| "grad_norm": 24.875, | |
| "learning_rate": 0.0007416614615865085, | |
| "loss": 19.4715, | |
| "step": 11875 | |
| }, | |
| { | |
| "epoch": 0.03716496873770277, | |
| "grad_norm": 23.125, | |
| "learning_rate": 0.0007432229856339788, | |
| "loss": 19.4858, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 0.03724304640311811, | |
| "grad_norm": 22.75, | |
| "learning_rate": 0.000744784509681449, | |
| "loss": 19.7193, | |
| "step": 11925 | |
| }, | |
| { | |
| "epoch": 0.037321124068533454, | |
| "grad_norm": 21.375, | |
| "learning_rate": 0.0007463460337289195, | |
| "loss": 19.6023, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 0.03739920173394879, | |
| "grad_norm": 27.375, | |
| "learning_rate": 0.0007479075577763898, | |
| "loss": 19.6003, | |
| "step": 11975 | |
| }, | |
| { | |
| "epoch": 0.037477279399364134, | |
| "grad_norm": 23.75, | |
| "learning_rate": 0.0007494690818238601, | |
| "loss": 19.7391, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.037477279399364134, | |
| "eval_loss": 19.823862075805664, | |
| "eval_runtime": 102.3056, | |
| "eval_samples_per_second": 50.857, | |
| "eval_steps_per_second": 3.187, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.03755535706477948, | |
| "grad_norm": 23.0, | |
| "learning_rate": 0.0007510306058713304, | |
| "loss": 19.8624, | |
| "step": 12025 | |
| }, | |
| { | |
| "epoch": 0.03763343473019482, | |
| "grad_norm": 22.5, | |
| "learning_rate": 0.0007525921299188008, | |
| "loss": 19.7457, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 0.03771151239561016, | |
| "grad_norm": 23.375, | |
| "learning_rate": 0.0007541536539662711, | |
| "loss": 19.83, | |
| "step": 12075 | |
| }, | |
| { | |
| "epoch": 0.0377895900610255, | |
| "grad_norm": 25.625, | |
| "learning_rate": 0.0007557151780137415, | |
| "loss": 19.8478, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 0.037867667726440846, | |
| "grad_norm": 23.5, | |
| "learning_rate": 0.0007572767020612118, | |
| "loss": 19.8336, | |
| "step": 12125 | |
| }, | |
| { | |
| "epoch": 0.03794574539185619, | |
| "grad_norm": 24.75, | |
| "learning_rate": 0.0007588382261086821, | |
| "loss": 19.9314, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 0.038023823057271526, | |
| "grad_norm": 22.375, | |
| "learning_rate": 0.0007603997501561524, | |
| "loss": 19.9643, | |
| "step": 12175 | |
| }, | |
| { | |
| "epoch": 0.03810190072268687, | |
| "grad_norm": 25.0, | |
| "learning_rate": 0.0007619612742036227, | |
| "loss": 19.9156, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 0.03817997838810221, | |
| "grad_norm": 22.125, | |
| "learning_rate": 0.000763522798251093, | |
| "loss": 19.9121, | |
| "step": 12225 | |
| }, | |
| { | |
| "epoch": 0.03825805605351756, | |
| "grad_norm": 23.0, | |
| "learning_rate": 0.0007650843222985633, | |
| "loss": 19.7906, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 0.038336133718932894, | |
| "grad_norm": 23.625, | |
| "learning_rate": 0.0007666458463460338, | |
| "loss": 19.911, | |
| "step": 12275 | |
| }, | |
| { | |
| "epoch": 0.03841421138434824, | |
| "grad_norm": 25.125, | |
| "learning_rate": 0.0007682073703935041, | |
| "loss": 19.7616, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 0.03849228904976358, | |
| "grad_norm": 26.375, | |
| "learning_rate": 0.0007697688944409744, | |
| "loss": 20.0897, | |
| "step": 12325 | |
| }, | |
| { | |
| "epoch": 0.038570366715178925, | |
| "grad_norm": 23.0, | |
| "learning_rate": 0.0007713304184884447, | |
| "loss": 19.9698, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 0.03864844438059426, | |
| "grad_norm": 26.75, | |
| "learning_rate": 0.0007728919425359151, | |
| "loss": 20.0703, | |
| "step": 12375 | |
| }, | |
| { | |
| "epoch": 0.038726522046009605, | |
| "grad_norm": 25.25, | |
| "learning_rate": 0.0007744534665833855, | |
| "loss": 20.0981, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 0.03880459971142495, | |
| "grad_norm": 30.125, | |
| "learning_rate": 0.0007760149906308557, | |
| "loss": 20.1253, | |
| "step": 12425 | |
| }, | |
| { | |
| "epoch": 0.03888267737684029, | |
| "grad_norm": 27.125, | |
| "learning_rate": 0.000777576514678326, | |
| "loss": 20.3167, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 0.03896075504225563, | |
| "grad_norm": 26.5, | |
| "learning_rate": 0.0007791380387257964, | |
| "loss": 20.4952, | |
| "step": 12475 | |
| }, | |
| { | |
| "epoch": 0.03903883270767097, | |
| "grad_norm": 32.75, | |
| "learning_rate": 0.0007806995627732667, | |
| "loss": 20.614, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.03911691037308632, | |
| "grad_norm": 28.0, | |
| "learning_rate": 0.000782261086820737, | |
| "loss": 20.8281, | |
| "step": 12525 | |
| }, | |
| { | |
| "epoch": 0.03919498803850166, | |
| "grad_norm": 27.625, | |
| "learning_rate": 0.0007838226108682074, | |
| "loss": 20.8424, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 0.039273065703917, | |
| "grad_norm": 37.25, | |
| "learning_rate": 0.0007853841349156778, | |
| "loss": 20.9891, | |
| "step": 12575 | |
| }, | |
| { | |
| "epoch": 0.03935114336933234, | |
| "grad_norm": 28.75, | |
| "learning_rate": 0.0007869456589631481, | |
| "loss": 20.8581, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 0.039429221034747684, | |
| "grad_norm": 27.375, | |
| "learning_rate": 0.0007885071830106184, | |
| "loss": 20.8655, | |
| "step": 12625 | |
| }, | |
| { | |
| "epoch": 0.03950729870016303, | |
| "grad_norm": 29.5, | |
| "learning_rate": 0.0007900687070580887, | |
| "loss": 21.0043, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 0.039585376365578365, | |
| "grad_norm": 33.5, | |
| "learning_rate": 0.000791630231105559, | |
| "loss": 21.1795, | |
| "step": 12675 | |
| }, | |
| { | |
| "epoch": 0.03966345403099371, | |
| "grad_norm": 28.625, | |
| "learning_rate": 0.0007931917551530294, | |
| "loss": 21.1842, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 0.03974153169640905, | |
| "grad_norm": 25.125, | |
| "learning_rate": 0.0007947532792004997, | |
| "loss": 21.1583, | |
| "step": 12725 | |
| }, | |
| { | |
| "epoch": 0.039819609361824396, | |
| "grad_norm": 28.875, | |
| "learning_rate": 0.00079631480324797, | |
| "loss": 21.2881, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 0.03989768702723974, | |
| "grad_norm": 30.875, | |
| "learning_rate": 0.0007978763272954403, | |
| "loss": 21.265, | |
| "step": 12775 | |
| }, | |
| { | |
| "epoch": 0.039975764692655076, | |
| "grad_norm": 30.25, | |
| "learning_rate": 0.0007994378513429107, | |
| "loss": 21.3535, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 0.04005384235807042, | |
| "grad_norm": 29.75, | |
| "learning_rate": 0.000800999375390381, | |
| "loss": 21.4447, | |
| "step": 12825 | |
| }, | |
| { | |
| "epoch": 0.04013192002348576, | |
| "grad_norm": 30.75, | |
| "learning_rate": 0.0008025608994378514, | |
| "loss": 21.5344, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 0.04020999768890111, | |
| "grad_norm": 33.75, | |
| "learning_rate": 0.0008041224234853217, | |
| "loss": 21.3934, | |
| "step": 12875 | |
| }, | |
| { | |
| "epoch": 0.040288075354316444, | |
| "grad_norm": 45.0, | |
| "learning_rate": 0.0008056839475327921, | |
| "loss": 21.7891, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 0.04036615301973179, | |
| "grad_norm": 28.125, | |
| "learning_rate": 0.0008072454715802624, | |
| "loss": 22.0609, | |
| "step": 12925 | |
| }, | |
| { | |
| "epoch": 0.04044423068514713, | |
| "grad_norm": 32.0, | |
| "learning_rate": 0.0008088069956277326, | |
| "loss": 21.915, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 0.040522308350562475, | |
| "grad_norm": 27.875, | |
| "learning_rate": 0.0008103685196752029, | |
| "loss": 21.9726, | |
| "step": 12975 | |
| }, | |
| { | |
| "epoch": 0.04060038601597781, | |
| "grad_norm": 27.625, | |
| "learning_rate": 0.0008119300437226734, | |
| "loss": 21.9425, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.04060038601597781, | |
| "eval_loss": 22.01194190979004, | |
| "eval_runtime": 102.3317, | |
| "eval_samples_per_second": 50.844, | |
| "eval_steps_per_second": 3.186, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.040678463681393155, | |
| "grad_norm": 35.75, | |
| "learning_rate": 0.0008134915677701437, | |
| "loss": 21.8983, | |
| "step": 13025 | |
| }, | |
| { | |
| "epoch": 0.0407565413468085, | |
| "grad_norm": 31.375, | |
| "learning_rate": 0.000815053091817614, | |
| "loss": 22.2354, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 0.04083461901222384, | |
| "grad_norm": 28.375, | |
| "learning_rate": 0.0008166146158650843, | |
| "loss": 22.3099, | |
| "step": 13075 | |
| }, | |
| { | |
| "epoch": 0.04091269667763918, | |
| "grad_norm": 34.5, | |
| "learning_rate": 0.0008181761399125547, | |
| "loss": 22.3739, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 0.04099077434305452, | |
| "grad_norm": 29.5, | |
| "learning_rate": 0.000819737663960025, | |
| "loss": 22.4598, | |
| "step": 13125 | |
| }, | |
| { | |
| "epoch": 0.04106885200846987, | |
| "grad_norm": 32.25, | |
| "learning_rate": 0.0008212991880074954, | |
| "loss": 22.7993, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 0.04114692967388521, | |
| "grad_norm": 31.375, | |
| "learning_rate": 0.0008228607120549657, | |
| "loss": 22.7376, | |
| "step": 13175 | |
| }, | |
| { | |
| "epoch": 0.04122500733930055, | |
| "grad_norm": 31.5, | |
| "learning_rate": 0.0008244222361024359, | |
| "loss": 22.6221, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 0.04130308500471589, | |
| "grad_norm": 29.375, | |
| "learning_rate": 0.0008259837601499063, | |
| "loss": 22.6237, | |
| "step": 13225 | |
| }, | |
| { | |
| "epoch": 0.041381162670131234, | |
| "grad_norm": 28.0, | |
| "learning_rate": 0.0008275452841973766, | |
| "loss": 22.4565, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 0.04145924033554658, | |
| "grad_norm": 28.875, | |
| "learning_rate": 0.0008291068082448469, | |
| "loss": 22.4236, | |
| "step": 13275 | |
| }, | |
| { | |
| "epoch": 0.041537318000961915, | |
| "grad_norm": 26.625, | |
| "learning_rate": 0.0008306683322923173, | |
| "loss": 22.4627, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 0.04161539566637726, | |
| "grad_norm": 30.75, | |
| "learning_rate": 0.0008322298563397877, | |
| "loss": 22.5395, | |
| "step": 13325 | |
| }, | |
| { | |
| "epoch": 0.0416934733317926, | |
| "grad_norm": 29.375, | |
| "learning_rate": 0.000833791380387258, | |
| "loss": 22.4437, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 0.041771550997207946, | |
| "grad_norm": 26.375, | |
| "learning_rate": 0.0008353529044347283, | |
| "loss": 22.5234, | |
| "step": 13375 | |
| }, | |
| { | |
| "epoch": 0.04184962866262328, | |
| "grad_norm": 28.0, | |
| "learning_rate": 0.0008369144284821986, | |
| "loss": 22.9237, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 0.041927706328038626, | |
| "grad_norm": 32.0, | |
| "learning_rate": 0.0008384759525296691, | |
| "loss": 22.9402, | |
| "step": 13425 | |
| }, | |
| { | |
| "epoch": 0.04200578399345397, | |
| "grad_norm": 30.375, | |
| "learning_rate": 0.0008400374765771394, | |
| "loss": 23.1061, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 0.04208386165886931, | |
| "grad_norm": 32.0, | |
| "learning_rate": 0.0008415990006246096, | |
| "loss": 22.9162, | |
| "step": 13475 | |
| }, | |
| { | |
| "epoch": 0.04216193932428465, | |
| "grad_norm": 30.25, | |
| "learning_rate": 0.0008431605246720799, | |
| "loss": 23.2072, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.042240016989699994, | |
| "grad_norm": 31.375, | |
| "learning_rate": 0.0008447220487195503, | |
| "loss": 23.2287, | |
| "step": 13525 | |
| }, | |
| { | |
| "epoch": 0.04231809465511534, | |
| "grad_norm": 29.5, | |
| "learning_rate": 0.0008462835727670206, | |
| "loss": 23.1901, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 0.04239617232053068, | |
| "grad_norm": 28.25, | |
| "learning_rate": 0.000847845096814491, | |
| "loss": 23.3087, | |
| "step": 13575 | |
| }, | |
| { | |
| "epoch": 0.04247424998594602, | |
| "grad_norm": 33.75, | |
| "learning_rate": 0.0008494066208619613, | |
| "loss": 23.5178, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 0.04255232765136136, | |
| "grad_norm": 27.875, | |
| "learning_rate": 0.0008509681449094317, | |
| "loss": 23.4003, | |
| "step": 13625 | |
| }, | |
| { | |
| "epoch": 0.042630405316776705, | |
| "grad_norm": 26.25, | |
| "learning_rate": 0.000852529668956902, | |
| "loss": 23.4554, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 0.04270848298219205, | |
| "grad_norm": 24.875, | |
| "learning_rate": 0.0008540911930043723, | |
| "loss": 23.3269, | |
| "step": 13675 | |
| }, | |
| { | |
| "epoch": 0.042786560647607386, | |
| "grad_norm": 24.875, | |
| "learning_rate": 0.0008556527170518426, | |
| "loss": 23.2309, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 0.04286463831302273, | |
| "grad_norm": 29.375, | |
| "learning_rate": 0.0008572142410993128, | |
| "loss": 23.0514, | |
| "step": 13725 | |
| }, | |
| { | |
| "epoch": 0.04294271597843807, | |
| "grad_norm": 27.125, | |
| "learning_rate": 0.0008587757651467833, | |
| "loss": 22.984, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 0.04302079364385342, | |
| "grad_norm": 30.25, | |
| "learning_rate": 0.0008603372891942536, | |
| "loss": 22.9465, | |
| "step": 13775 | |
| }, | |
| { | |
| "epoch": 0.04309887130926875, | |
| "grad_norm": 30.0, | |
| "learning_rate": 0.0008618988132417239, | |
| "loss": 23.0119, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 0.0431769489746841, | |
| "grad_norm": 30.25, | |
| "learning_rate": 0.0008634603372891942, | |
| "loss": 22.9153, | |
| "step": 13825 | |
| }, | |
| { | |
| "epoch": 0.04325502664009944, | |
| "grad_norm": 25.25, | |
| "learning_rate": 0.0008650218613366646, | |
| "loss": 23.1027, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 0.043333104305514784, | |
| "grad_norm": 33.75, | |
| "learning_rate": 0.000866583385384135, | |
| "loss": 23.0265, | |
| "step": 13875 | |
| }, | |
| { | |
| "epoch": 0.04341118197093012, | |
| "grad_norm": 29.625, | |
| "learning_rate": 0.0008681449094316053, | |
| "loss": 23.1209, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 0.043489259636345465, | |
| "grad_norm": 30.0, | |
| "learning_rate": 0.0008697064334790756, | |
| "loss": 23.2931, | |
| "step": 13925 | |
| }, | |
| { | |
| "epoch": 0.04356733730176081, | |
| "grad_norm": 31.25, | |
| "learning_rate": 0.000871267957526546, | |
| "loss": 23.6223, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 0.04364541496717615, | |
| "grad_norm": 29.125, | |
| "learning_rate": 0.0008728294815740162, | |
| "loss": 23.4989, | |
| "step": 13975 | |
| }, | |
| { | |
| "epoch": 0.04372349263259149, | |
| "grad_norm": 30.125, | |
| "learning_rate": 0.0008743910056214865, | |
| "loss": 23.923, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.04372349263259149, | |
| "eval_loss": 23.799776077270508, | |
| "eval_runtime": 102.2075, | |
| "eval_samples_per_second": 50.906, | |
| "eval_steps_per_second": 3.19, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.04380157029800683, | |
| "grad_norm": 32.0, | |
| "learning_rate": 0.0008759525296689569, | |
| "loss": 23.9569, | |
| "step": 14025 | |
| }, | |
| { | |
| "epoch": 0.043879647963422176, | |
| "grad_norm": 30.75, | |
| "learning_rate": 0.0008775140537164273, | |
| "loss": 23.764, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 0.04395772562883752, | |
| "grad_norm": 29.75, | |
| "learning_rate": 0.0008790755777638976, | |
| "loss": 23.4492, | |
| "step": 14075 | |
| }, | |
| { | |
| "epoch": 0.04403580329425286, | |
| "grad_norm": 28.125, | |
| "learning_rate": 0.0008806371018113679, | |
| "loss": 23.5056, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 0.0441138809596682, | |
| "grad_norm": 31.25, | |
| "learning_rate": 0.0008821986258588382, | |
| "loss": 23.7418, | |
| "step": 14125 | |
| }, | |
| { | |
| "epoch": 0.044191958625083544, | |
| "grad_norm": 31.0, | |
| "learning_rate": 0.0008837601499063086, | |
| "loss": 23.7158, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 0.04427003629049889, | |
| "grad_norm": 35.25, | |
| "learning_rate": 0.000885321673953779, | |
| "loss": 24.0083, | |
| "step": 14175 | |
| }, | |
| { | |
| "epoch": 0.044348113955914224, | |
| "grad_norm": 34.75, | |
| "learning_rate": 0.0008868831980012493, | |
| "loss": 23.95, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 0.04442619162132957, | |
| "grad_norm": 34.5, | |
| "learning_rate": 0.0008884447220487196, | |
| "loss": 24.0242, | |
| "step": 14225 | |
| }, | |
| { | |
| "epoch": 0.04450426928674491, | |
| "grad_norm": 34.0, | |
| "learning_rate": 0.0008900062460961898, | |
| "loss": 24.2818, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 0.044582346952160255, | |
| "grad_norm": 32.25, | |
| "learning_rate": 0.0008915677701436602, | |
| "loss": 24.4323, | |
| "step": 14275 | |
| }, | |
| { | |
| "epoch": 0.04466042461757559, | |
| "grad_norm": 31.125, | |
| "learning_rate": 0.0008931292941911305, | |
| "loss": 24.8361, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 0.044738502282990936, | |
| "grad_norm": 32.5, | |
| "learning_rate": 0.0008946908182386009, | |
| "loss": 24.8959, | |
| "step": 14325 | |
| }, | |
| { | |
| "epoch": 0.04481657994840628, | |
| "grad_norm": 30.75, | |
| "learning_rate": 0.0008962523422860712, | |
| "loss": 24.7795, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 0.04489465761382162, | |
| "grad_norm": 45.75, | |
| "learning_rate": 0.0008978138663335416, | |
| "loss": 24.9758, | |
| "step": 14375 | |
| }, | |
| { | |
| "epoch": 0.04497273527923696, | |
| "grad_norm": 31.625, | |
| "learning_rate": 0.0008993753903810119, | |
| "loss": 25.0523, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 0.0450508129446523, | |
| "grad_norm": 36.25, | |
| "learning_rate": 0.0009009369144284822, | |
| "loss": 24.8775, | |
| "step": 14425 | |
| }, | |
| { | |
| "epoch": 0.04512889061006765, | |
| "grad_norm": 34.25, | |
| "learning_rate": 0.0009024984384759525, | |
| "loss": 24.9395, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 0.04520696827548299, | |
| "grad_norm": 36.5, | |
| "learning_rate": 0.000904059962523423, | |
| "loss": 25.0047, | |
| "step": 14475 | |
| }, | |
| { | |
| "epoch": 0.04528504594089833, | |
| "grad_norm": 39.75, | |
| "learning_rate": 0.0009056214865708932, | |
| "loss": 24.9923, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.04536312360631367, | |
| "grad_norm": 32.75, | |
| "learning_rate": 0.0009071830106183635, | |
| "loss": 25.1583, | |
| "step": 14525 | |
| }, | |
| { | |
| "epoch": 0.045441201271729015, | |
| "grad_norm": 31.25, | |
| "learning_rate": 0.0009087445346658338, | |
| "loss": 25.1936, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 0.04551927893714436, | |
| "grad_norm": 31.75, | |
| "learning_rate": 0.0009103060587133042, | |
| "loss": 24.9059, | |
| "step": 14575 | |
| }, | |
| { | |
| "epoch": 0.045597356602559695, | |
| "grad_norm": 34.0, | |
| "learning_rate": 0.0009118675827607745, | |
| "loss": 25.1417, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 0.04567543426797504, | |
| "grad_norm": 32.25, | |
| "learning_rate": 0.0009134291068082449, | |
| "loss": 25.2183, | |
| "step": 14625 | |
| }, | |
| { | |
| "epoch": 0.04575351193339038, | |
| "grad_norm": 35.25, | |
| "learning_rate": 0.0009149906308557152, | |
| "loss": 25.3087, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 0.045831589598805726, | |
| "grad_norm": 31.5, | |
| "learning_rate": 0.0009165521549031856, | |
| "loss": 25.6569, | |
| "step": 14675 | |
| }, | |
| { | |
| "epoch": 0.04590966726422106, | |
| "grad_norm": 40.5, | |
| "learning_rate": 0.0009181136789506559, | |
| "loss": 25.9421, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 0.04598774492963641, | |
| "grad_norm": 39.75, | |
| "learning_rate": 0.0009196752029981262, | |
| "loss": 26.0395, | |
| "step": 14725 | |
| }, | |
| { | |
| "epoch": 0.04606582259505175, | |
| "grad_norm": 41.25, | |
| "learning_rate": 0.0009212367270455964, | |
| "loss": 26.0641, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 0.046143900260467094, | |
| "grad_norm": 39.5, | |
| "learning_rate": 0.0009227982510930668, | |
| "loss": 26.1332, | |
| "step": 14775 | |
| }, | |
| { | |
| "epoch": 0.04622197792588243, | |
| "grad_norm": 36.5, | |
| "learning_rate": 0.0009243597751405372, | |
| "loss": 26.102, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 0.046300055591297774, | |
| "grad_norm": 33.75, | |
| "learning_rate": 0.0009259212991880075, | |
| "loss": 26.1986, | |
| "step": 14825 | |
| }, | |
| { | |
| "epoch": 0.04637813325671312, | |
| "grad_norm": 36.5, | |
| "learning_rate": 0.0009274828232354778, | |
| "loss": 26.0567, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 0.04645621092212846, | |
| "grad_norm": 38.0, | |
| "learning_rate": 0.0009290443472829481, | |
| "loss": 26.2836, | |
| "step": 14875 | |
| }, | |
| { | |
| "epoch": 0.0465342885875438, | |
| "grad_norm": 36.0, | |
| "learning_rate": 0.0009306058713304186, | |
| "loss": 26.6167, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 0.04661236625295914, | |
| "grad_norm": 44.25, | |
| "learning_rate": 0.0009321673953778889, | |
| "loss": 26.4313, | |
| "step": 14925 | |
| }, | |
| { | |
| "epoch": 0.046690443918374486, | |
| "grad_norm": 36.25, | |
| "learning_rate": 0.0009337289194253592, | |
| "loss": 26.1888, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 0.04676852158378983, | |
| "grad_norm": 37.5, | |
| "learning_rate": 0.0009352904434728295, | |
| "loss": 26.2063, | |
| "step": 14975 | |
| }, | |
| { | |
| "epoch": 0.046846599249205166, | |
| "grad_norm": 36.25, | |
| "learning_rate": 0.0009368519675202999, | |
| "loss": 26.3716, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.046846599249205166, | |
| "eval_loss": 26.39820098876953, | |
| "eval_runtime": 102.1335, | |
| "eval_samples_per_second": 50.943, | |
| "eval_steps_per_second": 3.192, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.04692467691462051, | |
| "grad_norm": 35.5, | |
| "learning_rate": 0.0009384134915677701, | |
| "loss": 26.4646, | |
| "step": 15025 | |
| }, | |
| { | |
| "epoch": 0.04700275458003585, | |
| "grad_norm": 37.25, | |
| "learning_rate": 0.0009399750156152404, | |
| "loss": 26.457, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 0.0470808322454512, | |
| "grad_norm": 43.25, | |
| "learning_rate": 0.0009415365396627108, | |
| "loss": 26.4532, | |
| "step": 15075 | |
| }, | |
| { | |
| "epoch": 0.047158909910866534, | |
| "grad_norm": 34.75, | |
| "learning_rate": 0.0009430980637101812, | |
| "loss": 26.32, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 0.04723698757628188, | |
| "grad_norm": 58.5, | |
| "learning_rate": 0.0009446595877576515, | |
| "loss": 26.367, | |
| "step": 15125 | |
| }, | |
| { | |
| "epoch": 0.04731506524169722, | |
| "grad_norm": 44.25, | |
| "learning_rate": 0.0009462211118051218, | |
| "loss": 26.4783, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 0.047393142907112565, | |
| "grad_norm": 35.25, | |
| "learning_rate": 0.0009477826358525921, | |
| "loss": 26.3163, | |
| "step": 15175 | |
| }, | |
| { | |
| "epoch": 0.0474712205725279, | |
| "grad_norm": 36.0, | |
| "learning_rate": 0.0009493441599000626, | |
| "loss": 26.6294, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 0.047549298237943245, | |
| "grad_norm": 38.25, | |
| "learning_rate": 0.0009509056839475329, | |
| "loss": 26.6693, | |
| "step": 15225 | |
| }, | |
| { | |
| "epoch": 0.04762737590335859, | |
| "grad_norm": 42.25, | |
| "learning_rate": 0.0009524672079950032, | |
| "loss": 26.9737, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 0.04770545356877393, | |
| "grad_norm": 33.75, | |
| "learning_rate": 0.0009540287320424734, | |
| "loss": 26.9355, | |
| "step": 15275 | |
| }, | |
| { | |
| "epoch": 0.04778353123418927, | |
| "grad_norm": 37.75, | |
| "learning_rate": 0.0009555902560899437, | |
| "loss": 27.0918, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 0.04786160889960461, | |
| "grad_norm": 37.5, | |
| "learning_rate": 0.0009571517801374141, | |
| "loss": 27.2465, | |
| "step": 15325 | |
| }, | |
| { | |
| "epoch": 0.04793968656501996, | |
| "grad_norm": 35.25, | |
| "learning_rate": 0.0009587133041848845, | |
| "loss": 27.1683, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 0.0480177642304353, | |
| "grad_norm": 35.75, | |
| "learning_rate": 0.0009602748282323548, | |
| "loss": 27.0435, | |
| "step": 15375 | |
| }, | |
| { | |
| "epoch": 0.048095841895850644, | |
| "grad_norm": 39.0, | |
| "learning_rate": 0.0009618363522798251, | |
| "loss": 27.2943, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 0.04817391956126598, | |
| "grad_norm": 37.5, | |
| "learning_rate": 0.0009633978763272955, | |
| "loss": 27.1815, | |
| "step": 15425 | |
| }, | |
| { | |
| "epoch": 0.048251997226681324, | |
| "grad_norm": 38.75, | |
| "learning_rate": 0.0009649594003747658, | |
| "loss": 27.2386, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 0.04833007489209667, | |
| "grad_norm": 43.0, | |
| "learning_rate": 0.0009665209244222361, | |
| "loss": 27.5126, | |
| "step": 15475 | |
| }, | |
| { | |
| "epoch": 0.04840815255751201, | |
| "grad_norm": 44.75, | |
| "learning_rate": 0.0009680824484697065, | |
| "loss": 27.6576, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.04848623022292735, | |
| "grad_norm": 39.75, | |
| "learning_rate": 0.0009696439725171768, | |
| "loss": 27.6394, | |
| "step": 15525 | |
| }, | |
| { | |
| "epoch": 0.04856430788834269, | |
| "grad_norm": 46.5, | |
| "learning_rate": 0.0009712054965646471, | |
| "loss": 27.9862, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 0.048642385553758036, | |
| "grad_norm": 36.5, | |
| "learning_rate": 0.0009727670206121174, | |
| "loss": 27.6303, | |
| "step": 15575 | |
| }, | |
| { | |
| "epoch": 0.04872046321917338, | |
| "grad_norm": 36.25, | |
| "learning_rate": 0.0009743285446595877, | |
| "loss": 27.6376, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 0.048798540884588716, | |
| "grad_norm": 37.25, | |
| "learning_rate": 0.0009758900687070581, | |
| "loss": 27.792, | |
| "step": 15625 | |
| }, | |
| { | |
| "epoch": 0.04887661855000406, | |
| "grad_norm": 37.0, | |
| "learning_rate": 0.0009774515927545285, | |
| "loss": 27.8976, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 0.048954696215419403, | |
| "grad_norm": 44.75, | |
| "learning_rate": 0.0009790131168019988, | |
| "loss": 28.1314, | |
| "step": 15675 | |
| }, | |
| { | |
| "epoch": 0.04903277388083475, | |
| "grad_norm": 41.5, | |
| "learning_rate": 0.000980574640849469, | |
| "loss": 28.1346, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 0.049110851546250084, | |
| "grad_norm": 42.0, | |
| "learning_rate": 0.0009821361648969394, | |
| "loss": 28.3701, | |
| "step": 15725 | |
| }, | |
| { | |
| "epoch": 0.04918892921166543, | |
| "grad_norm": 38.5, | |
| "learning_rate": 0.0009836976889444097, | |
| "loss": 28.2846, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 0.04926700687708077, | |
| "grad_norm": 39.75, | |
| "learning_rate": 0.0009852592129918803, | |
| "loss": 28.4163, | |
| "step": 15775 | |
| }, | |
| { | |
| "epoch": 0.049345084542496115, | |
| "grad_norm": 37.25, | |
| "learning_rate": 0.0009868207370393504, | |
| "loss": 28.4691, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 0.04942316220791145, | |
| "grad_norm": 40.75, | |
| "learning_rate": 0.0009883822610868207, | |
| "loss": 28.3626, | |
| "step": 15825 | |
| }, | |
| { | |
| "epoch": 0.049501239873326795, | |
| "grad_norm": 38.75, | |
| "learning_rate": 0.000989943785134291, | |
| "loss": 28.2031, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 0.04957931753874214, | |
| "grad_norm": 54.25, | |
| "learning_rate": 0.0009915053091817613, | |
| "loss": 28.2261, | |
| "step": 15875 | |
| }, | |
| { | |
| "epoch": 0.04965739520415748, | |
| "grad_norm": 37.0, | |
| "learning_rate": 0.0009930668332292318, | |
| "loss": 27.9449, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 0.04973547286957282, | |
| "grad_norm": 34.25, | |
| "learning_rate": 0.0009946283572767022, | |
| "loss": 27.9998, | |
| "step": 15925 | |
| }, | |
| { | |
| "epoch": 0.04981355053498816, | |
| "grad_norm": 38.5, | |
| "learning_rate": 0.0009961898813241725, | |
| "loss": 27.9713, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 0.04989162820040351, | |
| "grad_norm": 34.5, | |
| "learning_rate": 0.0009977514053716428, | |
| "loss": 28.3091, | |
| "step": 15975 | |
| }, | |
| { | |
| "epoch": 0.04996970586581885, | |
| "grad_norm": 39.75, | |
| "learning_rate": 0.000999312929419113, | |
| "loss": 28.207, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.04996970586581885, | |
| "eval_loss": 28.333789825439453, | |
| "eval_runtime": 102.3237, | |
| "eval_samples_per_second": 50.848, | |
| "eval_steps_per_second": 3.186, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.05004778353123419, | |
| "grad_norm": 37.0, | |
| "learning_rate": 0.000999999994773354, | |
| "loss": 28.2375, | |
| "step": 16025 | |
| }, | |
| { | |
| "epoch": 0.05012586119664953, | |
| "grad_norm": 45.5, | |
| "learning_rate": 0.0009999999594401602, | |
| "loss": 28.1327, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 0.050203938862064874, | |
| "grad_norm": 42.75, | |
| "learning_rate": 0.0009999998907737678, | |
| "loss": 28.3186, | |
| "step": 16075 | |
| }, | |
| { | |
| "epoch": 0.05028201652748022, | |
| "grad_norm": 35.0, | |
| "learning_rate": 0.0009999997887741804, | |
| "loss": 27.9558, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 0.050360094192895555, | |
| "grad_norm": 47.75, | |
| "learning_rate": 0.0009999996534414057, | |
| "loss": 28.2493, | |
| "step": 16125 | |
| }, | |
| { | |
| "epoch": 0.0504381718583109, | |
| "grad_norm": 40.25, | |
| "learning_rate": 0.000999999484775452, | |
| "loss": 28.1951, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 0.05051624952372624, | |
| "grad_norm": 36.25, | |
| "learning_rate": 0.000999999282776331, | |
| "loss": 28.3094, | |
| "step": 16175 | |
| }, | |
| { | |
| "epoch": 0.050594327189141586, | |
| "grad_norm": 36.25, | |
| "learning_rate": 0.000999999047444056, | |
| "loss": 28.579, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 0.05067240485455692, | |
| "grad_norm": 38.5, | |
| "learning_rate": 0.0009999987787786427, | |
| "loss": 28.4296, | |
| "step": 16225 | |
| }, | |
| { | |
| "epoch": 0.050750482519972266, | |
| "grad_norm": 43.0, | |
| "learning_rate": 0.0009999984767801089, | |
| "loss": 28.686, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 0.05082856018538761, | |
| "grad_norm": 46.25, | |
| "learning_rate": 0.0009999981414484749, | |
| "loss": 28.5111, | |
| "step": 16275 | |
| }, | |
| { | |
| "epoch": 0.050906637850802954, | |
| "grad_norm": 44.75, | |
| "learning_rate": 0.000999997772783763, | |
| "loss": 28.7081, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 0.05098471551621829, | |
| "grad_norm": 43.0, | |
| "learning_rate": 0.0009999973707859977, | |
| "loss": 28.9352, | |
| "step": 16325 | |
| }, | |
| { | |
| "epoch": 0.051062793181633634, | |
| "grad_norm": 42.0, | |
| "learning_rate": 0.000999996935455206, | |
| "loss": 28.8936, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 0.05114087084704898, | |
| "grad_norm": 37.25, | |
| "learning_rate": 0.0009999964667914167, | |
| "loss": 28.9924, | |
| "step": 16375 | |
| }, | |
| { | |
| "epoch": 0.05121894851246432, | |
| "grad_norm": 37.75, | |
| "learning_rate": 0.0009999959647946613, | |
| "loss": 28.6103, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 0.05129702617787966, | |
| "grad_norm": 36.25, | |
| "learning_rate": 0.0009999954294649732, | |
| "loss": 28.7174, | |
| "step": 16425 | |
| }, | |
| { | |
| "epoch": 0.051375103843295, | |
| "grad_norm": 48.25, | |
| "learning_rate": 0.0009999948608023876, | |
| "loss": 28.5916, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 0.051453181508710345, | |
| "grad_norm": 37.0, | |
| "learning_rate": 0.0009999942588069433, | |
| "loss": 28.8703, | |
| "step": 16475 | |
| }, | |
| { | |
| "epoch": 0.05153125917412569, | |
| "grad_norm": 37.75, | |
| "learning_rate": 0.0009999936234786795, | |
| "loss": 29.1448, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.051609336839541026, | |
| "grad_norm": 36.25, | |
| "learning_rate": 0.0009999929548176391, | |
| "loss": 28.8964, | |
| "step": 16525 | |
| }, | |
| { | |
| "epoch": 0.05168741450495637, | |
| "grad_norm": 34.75, | |
| "learning_rate": 0.0009999922528238668, | |
| "loss": 28.6221, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 0.05176549217037171, | |
| "grad_norm": 39.5, | |
| "learning_rate": 0.000999991517497409, | |
| "loss": 28.9006, | |
| "step": 16575 | |
| }, | |
| { | |
| "epoch": 0.05184356983578706, | |
| "grad_norm": 35.25, | |
| "learning_rate": 0.0009999907488383148, | |
| "loss": 28.6834, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 0.05192164750120239, | |
| "grad_norm": 36.25, | |
| "learning_rate": 0.0009999899468466358, | |
| "loss": 28.4863, | |
| "step": 16625 | |
| }, | |
| { | |
| "epoch": 0.05199972516661774, | |
| "grad_norm": 34.75, | |
| "learning_rate": 0.0009999891115224251, | |
| "loss": 28.381, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 0.05207780283203308, | |
| "grad_norm": 42.25, | |
| "learning_rate": 0.0009999882428657384, | |
| "loss": 28.4007, | |
| "step": 16675 | |
| }, | |
| { | |
| "epoch": 0.052155880497448424, | |
| "grad_norm": 49.25, | |
| "learning_rate": 0.0009999873408766337, | |
| "loss": 28.3731, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 0.05223395816286376, | |
| "grad_norm": 45.0, | |
| "learning_rate": 0.0009999864055551713, | |
| "loss": 28.1782, | |
| "step": 16725 | |
| }, | |
| { | |
| "epoch": 0.052312035828279105, | |
| "grad_norm": 38.5, | |
| "learning_rate": 0.0009999854369014132, | |
| "loss": 28.2612, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 0.05239011349369445, | |
| "grad_norm": 41.5, | |
| "learning_rate": 0.0009999844349154244, | |
| "loss": 28.0716, | |
| "step": 16775 | |
| }, | |
| { | |
| "epoch": 0.05246819115910979, | |
| "grad_norm": 34.0, | |
| "learning_rate": 0.0009999833995972711, | |
| "loss": 27.842, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 0.05254626882452513, | |
| "grad_norm": 32.5, | |
| "learning_rate": 0.000999982330947023, | |
| "loss": 28.2459, | |
| "step": 16825 | |
| }, | |
| { | |
| "epoch": 0.05262434648994047, | |
| "grad_norm": 36.75, | |
| "learning_rate": 0.000999981228964751, | |
| "loss": 28.2205, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 0.052702424155355816, | |
| "grad_norm": 41.25, | |
| "learning_rate": 0.0009999800936505287, | |
| "loss": 28.2134, | |
| "step": 16875 | |
| }, | |
| { | |
| "epoch": 0.05278050182077116, | |
| "grad_norm": 41.0, | |
| "learning_rate": 0.0009999789250044312, | |
| "loss": 28.0064, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 0.0528585794861865, | |
| "grad_norm": 42.0, | |
| "learning_rate": 0.0009999777230265375, | |
| "loss": 28.2604, | |
| "step": 16925 | |
| }, | |
| { | |
| "epoch": 0.05293665715160184, | |
| "grad_norm": 40.75, | |
| "learning_rate": 0.0009999764877169268, | |
| "loss": 28.5458, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 0.053014734817017184, | |
| "grad_norm": 38.0, | |
| "learning_rate": 0.0009999752190756818, | |
| "loss": 28.4853, | |
| "step": 16975 | |
| }, | |
| { | |
| "epoch": 0.05309281248243253, | |
| "grad_norm": 38.75, | |
| "learning_rate": 0.000999973917102887, | |
| "loss": 28.8174, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.05309281248243253, | |
| "eval_loss": 28.75542449951172, | |
| "eval_runtime": 102.5252, | |
| "eval_samples_per_second": 50.749, | |
| "eval_steps_per_second": 3.18, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.053170890147847864, | |
| "grad_norm": 43.0, | |
| "learning_rate": 0.0009999725817986295, | |
| "loss": 28.8356, | |
| "step": 17025 | |
| }, | |
| { | |
| "epoch": 0.05324896781326321, | |
| "grad_norm": 38.0, | |
| "learning_rate": 0.0009999712131629978, | |
| "loss": 28.9959, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 0.05332704547867855, | |
| "grad_norm": 37.25, | |
| "learning_rate": 0.0009999698111960835, | |
| "loss": 28.713, | |
| "step": 17075 | |
| }, | |
| { | |
| "epoch": 0.053405123144093895, | |
| "grad_norm": 43.5, | |
| "learning_rate": 0.00099996837589798, | |
| "loss": 28.8244, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 0.05348320080950923, | |
| "grad_norm": 36.0, | |
| "learning_rate": 0.000999966907268783, | |
| "loss": 28.8987, | |
| "step": 17125 | |
| }, | |
| { | |
| "epoch": 0.053561278474924576, | |
| "grad_norm": 40.0, | |
| "learning_rate": 0.0009999654053085903, | |
| "loss": 28.6699, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 0.05363935614033992, | |
| "grad_norm": 36.25, | |
| "learning_rate": 0.000999963870017502, | |
| "loss": 28.8461, | |
| "step": 17175 | |
| }, | |
| { | |
| "epoch": 0.05371743380575526, | |
| "grad_norm": 34.0, | |
| "learning_rate": 0.0009999623013956208, | |
| "loss": 28.6992, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 0.0537955114711706, | |
| "grad_norm": 36.0, | |
| "learning_rate": 0.0009999606994430508, | |
| "loss": 28.6228, | |
| "step": 17225 | |
| }, | |
| { | |
| "epoch": 0.05387358913658594, | |
| "grad_norm": 31.0, | |
| "learning_rate": 0.000999959064159899, | |
| "loss": 28.801, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 0.05395166680200129, | |
| "grad_norm": 40.0, | |
| "learning_rate": 0.0009999573955462747, | |
| "loss": 28.9502, | |
| "step": 17275 | |
| }, | |
| { | |
| "epoch": 0.05402974446741663, | |
| "grad_norm": 45.25, | |
| "learning_rate": 0.0009999556936022887, | |
| "loss": 29.1268, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 0.05410782213283197, | |
| "grad_norm": 40.25, | |
| "learning_rate": 0.0009999539583280548, | |
| "loss": 29.3132, | |
| "step": 17325 | |
| }, | |
| { | |
| "epoch": 0.05418589979824731, | |
| "grad_norm": 48.75, | |
| "learning_rate": 0.0009999521897236885, | |
| "loss": 29.2909, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 0.054263977463662655, | |
| "grad_norm": 41.75, | |
| "learning_rate": 0.0009999503877893075, | |
| "loss": 29.5531, | |
| "step": 17375 | |
| }, | |
| { | |
| "epoch": 0.054342055129078, | |
| "grad_norm": 39.75, | |
| "learning_rate": 0.0009999485525250323, | |
| "loss": 29.7544, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 0.054420132794493335, | |
| "grad_norm": 35.25, | |
| "learning_rate": 0.0009999466839309852, | |
| "loss": 29.7906, | |
| "step": 17425 | |
| }, | |
| { | |
| "epoch": 0.05449821045990868, | |
| "grad_norm": 39.5, | |
| "learning_rate": 0.0009999447820072907, | |
| "loss": 30.0083, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 0.05457628812532402, | |
| "grad_norm": 44.0, | |
| "learning_rate": 0.0009999428467540755, | |
| "loss": 29.8602, | |
| "step": 17475 | |
| }, | |
| { | |
| "epoch": 0.054654365790739366, | |
| "grad_norm": 41.25, | |
| "learning_rate": 0.0009999408781714686, | |
| "loss": 30.1297, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.0547324434561547, | |
| "grad_norm": 36.75, | |
| "learning_rate": 0.0009999388762596015, | |
| "loss": 30.0162, | |
| "step": 17525 | |
| }, | |
| { | |
| "epoch": 0.05481052112157005, | |
| "grad_norm": 39.5, | |
| "learning_rate": 0.0009999368410186075, | |
| "loss": 30.2735, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 0.05488859878698539, | |
| "grad_norm": 38.25, | |
| "learning_rate": 0.0009999347724486223, | |
| "loss": 30.0508, | |
| "step": 17575 | |
| }, | |
| { | |
| "epoch": 0.054966676452400734, | |
| "grad_norm": 39.0, | |
| "learning_rate": 0.0009999326705497837, | |
| "loss": 30.0643, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 0.05504475411781607, | |
| "grad_norm": 45.25, | |
| "learning_rate": 0.0009999305353222319, | |
| "loss": 30.1616, | |
| "step": 17625 | |
| }, | |
| { | |
| "epoch": 0.055122831783231414, | |
| "grad_norm": 37.5, | |
| "learning_rate": 0.0009999283667661094, | |
| "loss": 29.9471, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 0.05520090944864676, | |
| "grad_norm": 39.75, | |
| "learning_rate": 0.0009999261648815604, | |
| "loss": 30.057, | |
| "step": 17675 | |
| }, | |
| { | |
| "epoch": 0.0552789871140621, | |
| "grad_norm": 44.0, | |
| "learning_rate": 0.0009999239296687322, | |
| "loss": 30.1771, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 0.05535706477947744, | |
| "grad_norm": 37.75, | |
| "learning_rate": 0.0009999216611277734, | |
| "loss": 30.6398, | |
| "step": 17725 | |
| }, | |
| { | |
| "epoch": 0.05543514244489278, | |
| "grad_norm": 49.75, | |
| "learning_rate": 0.000999919359258835, | |
| "loss": 30.2826, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 0.055513220110308126, | |
| "grad_norm": 38.75, | |
| "learning_rate": 0.0009999170240620715, | |
| "loss": 30.4671, | |
| "step": 17775 | |
| }, | |
| { | |
| "epoch": 0.05559129777572347, | |
| "grad_norm": 36.5, | |
| "learning_rate": 0.0009999146555376376, | |
| "loss": 30.3188, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 0.055669375441138806, | |
| "grad_norm": 41.25, | |
| "learning_rate": 0.0009999122536856913, | |
| "loss": 30.5831, | |
| "step": 17825 | |
| }, | |
| { | |
| "epoch": 0.05574745310655415, | |
| "grad_norm": 37.25, | |
| "learning_rate": 0.000999909818506393, | |
| "loss": 30.3014, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 0.055825530771969493, | |
| "grad_norm": 43.5, | |
| "learning_rate": 0.0009999073499999051, | |
| "loss": 30.3619, | |
| "step": 17875 | |
| }, | |
| { | |
| "epoch": 0.05590360843738484, | |
| "grad_norm": 41.25, | |
| "learning_rate": 0.0009999048481663922, | |
| "loss": 30.207, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 0.05598168610280018, | |
| "grad_norm": 55.75, | |
| "learning_rate": 0.0009999023130060208, | |
| "loss": 30.5041, | |
| "step": 17925 | |
| }, | |
| { | |
| "epoch": 0.05605976376821552, | |
| "grad_norm": 52.0, | |
| "learning_rate": 0.00099989974451896, | |
| "loss": 30.6651, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 0.05613784143363086, | |
| "grad_norm": 51.25, | |
| "learning_rate": 0.000999897142705381, | |
| "loss": 30.8339, | |
| "step": 17975 | |
| }, | |
| { | |
| "epoch": 0.056215919099046205, | |
| "grad_norm": 48.75, | |
| "learning_rate": 0.0009998945075654572, | |
| "loss": 30.9781, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.056215919099046205, | |
| "eval_loss": 31.040813446044922, | |
| "eval_runtime": 102.3512, | |
| "eval_samples_per_second": 50.835, | |
| "eval_steps_per_second": 3.185, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.05629399676446155, | |
| "grad_norm": 43.5, | |
| "learning_rate": 0.0009998918390993648, | |
| "loss": 30.9913, | |
| "step": 18025 | |
| }, | |
| { | |
| "epoch": 0.056372074429876885, | |
| "grad_norm": 42.75, | |
| "learning_rate": 0.000999889137307281, | |
| "loss": 31.086, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 0.05645015209529223, | |
| "grad_norm": 41.0, | |
| "learning_rate": 0.0009998864021893864, | |
| "loss": 31.0512, | |
| "step": 18075 | |
| }, | |
| { | |
| "epoch": 0.05652822976070757, | |
| "grad_norm": 42.75, | |
| "learning_rate": 0.0009998836337458629, | |
| "loss": 31.2091, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 0.056606307426122916, | |
| "grad_norm": 44.25, | |
| "learning_rate": 0.0009998808319768954, | |
| "loss": 31.1535, | |
| "step": 18125 | |
| }, | |
| { | |
| "epoch": 0.05668438509153825, | |
| "grad_norm": 43.5, | |
| "learning_rate": 0.0009998779968826707, | |
| "loss": 31.3788, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 0.0567624627569536, | |
| "grad_norm": 43.75, | |
| "learning_rate": 0.0009998751284633779, | |
| "loss": 31.3632, | |
| "step": 18175 | |
| }, | |
| { | |
| "epoch": 0.05684054042236894, | |
| "grad_norm": 39.0, | |
| "learning_rate": 0.0009998722267192076, | |
| "loss": 31.101, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 0.056918618087784284, | |
| "grad_norm": 38.25, | |
| "learning_rate": 0.000999869291650354, | |
| "loss": 30.8788, | |
| "step": 18225 | |
| }, | |
| { | |
| "epoch": 0.05699669575319962, | |
| "grad_norm": 36.75, | |
| "learning_rate": 0.0009998663232570122, | |
| "loss": 31.0841, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 0.057074773418614964, | |
| "grad_norm": 39.75, | |
| "learning_rate": 0.0009998633215393805, | |
| "loss": 31.4425, | |
| "step": 18275 | |
| }, | |
| { | |
| "epoch": 0.05715285108403031, | |
| "grad_norm": 37.5, | |
| "learning_rate": 0.000999860286497659, | |
| "loss": 31.6592, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 0.05723092874944565, | |
| "grad_norm": 40.0, | |
| "learning_rate": 0.0009998572181320496, | |
| "loss": 31.3277, | |
| "step": 18325 | |
| }, | |
| { | |
| "epoch": 0.05730900641486099, | |
| "grad_norm": 39.75, | |
| "learning_rate": 0.0009998541164427575, | |
| "loss": 31.3697, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 0.05738708408027633, | |
| "grad_norm": 35.0, | |
| "learning_rate": 0.0009998509814299888, | |
| "loss": 31.2663, | |
| "step": 18375 | |
| }, | |
| { | |
| "epoch": 0.057465161745691676, | |
| "grad_norm": 37.25, | |
| "learning_rate": 0.000999847813093953, | |
| "loss": 31.6682, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 0.05754323941110702, | |
| "grad_norm": 38.75, | |
| "learning_rate": 0.0009998446114348612, | |
| "loss": 31.7364, | |
| "step": 18425 | |
| }, | |
| { | |
| "epoch": 0.057621317076522356, | |
| "grad_norm": 48.5, | |
| "learning_rate": 0.0009998413764529266, | |
| "loss": 31.8273, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 0.0576993947419377, | |
| "grad_norm": 39.5, | |
| "learning_rate": 0.0009998381081483651, | |
| "loss": 32.178, | |
| "step": 18475 | |
| }, | |
| { | |
| "epoch": 0.057777472407353044, | |
| "grad_norm": 38.75, | |
| "learning_rate": 0.0009998348065213946, | |
| "loss": 32.3324, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.05785555007276839, | |
| "grad_norm": 41.75, | |
| "learning_rate": 0.000999831471572235, | |
| "loss": 32.6464, | |
| "step": 18525 | |
| }, | |
| { | |
| "epoch": 0.057933627738183724, | |
| "grad_norm": 42.0, | |
| "learning_rate": 0.0009998281033011091, | |
| "loss": 32.1848, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 0.05801170540359907, | |
| "grad_norm": 39.75, | |
| "learning_rate": 0.000999824701708241, | |
| "loss": 32.543, | |
| "step": 18575 | |
| }, | |
| { | |
| "epoch": 0.05808978306901441, | |
| "grad_norm": 48.5, | |
| "learning_rate": 0.0009998212667938578, | |
| "loss": 32.4726, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 0.058167860734429755, | |
| "grad_norm": 45.0, | |
| "learning_rate": 0.000999817798558188, | |
| "loss": 32.2877, | |
| "step": 18625 | |
| }, | |
| { | |
| "epoch": 0.05824593839984509, | |
| "grad_norm": 38.25, | |
| "learning_rate": 0.0009998142970014633, | |
| "loss": 32.4187, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 0.058324016065260435, | |
| "grad_norm": 51.5, | |
| "learning_rate": 0.0009998107621239168, | |
| "loss": 32.6334, | |
| "step": 18675 | |
| }, | |
| { | |
| "epoch": 0.05840209373067578, | |
| "grad_norm": 48.5, | |
| "learning_rate": 0.0009998071939257842, | |
| "loss": 33.0217, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 0.05848017139609112, | |
| "grad_norm": 50.0, | |
| "learning_rate": 0.0009998035924073036, | |
| "loss": 32.839, | |
| "step": 18725 | |
| }, | |
| { | |
| "epoch": 0.05855824906150646, | |
| "grad_norm": 41.75, | |
| "learning_rate": 0.000999799957568715, | |
| "loss": 32.84, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 0.0586363267269218, | |
| "grad_norm": 55.5, | |
| "learning_rate": 0.0009997962894102608, | |
| "loss": 33.0097, | |
| "step": 18775 | |
| }, | |
| { | |
| "epoch": 0.05871440439233715, | |
| "grad_norm": 52.5, | |
| "learning_rate": 0.0009997925879321854, | |
| "loss": 33.0055, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 0.05879248205775249, | |
| "grad_norm": 47.25, | |
| "learning_rate": 0.0009997888531347358, | |
| "loss": 33.3652, | |
| "step": 18825 | |
| }, | |
| { | |
| "epoch": 0.05887055972316783, | |
| "grad_norm": 41.25, | |
| "learning_rate": 0.0009997850850181605, | |
| "loss": 33.1608, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 0.05894863738858317, | |
| "grad_norm": 42.75, | |
| "learning_rate": 0.000999781283582711, | |
| "loss": 33.2872, | |
| "step": 18875 | |
| }, | |
| { | |
| "epoch": 0.059026715053998514, | |
| "grad_norm": 43.25, | |
| "learning_rate": 0.0009997774488286408, | |
| "loss": 33.0581, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 0.05910479271941386, | |
| "grad_norm": 48.0, | |
| "learning_rate": 0.0009997735807562055, | |
| "loss": 33.0212, | |
| "step": 18925 | |
| }, | |
| { | |
| "epoch": 0.059182870384829195, | |
| "grad_norm": 39.0, | |
| "learning_rate": 0.000999769679365663, | |
| "loss": 32.7047, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 0.05926094805024454, | |
| "grad_norm": 41.25, | |
| "learning_rate": 0.0009997657446572735, | |
| "loss": 32.7831, | |
| "step": 18975 | |
| }, | |
| { | |
| "epoch": 0.05933902571565988, | |
| "grad_norm": 42.75, | |
| "learning_rate": 0.0009997617766312988, | |
| "loss": 32.8744, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.05933902571565988, | |
| "eval_loss": 32.887264251708984, | |
| "eval_runtime": 102.2215, | |
| "eval_samples_per_second": 50.899, | |
| "eval_steps_per_second": 3.189, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.059417103381075226, | |
| "grad_norm": 41.5, | |
| "learning_rate": 0.0009997577752880041, | |
| "loss": 32.8132, | |
| "step": 19025 | |
| }, | |
| { | |
| "epoch": 0.05949518104649056, | |
| "grad_norm": 43.75, | |
| "learning_rate": 0.0009997537406276557, | |
| "loss": 32.9501, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 0.059573258711905906, | |
| "grad_norm": 45.25, | |
| "learning_rate": 0.0009997496726505228, | |
| "loss": 32.7061, | |
| "step": 19075 | |
| }, | |
| { | |
| "epoch": 0.05965133637732125, | |
| "grad_norm": 37.5, | |
| "learning_rate": 0.0009997455713568763, | |
| "loss": 32.7181, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 0.059729414042736594, | |
| "grad_norm": 41.0, | |
| "learning_rate": 0.00099974143674699, | |
| "loss": 32.554, | |
| "step": 19125 | |
| }, | |
| { | |
| "epoch": 0.05980749170815193, | |
| "grad_norm": 41.5, | |
| "learning_rate": 0.0009997372688211395, | |
| "loss": 32.7137, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 0.059885569373567274, | |
| "grad_norm": 45.0, | |
| "learning_rate": 0.0009997330675796023, | |
| "loss": 33.0025, | |
| "step": 19175 | |
| }, | |
| { | |
| "epoch": 0.05996364703898262, | |
| "grad_norm": 42.0, | |
| "learning_rate": 0.000999728833022659, | |
| "loss": 32.9643, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 0.06004172470439796, | |
| "grad_norm": 52.5, | |
| "learning_rate": 0.0009997245651505915, | |
| "loss": 32.8268, | |
| "step": 19225 | |
| }, | |
| { | |
| "epoch": 0.0601198023698133, | |
| "grad_norm": 43.0, | |
| "learning_rate": 0.0009997202639636844, | |
| "loss": 32.8, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 0.06019788003522864, | |
| "grad_norm": 56.5, | |
| "learning_rate": 0.0009997159294622246, | |
| "loss": 32.9133, | |
| "step": 19275 | |
| }, | |
| { | |
| "epoch": 0.060275957700643985, | |
| "grad_norm": 44.25, | |
| "learning_rate": 0.000999711561646501, | |
| "loss": 32.8573, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 0.06035403536605933, | |
| "grad_norm": 44.0, | |
| "learning_rate": 0.0009997071605168043, | |
| "loss": 32.7512, | |
| "step": 19325 | |
| }, | |
| { | |
| "epoch": 0.060432113031474666, | |
| "grad_norm": 36.5, | |
| "learning_rate": 0.000999702726073429, | |
| "loss": 32.9202, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 0.06051019069689001, | |
| "grad_norm": 40.0, | |
| "learning_rate": 0.0009996982583166695, | |
| "loss": 32.942, | |
| "step": 19375 | |
| }, | |
| { | |
| "epoch": 0.06058826836230535, | |
| "grad_norm": 39.0, | |
| "learning_rate": 0.0009996937572468246, | |
| "loss": 32.8775, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 0.0606663460277207, | |
| "grad_norm": 37.0, | |
| "learning_rate": 0.000999689222864194, | |
| "loss": 32.8532, | |
| "step": 19425 | |
| }, | |
| { | |
| "epoch": 0.06074442369313603, | |
| "grad_norm": 47.25, | |
| "learning_rate": 0.0009996846551690798, | |
| "loss": 32.9941, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 0.06082250135855138, | |
| "grad_norm": 38.0, | |
| "learning_rate": 0.0009996800541617868, | |
| "loss": 32.8616, | |
| "step": 19475 | |
| }, | |
| { | |
| "epoch": 0.06090057902396672, | |
| "grad_norm": 39.5, | |
| "learning_rate": 0.0009996754198426216, | |
| "loss": 32.9031, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.060978656689382064, | |
| "grad_norm": 44.5, | |
| "learning_rate": 0.0009996707522118933, | |
| "loss": 33.0028, | |
| "step": 19525 | |
| }, | |
| { | |
| "epoch": 0.0610567343547974, | |
| "grad_norm": 39.75, | |
| "learning_rate": 0.0009996660512699128, | |
| "loss": 32.8195, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 0.061134812020212745, | |
| "grad_norm": 40.75, | |
| "learning_rate": 0.0009996613170169936, | |
| "loss": 32.571, | |
| "step": 19575 | |
| }, | |
| { | |
| "epoch": 0.06121288968562809, | |
| "grad_norm": 36.75, | |
| "learning_rate": 0.0009996565494534517, | |
| "loss": 32.5517, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 0.06129096735104343, | |
| "grad_norm": 38.0, | |
| "learning_rate": 0.0009996517485796044, | |
| "loss": 32.5484, | |
| "step": 19625 | |
| }, | |
| { | |
| "epoch": 0.06136904501645877, | |
| "grad_norm": 41.75, | |
| "learning_rate": 0.000999646914395772, | |
| "loss": 32.5895, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 0.06144712268187411, | |
| "grad_norm": 42.0, | |
| "learning_rate": 0.0009996420469022766, | |
| "loss": 32.8765, | |
| "step": 19675 | |
| }, | |
| { | |
| "epoch": 0.061525200347289456, | |
| "grad_norm": 38.5, | |
| "learning_rate": 0.0009996371460994431, | |
| "loss": 32.8793, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 0.0616032780127048, | |
| "grad_norm": 40.25, | |
| "learning_rate": 0.0009996322119875977, | |
| "loss": 33.0708, | |
| "step": 19725 | |
| }, | |
| { | |
| "epoch": 0.06168135567812014, | |
| "grad_norm": 38.0, | |
| "learning_rate": 0.00099962724456707, | |
| "loss": 33.188, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 0.06175943334353548, | |
| "grad_norm": 49.0, | |
| "learning_rate": 0.0009996222438381904, | |
| "loss": 33.2918, | |
| "step": 19775 | |
| }, | |
| { | |
| "epoch": 0.061837511008950824, | |
| "grad_norm": 44.75, | |
| "learning_rate": 0.0009996172098012928, | |
| "loss": 33.4949, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 0.06191558867436617, | |
| "grad_norm": 43.25, | |
| "learning_rate": 0.0009996121424567126, | |
| "loss": 33.8741, | |
| "step": 19825 | |
| }, | |
| { | |
| "epoch": 0.061993666339781504, | |
| "grad_norm": 41.75, | |
| "learning_rate": 0.0009996070418047877, | |
| "loss": 33.6041, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 0.06207174400519685, | |
| "grad_norm": 40.25, | |
| "learning_rate": 0.000999601907845858, | |
| "loss": 33.6722, | |
| "step": 19875 | |
| }, | |
| { | |
| "epoch": 0.06214982167061219, | |
| "grad_norm": 40.5, | |
| "learning_rate": 0.000999596740580266, | |
| "loss": 33.484, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 0.062227899336027535, | |
| "grad_norm": 46.25, | |
| "learning_rate": 0.000999591540008356, | |
| "loss": 33.7352, | |
| "step": 19925 | |
| }, | |
| { | |
| "epoch": 0.06230597700144287, | |
| "grad_norm": 48.5, | |
| "learning_rate": 0.0009995863061304747, | |
| "loss": 33.9541, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 0.062384054666858216, | |
| "grad_norm": 44.0, | |
| "learning_rate": 0.0009995810389469711, | |
| "loss": 34.2383, | |
| "step": 19975 | |
| }, | |
| { | |
| "epoch": 0.06246213233227356, | |
| "grad_norm": 40.75, | |
| "learning_rate": 0.0009995757384581964, | |
| "loss": 33.8251, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.06246213233227356, | |
| "eval_loss": 34.19303512573242, | |
| "eval_runtime": 102.3811, | |
| "eval_samples_per_second": 50.82, | |
| "eval_steps_per_second": 3.184, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.0625402099976889, | |
| "grad_norm": 50.0, | |
| "learning_rate": 0.000999570404664504, | |
| "loss": 34.3706, | |
| "step": 20025 | |
| }, | |
| { | |
| "epoch": 0.06261828766310425, | |
| "grad_norm": 45.75, | |
| "learning_rate": 0.0009995650375662492, | |
| "loss": 34.1775, | |
| "step": 20050 | |
| }, | |
| { | |
| "epoch": 0.06269636532851959, | |
| "grad_norm": 43.5, | |
| "learning_rate": 0.0009995596371637897, | |
| "loss": 34.3327, | |
| "step": 20075 | |
| }, | |
| { | |
| "epoch": 0.06277444299393492, | |
| "grad_norm": 43.25, | |
| "learning_rate": 0.0009995542034574863, | |
| "loss": 34.3871, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 0.06285252065935026, | |
| "grad_norm": 42.75, | |
| "learning_rate": 0.0009995487364477004, | |
| "loss": 33.8116, | |
| "step": 20125 | |
| }, | |
| { | |
| "epoch": 0.06293059832476561, | |
| "grad_norm": 37.5, | |
| "learning_rate": 0.0009995432361347971, | |
| "loss": 33.9015, | |
| "step": 20150 | |
| }, | |
| { | |
| "epoch": 0.06300867599018095, | |
| "grad_norm": 38.5, | |
| "learning_rate": 0.0009995377025191427, | |
| "loss": 33.8639, | |
| "step": 20175 | |
| }, | |
| { | |
| "epoch": 0.0630867536555963, | |
| "grad_norm": 37.25, | |
| "learning_rate": 0.0009995321356011063, | |
| "loss": 33.6663, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 0.06316483132101164, | |
| "grad_norm": 40.5, | |
| "learning_rate": 0.0009995265353810589, | |
| "loss": 33.8264, | |
| "step": 20225 | |
| }, | |
| { | |
| "epoch": 0.06324290898642698, | |
| "grad_norm": 45.25, | |
| "learning_rate": 0.0009995209018593737, | |
| "loss": 33.6851, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 0.06332098665184233, | |
| "grad_norm": 42.0, | |
| "learning_rate": 0.0009995152350364266, | |
| "loss": 33.5799, | |
| "step": 20275 | |
| }, | |
| { | |
| "epoch": 0.06339906431725766, | |
| "grad_norm": 43.25, | |
| "learning_rate": 0.000999509534912595, | |
| "loss": 33.6905, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 0.063477141982673, | |
| "grad_norm": 37.25, | |
| "learning_rate": 0.0009995038014882593, | |
| "loss": 33.4839, | |
| "step": 20325 | |
| }, | |
| { | |
| "epoch": 0.06355521964808834, | |
| "grad_norm": 35.75, | |
| "learning_rate": 0.0009994980347638016, | |
| "loss": 33.6105, | |
| "step": 20350 | |
| }, | |
| { | |
| "epoch": 0.06363329731350369, | |
| "grad_norm": 38.0, | |
| "learning_rate": 0.0009994922347396063, | |
| "loss": 33.9047, | |
| "step": 20375 | |
| }, | |
| { | |
| "epoch": 0.06371137497891903, | |
| "grad_norm": 40.25, | |
| "learning_rate": 0.00099948640141606, | |
| "loss": 34.1876, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 0.06378945264433437, | |
| "grad_norm": 45.75, | |
| "learning_rate": 0.0009994805347935517, | |
| "loss": 33.9303, | |
| "step": 20425 | |
| }, | |
| { | |
| "epoch": 0.06386753030974972, | |
| "grad_norm": 42.75, | |
| "learning_rate": 0.0009994746348724727, | |
| "loss": 33.951, | |
| "step": 20450 | |
| }, | |
| { | |
| "epoch": 0.06394560797516506, | |
| "grad_norm": 50.0, | |
| "learning_rate": 0.000999468701653216, | |
| "loss": 34.056, | |
| "step": 20475 | |
| }, | |
| { | |
| "epoch": 0.0640236856405804, | |
| "grad_norm": 50.5, | |
| "learning_rate": 0.0009994627351361772, | |
| "loss": 33.9114, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 0.06410176330599573, | |
| "grad_norm": 42.25, | |
| "learning_rate": 0.0009994567353217541, | |
| "loss": 34.2422, | |
| "step": 20525 | |
| }, | |
| { | |
| "epoch": 0.06417984097141108, | |
| "grad_norm": 44.25, | |
| "learning_rate": 0.0009994507022103465, | |
| "loss": 34.0631, | |
| "step": 20550 | |
| }, | |
| { | |
| "epoch": 0.06425791863682642, | |
| "grad_norm": 39.75, | |
| "learning_rate": 0.000999444635802357, | |
| "loss": 33.8447, | |
| "step": 20575 | |
| }, | |
| { | |
| "epoch": 0.06433599630224177, | |
| "grad_norm": 44.75, | |
| "learning_rate": 0.00099943853609819, | |
| "loss": 33.8587, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 0.06441407396765711, | |
| "grad_norm": 39.25, | |
| "learning_rate": 0.0009994324030982518, | |
| "loss": 33.943, | |
| "step": 20625 | |
| }, | |
| { | |
| "epoch": 0.06449215163307245, | |
| "grad_norm": 41.75, | |
| "learning_rate": 0.0009994262368029515, | |
| "loss": 33.9425, | |
| "step": 20650 | |
| }, | |
| { | |
| "epoch": 0.0645702292984878, | |
| "grad_norm": 44.5, | |
| "learning_rate": 0.0009994200372127, | |
| "loss": 34.0832, | |
| "step": 20675 | |
| }, | |
| { | |
| "epoch": 0.06464830696390314, | |
| "grad_norm": 39.25, | |
| "learning_rate": 0.000999413804327911, | |
| "loss": 33.9888, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 0.06472638462931847, | |
| "grad_norm": 43.75, | |
| "learning_rate": 0.0009994075381489994, | |
| "loss": 34.1022, | |
| "step": 20725 | |
| }, | |
| { | |
| "epoch": 0.06480446229473381, | |
| "grad_norm": 44.25, | |
| "learning_rate": 0.0009994012386763836, | |
| "loss": 33.9719, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 0.06488253996014916, | |
| "grad_norm": 42.0, | |
| "learning_rate": 0.000999394905910483, | |
| "loss": 33.7568, | |
| "step": 20775 | |
| }, | |
| { | |
| "epoch": 0.0649606176255645, | |
| "grad_norm": 43.75, | |
| "learning_rate": 0.0009993885398517201, | |
| "loss": 33.7079, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 0.06503869529097984, | |
| "grad_norm": 40.0, | |
| "learning_rate": 0.0009993821405005195, | |
| "loss": 33.8396, | |
| "step": 20825 | |
| }, | |
| { | |
| "epoch": 0.06511677295639519, | |
| "grad_norm": 42.5, | |
| "learning_rate": 0.0009993757078573073, | |
| "loss": 33.6027, | |
| "step": 20850 | |
| }, | |
| { | |
| "epoch": 0.06519485062181053, | |
| "grad_norm": 42.5, | |
| "learning_rate": 0.0009993692419225126, | |
| "loss": 33.5388, | |
| "step": 20875 | |
| }, | |
| { | |
| "epoch": 0.06527292828722588, | |
| "grad_norm": 55.0, | |
| "learning_rate": 0.0009993627426965667, | |
| "loss": 33.775, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 0.0653510059526412, | |
| "grad_norm": 39.0, | |
| "learning_rate": 0.0009993562101799024, | |
| "loss": 33.8984, | |
| "step": 20925 | |
| }, | |
| { | |
| "epoch": 0.06542908361805655, | |
| "grad_norm": 41.5, | |
| "learning_rate": 0.0009993496443729557, | |
| "loss": 33.8582, | |
| "step": 20950 | |
| }, | |
| { | |
| "epoch": 0.06550716128347189, | |
| "grad_norm": 37.25, | |
| "learning_rate": 0.0009993430452761639, | |
| "loss": 33.8915, | |
| "step": 20975 | |
| }, | |
| { | |
| "epoch": 0.06558523894888724, | |
| "grad_norm": 35.0, | |
| "learning_rate": 0.0009993364128899672, | |
| "loss": 33.5705, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.06558523894888724, | |
| "eval_loss": 33.73247146606445, | |
| "eval_runtime": 102.3252, | |
| "eval_samples_per_second": 50.848, | |
| "eval_steps_per_second": 3.186, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.06566331661430258, | |
| "grad_norm": 37.0, | |
| "learning_rate": 0.0009993297472148076, | |
| "loss": 33.5467, | |
| "step": 21025 | |
| }, | |
| { | |
| "epoch": 0.06574139427971792, | |
| "grad_norm": 38.5, | |
| "learning_rate": 0.0009993230482511295, | |
| "loss": 33.6705, | |
| "step": 21050 | |
| }, | |
| { | |
| "epoch": 0.06581947194513327, | |
| "grad_norm": 39.0, | |
| "learning_rate": 0.0009993163159993798, | |
| "loss": 33.7872, | |
| "step": 21075 | |
| }, | |
| { | |
| "epoch": 0.06589754961054861, | |
| "grad_norm": 45.5, | |
| "learning_rate": 0.0009993095504600067, | |
| "loss": 33.6316, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 0.06597562727596394, | |
| "grad_norm": 38.0, | |
| "learning_rate": 0.0009993027516334617, | |
| "loss": 33.8796, | |
| "step": 21125 | |
| }, | |
| { | |
| "epoch": 0.06605370494137928, | |
| "grad_norm": 43.75, | |
| "learning_rate": 0.000999295919520198, | |
| "loss": 34.0526, | |
| "step": 21150 | |
| }, | |
| { | |
| "epoch": 0.06613178260679463, | |
| "grad_norm": 36.0, | |
| "learning_rate": 0.000999289054120671, | |
| "loss": 34.1438, | |
| "step": 21175 | |
| }, | |
| { | |
| "epoch": 0.06620986027220997, | |
| "grad_norm": 38.0, | |
| "learning_rate": 0.0009992821554353382, | |
| "loss": 33.7974, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 0.06628793793762532, | |
| "grad_norm": 46.0, | |
| "learning_rate": 0.00099927522346466, | |
| "loss": 33.8107, | |
| "step": 21225 | |
| }, | |
| { | |
| "epoch": 0.06636601560304066, | |
| "grad_norm": 45.75, | |
| "learning_rate": 0.0009992682582090982, | |
| "loss": 33.8952, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 0.066444093268456, | |
| "grad_norm": 39.5, | |
| "learning_rate": 0.0009992612596691171, | |
| "loss": 34.201, | |
| "step": 21275 | |
| }, | |
| { | |
| "epoch": 0.06652217093387135, | |
| "grad_norm": 49.25, | |
| "learning_rate": 0.0009992542278451832, | |
| "loss": 34.2007, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 0.06660024859928668, | |
| "grad_norm": 42.0, | |
| "learning_rate": 0.0009992471627377657, | |
| "loss": 34.3501, | |
| "step": 21325 | |
| }, | |
| { | |
| "epoch": 0.06667832626470202, | |
| "grad_norm": 48.75, | |
| "learning_rate": 0.0009992400643473354, | |
| "loss": 34.4321, | |
| "step": 21350 | |
| }, | |
| { | |
| "epoch": 0.06675640393011736, | |
| "grad_norm": 43.25, | |
| "learning_rate": 0.0009992329326743653, | |
| "loss": 34.638, | |
| "step": 21375 | |
| }, | |
| { | |
| "epoch": 0.06683448159553271, | |
| "grad_norm": 41.75, | |
| "learning_rate": 0.000999225767719331, | |
| "loss": 34.588, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 0.06691255926094805, | |
| "grad_norm": 44.5, | |
| "learning_rate": 0.0009992185694827102, | |
| "loss": 34.7111, | |
| "step": 21425 | |
| }, | |
| { | |
| "epoch": 0.0669906369263634, | |
| "grad_norm": 50.5, | |
| "learning_rate": 0.0009992113379649829, | |
| "loss": 34.7677, | |
| "step": 21450 | |
| }, | |
| { | |
| "epoch": 0.06706871459177874, | |
| "grad_norm": 62.0, | |
| "learning_rate": 0.000999204073166631, | |
| "loss": 35.0234, | |
| "step": 21475 | |
| }, | |
| { | |
| "epoch": 0.06714679225719408, | |
| "grad_norm": 48.0, | |
| "learning_rate": 0.0009991967750881388, | |
| "loss": 35.0909, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 0.06722486992260941, | |
| "grad_norm": 49.5, | |
| "learning_rate": 0.000999189443729993, | |
| "loss": 35.4811, | |
| "step": 21525 | |
| }, | |
| { | |
| "epoch": 0.06730294758802476, | |
| "grad_norm": 58.0, | |
| "learning_rate": 0.0009991820790926824, | |
| "loss": 35.2726, | |
| "step": 21550 | |
| }, | |
| { | |
| "epoch": 0.0673810252534401, | |
| "grad_norm": 55.5, | |
| "learning_rate": 0.0009991746811766975, | |
| "loss": 35.629, | |
| "step": 21575 | |
| }, | |
| { | |
| "epoch": 0.06745910291885544, | |
| "grad_norm": 44.0, | |
| "learning_rate": 0.000999167249982532, | |
| "loss": 35.4736, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 0.06753718058427079, | |
| "grad_norm": 45.75, | |
| "learning_rate": 0.0009991597855106814, | |
| "loss": 35.2275, | |
| "step": 21625 | |
| }, | |
| { | |
| "epoch": 0.06761525824968613, | |
| "grad_norm": 41.5, | |
| "learning_rate": 0.0009991522877616428, | |
| "loss": 35.2907, | |
| "step": 21650 | |
| }, | |
| { | |
| "epoch": 0.06769333591510147, | |
| "grad_norm": 56.5, | |
| "learning_rate": 0.000999144756735916, | |
| "loss": 35.2988, | |
| "step": 21675 | |
| }, | |
| { | |
| "epoch": 0.06777141358051682, | |
| "grad_norm": 56.0, | |
| "learning_rate": 0.000999137192434004, | |
| "loss": 35.2948, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 0.06784949124593215, | |
| "grad_norm": 42.0, | |
| "learning_rate": 0.0009991295948564103, | |
| "loss": 35.1186, | |
| "step": 21725 | |
| }, | |
| { | |
| "epoch": 0.06792756891134749, | |
| "grad_norm": 43.25, | |
| "learning_rate": 0.0009991219640036416, | |
| "loss": 35.115, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 0.06800564657676283, | |
| "grad_norm": 43.75, | |
| "learning_rate": 0.0009991142998762065, | |
| "loss": 35.347, | |
| "step": 21775 | |
| }, | |
| { | |
| "epoch": 0.06808372424217818, | |
| "grad_norm": 45.0, | |
| "learning_rate": 0.000999106602474616, | |
| "loss": 35.3008, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 0.06816180190759352, | |
| "grad_norm": 66.0, | |
| "learning_rate": 0.0009990988717993832, | |
| "loss": 35.321, | |
| "step": 21825 | |
| }, | |
| { | |
| "epoch": 0.06823987957300887, | |
| "grad_norm": 56.0, | |
| "learning_rate": 0.0009990911078510238, | |
| "loss": 35.373, | |
| "step": 21850 | |
| }, | |
| { | |
| "epoch": 0.06831795723842421, | |
| "grad_norm": 49.25, | |
| "learning_rate": 0.000999083310630055, | |
| "loss": 35.2404, | |
| "step": 21875 | |
| }, | |
| { | |
| "epoch": 0.06839603490383955, | |
| "grad_norm": 46.0, | |
| "learning_rate": 0.000999075480136997, | |
| "loss": 35.2177, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 0.06847411256925488, | |
| "grad_norm": 43.5, | |
| "learning_rate": 0.0009990676163723715, | |
| "loss": 35.1759, | |
| "step": 21925 | |
| }, | |
| { | |
| "epoch": 0.06855219023467023, | |
| "grad_norm": 54.5, | |
| "learning_rate": 0.000999059719336703, | |
| "loss": 34.7193, | |
| "step": 21950 | |
| }, | |
| { | |
| "epoch": 0.06863026790008557, | |
| "grad_norm": 48.25, | |
| "learning_rate": 0.0009990517890305175, | |
| "loss": 34.6676, | |
| "step": 21975 | |
| }, | |
| { | |
| "epoch": 0.06870834556550091, | |
| "grad_norm": 44.75, | |
| "learning_rate": 0.0009990438254543442, | |
| "loss": 34.4965, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.06870834556550091, | |
| "eval_loss": 34.531646728515625, | |
| "eval_runtime": 102.6371, | |
| "eval_samples_per_second": 50.693, | |
| "eval_steps_per_second": 3.176, | |
| "step": 22000 | |
| } | |
| ], | |
| "logging_steps": 25, | |
| "max_steps": 320194, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.7899608404454277e+18, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |