Text Generation
Transformers
Safetensors
qwen3
32B
restoration
joseon-dynasty
conversational
text-generation-inference
Instructions to use DAMI-Lab/ARI-32B with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use DAMI-Lab/ARI-32B with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="DAMI-Lab/ARI-32B")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("DAMI-Lab/ARI-32B")
model = AutoModelForCausalLM.from_pretrained("DAMI-Lab/ARI-32B")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use DAMI-Lab/ARI-32B with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "DAMI-Lab/ARI-32B"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "DAMI-Lab/ARI-32B",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
Use Docker
docker model run hf.co/DAMI-Lab/ARI-32B
- SGLang
How to use DAMI-Lab/ARI-32B with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "DAMI-Lab/ARI-32B" \
  --host 0.0.0.0 \
  --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "DAMI-Lab/ARI-32B",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "DAMI-Lab/ARI-32B" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "DAMI-Lab/ARI-32B",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
- Docker Model Runner
How to use DAMI-Lab/ARI-32B with Docker Model Runner:
docker model run hf.co/DAMI-Lab/ARI-32B
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.9587999085893375, | |
| "eval_steps": 500, | |
| "global_step": 60000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.3703370141983031, | |
| "epoch": 0.0016323332571577813, | |
| "grad_norm": 37.0, | |
| "learning_rate": 9.595300261096606e-08, | |
| "loss": 0.5452, | |
| "mean_token_accuracy": 0.8990191352367402, | |
| "num_tokens": 7314577.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 1.3720017457008362, | |
| "epoch": 0.0032646665143155626, | |
| "grad_norm": 41.5, | |
| "learning_rate": 1.9386422976501306e-07, | |
| "loss": 0.5496, | |
| "mean_token_accuracy": 0.8973769438266754, | |
| "num_tokens": 14588101.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 1.3811770510673522, | |
| "epoch": 0.004896999771473344, | |
| "grad_norm": 30.25, | |
| "learning_rate": 2.9177545691906004e-07, | |
| "loss": 0.496, | |
| "mean_token_accuracy": 0.9052803874015808, | |
| "num_tokens": 22193383.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 1.3709602856636047, | |
| "epoch": 0.006529333028631125, | |
| "grad_norm": 10.0, | |
| "learning_rate": 3.896866840731071e-07, | |
| "loss": 0.412, | |
| "mean_token_accuracy": 0.9263910567760467, | |
| "num_tokens": 29160848.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 1.3785104417800904, | |
| "epoch": 0.008161666285788906, | |
| "grad_norm": 7.40625, | |
| "learning_rate": 4.875979112271541e-07, | |
| "loss": 0.3539, | |
| "mean_token_accuracy": 0.9421093094348908, | |
| "num_tokens": 35829209.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 1.4016437172889709, | |
| "epoch": 0.009793999542946689, | |
| "grad_norm": 8.1875, | |
| "learning_rate": 5.855091383812011e-07, | |
| "loss": 0.3663, | |
| "mean_token_accuracy": 0.9401827538013459, | |
| "num_tokens": 42980520.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 1.4059402322769166, | |
| "epoch": 0.01142633280010447, | |
| "grad_norm": 4.84375, | |
| "learning_rate": 6.83420365535248e-07, | |
| "loss": 0.3226, | |
| "mean_token_accuracy": 0.9456453359127045, | |
| "num_tokens": 49839839.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 1.4296678924560546, | |
| "epoch": 0.01305866605726225, | |
| "grad_norm": 3.796875, | |
| "learning_rate": 7.813315926892951e-07, | |
| "loss": 0.3287, | |
| "mean_token_accuracy": 0.9431667315959931, | |
| "num_tokens": 57114933.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 1.4395745277404786, | |
| "epoch": 0.014690999314420031, | |
| "grad_norm": 3.390625, | |
| "learning_rate": 8.79242819843342e-07, | |
| "loss": 0.3109, | |
| "mean_token_accuracy": 0.9427984356880188, | |
| "num_tokens": 64178392.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 1.4591435599327087, | |
| "epoch": 0.016323332571577812, | |
| "grad_norm": 3.890625, | |
| "learning_rate": 9.77154046997389e-07, | |
| "loss": 0.3093, | |
| "mean_token_accuracy": 0.9412086343765259, | |
| "num_tokens": 71359680.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.016323332571577812, | |
| "eval_entropy": 1.4762075742085774, | |
| "eval_loss": 0.324710875749588, | |
| "eval_mean_token_accuracy": 0.9405147298177083, | |
| "eval_num_tokens": 71359680.0, | |
| "eval_runtime": 743.1027, | |
| "eval_samples_per_second": 12.994, | |
| "eval_steps_per_second": 0.102, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 1.5086383247375488, | |
| "epoch": 0.017955665828735593, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 1.075065274151436e-06, | |
| "loss": 0.2984, | |
| "mean_token_accuracy": 0.9443651103973388, | |
| "num_tokens": 78414153.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 1.5364352035522462, | |
| "epoch": 0.019587999085893378, | |
| "grad_norm": 5.96875, | |
| "learning_rate": 1.172976501305483e-06, | |
| "loss": 0.2817, | |
| "mean_token_accuracy": 0.9466875243186951, | |
| "num_tokens": 85753500.0, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 1.5489245629310608, | |
| "epoch": 0.02122033234305116, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 1.27088772845953e-06, | |
| "loss": 0.2808, | |
| "mean_token_accuracy": 0.9457735085487365, | |
| "num_tokens": 92871116.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 1.5694326043128968, | |
| "epoch": 0.02285266560020894, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.368798955613577e-06, | |
| "loss": 0.2981, | |
| "mean_token_accuracy": 0.9427003371715545, | |
| "num_tokens": 100022246.0, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 1.5554496097564696, | |
| "epoch": 0.02448499885736672, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 1.466710182767624e-06, | |
| "loss": 0.2684, | |
| "mean_token_accuracy": 0.9482632505893708, | |
| "num_tokens": 107214871.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 1.5609500217437744, | |
| "epoch": 0.0261173321145245, | |
| "grad_norm": 4.25, | |
| "learning_rate": 1.5646214099216712e-06, | |
| "loss": 0.269, | |
| "mean_token_accuracy": 0.9476964402198792, | |
| "num_tokens": 114188698.0, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 1.5485923647880555, | |
| "epoch": 0.027749665371682282, | |
| "grad_norm": 10.9375, | |
| "learning_rate": 1.662532637075718e-06, | |
| "loss": 0.2584, | |
| "mean_token_accuracy": 0.9505482971668243, | |
| "num_tokens": 121264967.0, | |
| "step": 850 | |
| }, | |
| { | |
| "entropy": 1.546497621536255, | |
| "epoch": 0.029381998628840063, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 1.760443864229765e-06, | |
| "loss": 0.2473, | |
| "mean_token_accuracy": 0.9520090806484223, | |
| "num_tokens": 127984569.0, | |
| "step": 900 | |
| }, | |
| { | |
| "entropy": 1.5344351577758788, | |
| "epoch": 0.031014331885997844, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 1.8583550913838121e-06, | |
| "loss": 0.2728, | |
| "mean_token_accuracy": 0.9471143686771393, | |
| "num_tokens": 135471574.0, | |
| "step": 950 | |
| }, | |
| { | |
| "entropy": 1.5414926147460937, | |
| "epoch": 0.032646665143155625, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 1.956266318537859e-06, | |
| "loss": 0.2573, | |
| "mean_token_accuracy": 0.95045654296875, | |
| "num_tokens": 142637684.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.032646665143155625, | |
| "eval_entropy": 1.544297873179118, | |
| "eval_loss": 0.2730070948600769, | |
| "eval_mean_token_accuracy": 0.9475241343180338, | |
| "eval_num_tokens": 142637684.0, | |
| "eval_runtime": 746.6617, | |
| "eval_samples_per_second": 12.932, | |
| "eval_steps_per_second": 0.102, | |
| "step": 1000 | |
| }, | |
| { | |
| "entropy": 1.5506397199630737, | |
| "epoch": 0.034278998400313405, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 2.054177545691906e-06, | |
| "loss": 0.2524, | |
| "mean_token_accuracy": 0.9507447695732116, | |
| "num_tokens": 149507260.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "entropy": 1.544835422039032, | |
| "epoch": 0.035911331657471186, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 2.152088772845953e-06, | |
| "loss": 0.2477, | |
| "mean_token_accuracy": 0.9511877942085266, | |
| "num_tokens": 156809907.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "entropy": 1.545062973499298, | |
| "epoch": 0.03754366491462897, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 2.25e-06, | |
| "loss": 0.2381, | |
| "mean_token_accuracy": 0.9530003690719604, | |
| "num_tokens": 163878508.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "entropy": 1.5258307456970215, | |
| "epoch": 0.039175998171786755, | |
| "grad_norm": 1.4609375, | |
| "learning_rate": 2.347911227154047e-06, | |
| "loss": 0.2225, | |
| "mean_token_accuracy": 0.9549448072910309, | |
| "num_tokens": 170927568.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "entropy": 1.5170037484169006, | |
| "epoch": 0.040808331428944536, | |
| "grad_norm": 1.375, | |
| "learning_rate": 2.445822454308094e-06, | |
| "loss": 0.2238, | |
| "mean_token_accuracy": 0.9546501576900482, | |
| "num_tokens": 178389889.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "entropy": 1.5287164187431335, | |
| "epoch": 0.04244066468610232, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 2.543733681462141e-06, | |
| "loss": 0.2471, | |
| "mean_token_accuracy": 0.9508370912075043, | |
| "num_tokens": 186069673.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "entropy": 1.5002574920654297, | |
| "epoch": 0.0440729979432601, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 2.641644908616188e-06, | |
| "loss": 0.2332, | |
| "mean_token_accuracy": 0.953369448184967, | |
| "num_tokens": 193396348.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "entropy": 1.5036597728729248, | |
| "epoch": 0.04570533120041788, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 2.739556135770235e-06, | |
| "loss": 0.2245, | |
| "mean_token_accuracy": 0.9541668140888214, | |
| "num_tokens": 200593430.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "entropy": 1.4917612624168397, | |
| "epoch": 0.04733766445757566, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 2.837467362924282e-06, | |
| "loss": 0.2193, | |
| "mean_token_accuracy": 0.9556196844577789, | |
| "num_tokens": 207639383.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "entropy": 1.4869768619537354, | |
| "epoch": 0.04896999771473344, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 2.935378590078329e-06, | |
| "loss": 0.2195, | |
| "mean_token_accuracy": 0.9560010933876038, | |
| "num_tokens": 214438210.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.04896999771473344, | |
| "eval_entropy": 1.4914683151245116, | |
| "eval_loss": 0.24329085648059845, | |
| "eval_mean_token_accuracy": 0.9516400265693664, | |
| "eval_num_tokens": 214438210.0, | |
| "eval_runtime": 743.0423, | |
| "eval_samples_per_second": 12.995, | |
| "eval_steps_per_second": 0.102, | |
| "step": 1500 | |
| }, | |
| { | |
| "entropy": 1.478922975063324, | |
| "epoch": 0.05060233097189122, | |
| "grad_norm": 1.5390625, | |
| "learning_rate": 3.033289817232376e-06, | |
| "loss": 0.2176, | |
| "mean_token_accuracy": 0.9555383801460267, | |
| "num_tokens": 221777722.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "entropy": 1.4821615958213805, | |
| "epoch": 0.052234664229049, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 3.131201044386423e-06, | |
| "loss": 0.2317, | |
| "mean_token_accuracy": 0.9527480769157409, | |
| "num_tokens": 229386810.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "entropy": 1.4608716344833375, | |
| "epoch": 0.05386699748620678, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 3.22911227154047e-06, | |
| "loss": 0.2195, | |
| "mean_token_accuracy": 0.9549958789348603, | |
| "num_tokens": 237068902.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "entropy": 1.4769957304000854, | |
| "epoch": 0.055499330743364564, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 3.327023498694517e-06, | |
| "loss": 0.2305, | |
| "mean_token_accuracy": 0.9542369735240936, | |
| "num_tokens": 243773147.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "entropy": 1.4610289931297302, | |
| "epoch": 0.057131664000522345, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 3.424934725848564e-06, | |
| "loss": 0.2108, | |
| "mean_token_accuracy": 0.9570872175693512, | |
| "num_tokens": 250400651.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "entropy": 1.4512082767486572, | |
| "epoch": 0.058763997257680126, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 3.522845953002611e-06, | |
| "loss": 0.2065, | |
| "mean_token_accuracy": 0.9566819512844086, | |
| "num_tokens": 257634063.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "entropy": 1.4433103609085083, | |
| "epoch": 0.060396330514837906, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 3.6207571801566577e-06, | |
| "loss": 0.2022, | |
| "mean_token_accuracy": 0.9584151470661163, | |
| "num_tokens": 264346086.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "entropy": 1.441446192264557, | |
| "epoch": 0.06202866377199569, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 3.7186684073107047e-06, | |
| "loss": 0.2172, | |
| "mean_token_accuracy": 0.9561198997497559, | |
| "num_tokens": 271468769.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "entropy": 1.4351094341278077, | |
| "epoch": 0.06366099702915347, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 3.816579634464752e-06, | |
| "loss": 0.2056, | |
| "mean_token_accuracy": 0.9582554578781128, | |
| "num_tokens": 278724065.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "entropy": 1.4327974796295166, | |
| "epoch": 0.06529333028631125, | |
| "grad_norm": 1.4453125, | |
| "learning_rate": 3.914490861618799e-06, | |
| "loss": 0.2062, | |
| "mean_token_accuracy": 0.956677463054657, | |
| "num_tokens": 285941177.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.06529333028631125, | |
| "eval_entropy": 1.4438136545817057, | |
| "eval_loss": 0.2242203801870346, | |
| "eval_mean_token_accuracy": 0.9541537570953369, | |
| "eval_num_tokens": 285941177.0, | |
| "eval_runtime": 744.3872, | |
| "eval_samples_per_second": 12.972, | |
| "eval_steps_per_second": 0.102, | |
| "step": 2000 | |
| }, | |
| { | |
| "entropy": 1.444144995212555, | |
| "epoch": 0.06692566354346903, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 4.012402088772846e-06, | |
| "loss": 0.1935, | |
| "mean_token_accuracy": 0.9598511147499085, | |
| "num_tokens": 292840917.0, | |
| "step": 2050 | |
| }, | |
| { | |
| "entropy": 1.4242860412597655, | |
| "epoch": 0.06855799680062681, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 4.1103133159268925e-06, | |
| "loss": 0.1947, | |
| "mean_token_accuracy": 0.9596167039871216, | |
| "num_tokens": 300067261.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "entropy": 1.4138830995559692, | |
| "epoch": 0.07019033005778459, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 4.2082245430809395e-06, | |
| "loss": 0.1999, | |
| "mean_token_accuracy": 0.9582617676258087, | |
| "num_tokens": 307304735.0, | |
| "step": 2150 | |
| }, | |
| { | |
| "entropy": 1.4222793292999267, | |
| "epoch": 0.07182266331494237, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 4.3061357702349865e-06, | |
| "loss": 0.1896, | |
| "mean_token_accuracy": 0.9599699079990387, | |
| "num_tokens": 314065537.0, | |
| "step": 2200 | |
| }, | |
| { | |
| "entropy": 1.4278799700737, | |
| "epoch": 0.07345499657210015, | |
| "grad_norm": 1.4453125, | |
| "learning_rate": 4.4040469973890336e-06, | |
| "loss": 0.1845, | |
| "mean_token_accuracy": 0.960889185667038, | |
| "num_tokens": 320938314.0, | |
| "step": 2250 | |
| }, | |
| { | |
| "entropy": 1.40878901720047, | |
| "epoch": 0.07508732982925793, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 4.501958224543081e-06, | |
| "loss": 0.1917, | |
| "mean_token_accuracy": 0.9593756330013276, | |
| "num_tokens": 327763547.0, | |
| "step": 2300 | |
| }, | |
| { | |
| "entropy": 1.4114052319526673, | |
| "epoch": 0.07671966308641573, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 4.599869451697128e-06, | |
| "loss": 0.1938, | |
| "mean_token_accuracy": 0.9592675876617431, | |
| "num_tokens": 335032376.0, | |
| "step": 2350 | |
| }, | |
| { | |
| "entropy": 1.4144614338874817, | |
| "epoch": 0.07835199634357351, | |
| "grad_norm": 1.4765625, | |
| "learning_rate": 4.697780678851175e-06, | |
| "loss": 0.1962, | |
| "mean_token_accuracy": 0.958512544631958, | |
| "num_tokens": 342636323.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "entropy": 1.4119537329673768, | |
| "epoch": 0.07998432960073129, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 4.795691906005222e-06, | |
| "loss": 0.1882, | |
| "mean_token_accuracy": 0.9601361775398254, | |
| "num_tokens": 349613670.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "entropy": 1.4156457328796386, | |
| "epoch": 0.08161666285788907, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 4.893603133159269e-06, | |
| "loss": 0.19, | |
| "mean_token_accuracy": 0.9593257677555084, | |
| "num_tokens": 356562537.0, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.08161666285788907, | |
| "eval_entropy": 1.407395658493042, | |
| "eval_loss": 0.20515179634094238, | |
| "eval_mean_token_accuracy": 0.9569398101170857, | |
| "eval_num_tokens": 356562537.0, | |
| "eval_runtime": 748.9688, | |
| "eval_samples_per_second": 12.892, | |
| "eval_steps_per_second": 0.101, | |
| "step": 2500 | |
| }, | |
| { | |
| "entropy": 1.4004506325721742, | |
| "epoch": 0.08324899611504685, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 4.991514360313316e-06, | |
| "loss": 0.1797, | |
| "mean_token_accuracy": 0.9611039471626281, | |
| "num_tokens": 363822144.0, | |
| "step": 2550 | |
| }, | |
| { | |
| "entropy": 1.4148499584197998, | |
| "epoch": 0.08488132937220463, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 5.089425587467363e-06, | |
| "loss": 0.1868, | |
| "mean_token_accuracy": 0.9603874254226684, | |
| "num_tokens": 370830813.0, | |
| "step": 2600 | |
| }, | |
| { | |
| "entropy": 1.4092473483085632, | |
| "epoch": 0.08651366262936241, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 5.18733681462141e-06, | |
| "loss": 0.1729, | |
| "mean_token_accuracy": 0.9630460107326507, | |
| "num_tokens": 378019973.0, | |
| "step": 2650 | |
| }, | |
| { | |
| "entropy": 1.424285671710968, | |
| "epoch": 0.0881459958865202, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 5.285248041775457e-06, | |
| "loss": 0.1829, | |
| "mean_token_accuracy": 0.9613824605941772, | |
| "num_tokens": 384991554.0, | |
| "step": 2700 | |
| }, | |
| { | |
| "entropy": 1.410207018852234, | |
| "epoch": 0.08977832914367798, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 5.383159268929505e-06, | |
| "loss": 0.1632, | |
| "mean_token_accuracy": 0.9650751757621765, | |
| "num_tokens": 392065700.0, | |
| "step": 2750 | |
| }, | |
| { | |
| "entropy": 1.4243709874153136, | |
| "epoch": 0.09141066240083576, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 5.481070496083552e-06, | |
| "loss": 0.1728, | |
| "mean_token_accuracy": 0.9626587212085724, | |
| "num_tokens": 399270239.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "entropy": 1.413210895061493, | |
| "epoch": 0.09304299565799354, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 5.578981723237598e-06, | |
| "loss": 0.1791, | |
| "mean_token_accuracy": 0.9606540656089783, | |
| "num_tokens": 406647892.0, | |
| "step": 2850 | |
| }, | |
| { | |
| "entropy": 1.4289642930030824, | |
| "epoch": 0.09467532891515132, | |
| "grad_norm": 2.0, | |
| "learning_rate": 5.676892950391645e-06, | |
| "loss": 0.1853, | |
| "mean_token_accuracy": 0.9598627412319183, | |
| "num_tokens": 414138654.0, | |
| "step": 2900 | |
| }, | |
| { | |
| "entropy": 1.435717191696167, | |
| "epoch": 0.0963076621723091, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 5.774804177545692e-06, | |
| "loss": 0.1768, | |
| "mean_token_accuracy": 0.9624214172363281, | |
| "num_tokens": 421217231.0, | |
| "step": 2950 | |
| }, | |
| { | |
| "entropy": 1.428927412033081, | |
| "epoch": 0.09793999542946688, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 5.872715404699739e-06, | |
| "loss": 0.1829, | |
| "mean_token_accuracy": 0.9605919003486634, | |
| "num_tokens": 428485940.0, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.09793999542946688, | |
| "eval_entropy": 1.4376725546518963, | |
| "eval_loss": 0.19405308365821838, | |
| "eval_mean_token_accuracy": 0.958539453347524, | |
| "eval_num_tokens": 428485940.0, | |
| "eval_runtime": 754.0644, | |
| "eval_samples_per_second": 12.805, | |
| "eval_steps_per_second": 0.101, | |
| "step": 3000 | |
| }, | |
| { | |
| "entropy": 1.438420045375824, | |
| "epoch": 0.09957232868662466, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 5.970626631853786e-06, | |
| "loss": 0.175, | |
| "mean_token_accuracy": 0.9624645209312439, | |
| "num_tokens": 435253940.0, | |
| "step": 3050 | |
| }, | |
| { | |
| "entropy": 1.43366064786911, | |
| "epoch": 0.10120466194378244, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 5.9999946455996105e-06, | |
| "loss": 0.1691, | |
| "mean_token_accuracy": 0.9642149293422699, | |
| "num_tokens": 441868038.0, | |
| "step": 3100 | |
| }, | |
| { | |
| "entropy": 1.4457367968559265, | |
| "epoch": 0.10283699520094022, | |
| "grad_norm": 1.3046875, | |
| "learning_rate": 5.999968420011062e-06, | |
| "loss": 0.1845, | |
| "mean_token_accuracy": 0.9603316211700439, | |
| "num_tokens": 449366573.0, | |
| "step": 3150 | |
| }, | |
| { | |
| "entropy": 1.4273823165893555, | |
| "epoch": 0.104469328458098, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 5.999920339963868e-06, | |
| "loss": 0.1727, | |
| "mean_token_accuracy": 0.9620600152015686, | |
| "num_tokens": 456397259.0, | |
| "step": 3200 | |
| }, | |
| { | |
| "entropy": 1.4280832529067993, | |
| "epoch": 0.10610166171525579, | |
| "grad_norm": 1.4609375, | |
| "learning_rate": 5.999850405808289e-06, | |
| "loss": 0.1706, | |
| "mean_token_accuracy": 0.9628064668178559, | |
| "num_tokens": 463197302.0, | |
| "step": 3250 | |
| }, | |
| { | |
| "entropy": 1.4216149377822875, | |
| "epoch": 0.10773399497241357, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 5.999758618053787e-06, | |
| "loss": 0.1685, | |
| "mean_token_accuracy": 0.9630639374256134, | |
| "num_tokens": 469919649.0, | |
| "step": 3300 | |
| }, | |
| { | |
| "entropy": 1.4262100625038148, | |
| "epoch": 0.10936632822957135, | |
| "grad_norm": 1.21875, | |
| "learning_rate": 5.999644977369027e-06, | |
| "loss": 0.1735, | |
| "mean_token_accuracy": 0.9620367133617401, | |
| "num_tokens": 476875581.0, | |
| "step": 3350 | |
| }, | |
| { | |
| "entropy": 1.4126947259902953, | |
| "epoch": 0.11099866148672913, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 5.9995094845818684e-06, | |
| "loss": 0.1697, | |
| "mean_token_accuracy": 0.9630028975009918, | |
| "num_tokens": 483984060.0, | |
| "step": 3400 | |
| }, | |
| { | |
| "entropy": 1.436975917816162, | |
| "epoch": 0.11263099474388691, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 5.999352140679363e-06, | |
| "loss": 0.1789, | |
| "mean_token_accuracy": 0.9613595926761627, | |
| "num_tokens": 491263590.0, | |
| "step": 3450 | |
| }, | |
| { | |
| "entropy": 1.4022981858253478, | |
| "epoch": 0.11426332800104469, | |
| "grad_norm": 1.0, | |
| "learning_rate": 5.999172946807744e-06, | |
| "loss": 0.1677, | |
| "mean_token_accuracy": 0.9631788098812103, | |
| "num_tokens": 498685855.0, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.11426332800104469, | |
| "eval_entropy": 1.4040173705418904, | |
| "eval_loss": 0.18694917857646942, | |
| "eval_mean_token_accuracy": 0.9596376585960388, | |
| "eval_num_tokens": 498685855.0, | |
| "eval_runtime": 746.2297, | |
| "eval_samples_per_second": 12.94, | |
| "eval_steps_per_second": 0.102, | |
| "step": 3500 | |
| }, | |
| { | |
| "entropy": 1.4095824003219604, | |
| "epoch": 0.11589566125820247, | |
| "grad_norm": 3.796875, | |
| "learning_rate": 5.998971904272421e-06, | |
| "loss": 0.174, | |
| "mean_token_accuracy": 0.9623571968078614, | |
| "num_tokens": 506263057.0, | |
| "step": 3550 | |
| }, | |
| { | |
| "entropy": 1.406613359451294, | |
| "epoch": 0.11752799451536025, | |
| "grad_norm": 1.375, | |
| "learning_rate": 5.998749014537968e-06, | |
| "loss": 0.1674, | |
| "mean_token_accuracy": 0.9625373089313507, | |
| "num_tokens": 513004369.0, | |
| "step": 3600 | |
| }, | |
| { | |
| "entropy": 1.405109441280365, | |
| "epoch": 0.11916032777251803, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 5.998504279228114e-06, | |
| "loss": 0.1655, | |
| "mean_token_accuracy": 0.9644717895984649, | |
| "num_tokens": 519807945.0, | |
| "step": 3650 | |
| }, | |
| { | |
| "entropy": 1.4100734496116638, | |
| "epoch": 0.12079266102967581, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 5.99823770012573e-06, | |
| "loss": 0.1678, | |
| "mean_token_accuracy": 0.9630844235420227, | |
| "num_tokens": 526976842.0, | |
| "step": 3700 | |
| }, | |
| { | |
| "entropy": 1.407007110118866, | |
| "epoch": 0.1224249942868336, | |
| "grad_norm": 1.25, | |
| "learning_rate": 5.997949279172815e-06, | |
| "loss": 0.1655, | |
| "mean_token_accuracy": 0.9636348211765289, | |
| "num_tokens": 533987687.0, | |
| "step": 3750 | |
| }, | |
| { | |
| "entropy": 1.4068945956230163, | |
| "epoch": 0.12405732754399137, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 5.9976390184704885e-06, | |
| "loss": 0.1701, | |
| "mean_token_accuracy": 0.962993438243866, | |
| "num_tokens": 540885515.0, | |
| "step": 3800 | |
| }, | |
| { | |
| "entropy": 1.4040190553665162, | |
| "epoch": 0.12568966080114916, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 5.997306920278967e-06, | |
| "loss": 0.1736, | |
| "mean_token_accuracy": 0.9614431858062744, | |
| "num_tokens": 548248278.0, | |
| "step": 3850 | |
| }, | |
| { | |
| "entropy": 1.3793179154396058, | |
| "epoch": 0.12732199405830694, | |
| "grad_norm": 1.5, | |
| "learning_rate": 5.99695298701755e-06, | |
| "loss": 0.1484, | |
| "mean_token_accuracy": 0.9667005109786987, | |
| "num_tokens": 555190537.0, | |
| "step": 3900 | |
| }, | |
| { | |
| "entropy": 1.3872941756248474, | |
| "epoch": 0.12895432731546472, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 5.996577221264605e-06, | |
| "loss": 0.1648, | |
| "mean_token_accuracy": 0.9634542167186737, | |
| "num_tokens": 562395377.0, | |
| "step": 3950 | |
| }, | |
| { | |
| "entropy": 1.3957655119895935, | |
| "epoch": 0.1305866605726225, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 5.9961796257575485e-06, | |
| "loss": 0.1684, | |
| "mean_token_accuracy": 0.9631939339637756, | |
| "num_tokens": 569190032.0, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.1305866605726225, | |
| "eval_entropy": 1.4019947210947672, | |
| "eval_loss": 0.18201017379760742, | |
| "eval_mean_token_accuracy": 0.960600491364797, | |
| "eval_num_tokens": 569190032.0, | |
| "eval_runtime": 751.8458, | |
| "eval_samples_per_second": 12.843, | |
| "eval_steps_per_second": 0.101, | |
| "step": 4000 | |
| }, | |
| { | |
| "entropy": 1.3909747576713563, | |
| "epoch": 0.13221899382978028, | |
| "grad_norm": 6.75, | |
| "learning_rate": 5.99576020339282e-06, | |
| "loss": 0.1614, | |
| "mean_token_accuracy": 0.9640302300453186, | |
| "num_tokens": 576222815.0, | |
| "step": 4050 | |
| }, | |
| { | |
| "entropy": 1.3952715015411377, | |
| "epoch": 0.13385132708693806, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 5.995318957225869e-06, | |
| "loss": 0.1572, | |
| "mean_token_accuracy": 0.9658111941814422, | |
| "num_tokens": 582993007.0, | |
| "step": 4100 | |
| }, | |
| { | |
| "entropy": 1.4083420133590698, | |
| "epoch": 0.13548366034409584, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 5.994855890471128e-06, | |
| "loss": 0.1634, | |
| "mean_token_accuracy": 0.964879697561264, | |
| "num_tokens": 589829579.0, | |
| "step": 4150 | |
| }, | |
| { | |
| "entropy": 1.4027349591255187, | |
| "epoch": 0.13711599360125362, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 5.9943710065019905e-06, | |
| "loss": 0.1586, | |
| "mean_token_accuracy": 0.9652115881443024, | |
| "num_tokens": 596795182.0, | |
| "step": 4200 | |
| }, | |
| { | |
| "entropy": 1.4036769008636474, | |
| "epoch": 0.1387483268584114, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 5.993864308850785e-06, | |
| "loss": 0.1644, | |
| "mean_token_accuracy": 0.9636000382900238, | |
| "num_tokens": 603861008.0, | |
| "step": 4250 | |
| }, | |
| { | |
| "entropy": 1.4230398559570312, | |
| "epoch": 0.14038066011556918, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 5.9933358012087526e-06, | |
| "loss": 0.1651, | |
| "mean_token_accuracy": 0.9630668413639069, | |
| "num_tokens": 611150442.0, | |
| "step": 4300 | |
| }, | |
| { | |
| "entropy": 1.4209274005889894, | |
| "epoch": 0.14201299337272696, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 5.992785487426016e-06, | |
| "loss": 0.1613, | |
| "mean_token_accuracy": 0.9643425738811493, | |
| "num_tokens": 617582752.0, | |
| "step": 4350 | |
| }, | |
| { | |
| "entropy": 1.409316577911377, | |
| "epoch": 0.14364532662988475, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 5.992213371511554e-06, | |
| "loss": 0.1606, | |
| "mean_token_accuracy": 0.9648306250572205, | |
| "num_tokens": 624322193.0, | |
| "step": 4400 | |
| }, | |
| { | |
| "entropy": 1.406817865371704, | |
| "epoch": 0.14527765988704253, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 5.991619457633171e-06, | |
| "loss": 0.1659, | |
| "mean_token_accuracy": 0.9639068651199341, | |
| "num_tokens": 631733777.0, | |
| "step": 4450 | |
| }, | |
| { | |
| "entropy": 1.4001825642585755, | |
| "epoch": 0.1469099931442003, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 5.991003750117468e-06, | |
| "loss": 0.1601, | |
| "mean_token_accuracy": 0.9647636806964874, | |
| "num_tokens": 639594509.0, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.1469099931442003, | |
| "eval_entropy": 1.4037580092748005, | |
| "eval_loss": 0.17858904600143433, | |
| "eval_mean_token_accuracy": 0.961095765431722, | |
| "eval_num_tokens": 639594509.0, | |
| "eval_runtime": 747.1098, | |
| "eval_samples_per_second": 12.924, | |
| "eval_steps_per_second": 0.102, | |
| "step": 4500 | |
| }, | |
| { | |
| "entropy": 1.3959466004371643, | |
| "epoch": 0.1485423264013581, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 5.990366253449812e-06, | |
| "loss": 0.1605, | |
| "mean_token_accuracy": 0.9646508944034576, | |
| "num_tokens": 647057874.0, | |
| "step": 4550 | |
| }, | |
| { | |
| "entropy": 1.3969790387153624, | |
| "epoch": 0.15017465965851587, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 5.989706972274299e-06, | |
| "loss": 0.1617, | |
| "mean_token_accuracy": 0.9644351935386658, | |
| "num_tokens": 654287916.0, | |
| "step": 4600 | |
| }, | |
| { | |
| "entropy": 1.3977243065834046, | |
| "epoch": 0.15180699291567368, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 5.989025911393723e-06, | |
| "loss": 0.1733, | |
| "mean_token_accuracy": 0.9629066979885101, | |
| "num_tokens": 661344302.0, | |
| "step": 4650 | |
| }, | |
| { | |
| "entropy": 1.4031280422210692, | |
| "epoch": 0.15343932617283146, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 5.988323075769544e-06, | |
| "loss": 0.1478, | |
| "mean_token_accuracy": 0.967905158996582, | |
| "num_tokens": 667929168.0, | |
| "step": 4700 | |
| }, | |
| { | |
| "entropy": 1.4076430988311768, | |
| "epoch": 0.15507165942998924, | |
| "grad_norm": 0.9375, | |
| "learning_rate": 5.987598470521845e-06, | |
| "loss": 0.1585, | |
| "mean_token_accuracy": 0.965575454235077, | |
| "num_tokens": 675162945.0, | |
| "step": 4750 | |
| }, | |
| { | |
| "entropy": 1.4047666358947755, | |
| "epoch": 0.15670399268714702, | |
| "grad_norm": 1.1171875, | |
| "learning_rate": 5.986852100929301e-06, | |
| "loss": 0.1633, | |
| "mean_token_accuracy": 0.9638245010375976, | |
| "num_tokens": 682171614.0, | |
| "step": 4800 | |
| }, | |
| { | |
| "entropy": 1.4109639859199523, | |
| "epoch": 0.1583363259443048, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 5.986083972429135e-06, | |
| "loss": 0.1641, | |
| "mean_token_accuracy": 0.964219799041748, | |
| "num_tokens": 689397569.0, | |
| "step": 4850 | |
| }, | |
| { | |
| "entropy": 1.4028720164299011, | |
| "epoch": 0.15996865920146258, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 5.985294090617086e-06, | |
| "loss": 0.1608, | |
| "mean_token_accuracy": 0.9645184981822967, | |
| "num_tokens": 696209610.0, | |
| "step": 4900 | |
| }, | |
| { | |
| "entropy": 1.3975288462638855, | |
| "epoch": 0.16160099245862036, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 5.98448246124736e-06, | |
| "loss": 0.1551, | |
| "mean_token_accuracy": 0.9647789108753204, | |
| "num_tokens": 703107550.0, | |
| "step": 4950 | |
| }, | |
| { | |
| "entropy": 1.3928989100456237, | |
| "epoch": 0.16323332571577814, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 5.983649090232592e-06, | |
| "loss": 0.1592, | |
| "mean_token_accuracy": 0.9648973512649536, | |
| "num_tokens": 710156450.0, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.16323332571577814, | |
| "eval_entropy": 1.4044678370157877, | |
| "eval_loss": 0.1755974143743515, | |
| "eval_mean_token_accuracy": 0.961607707341512, | |
| "eval_num_tokens": 710156450.0, | |
| "eval_runtime": 749.1287, | |
| "eval_samples_per_second": 12.89, | |
| "eval_steps_per_second": 0.101, | |
| "step": 5000 | |
| }, | |
| { | |
| "entropy": 1.4239799737930299, | |
| "epoch": 0.16486565897293592, | |
| "grad_norm": 1.234375, | |
| "learning_rate": 5.982793983643805e-06, | |
| "loss": 0.1637, | |
| "mean_token_accuracy": 0.9640812170505524, | |
| "num_tokens": 717234054.0, | |
| "step": 5050 | |
| }, | |
| { | |
| "entropy": 1.4224350619316102, | |
| "epoch": 0.1664979922300937, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 5.98191714771036e-06, | |
| "loss": 0.1624, | |
| "mean_token_accuracy": 0.9640639245510101, | |
| "num_tokens": 724297107.0, | |
| "step": 5100 | |
| }, | |
| { | |
| "entropy": 1.4389384937286378, | |
| "epoch": 0.1681303254872515, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 5.981018588819916e-06, | |
| "loss": 0.1681, | |
| "mean_token_accuracy": 0.9633942902088165, | |
| "num_tokens": 731287758.0, | |
| "step": 5150 | |
| }, | |
| { | |
| "entropy": 1.427138111591339, | |
| "epoch": 0.16976265874440927, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 5.980098313518383e-06, | |
| "loss": 0.1669, | |
| "mean_token_accuracy": 0.9635147547721863, | |
| "num_tokens": 738353484.0, | |
| "step": 5200 | |
| }, | |
| { | |
| "entropy": 1.43255943775177, | |
| "epoch": 0.17139499200156705, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 5.97915632850987e-06, | |
| "loss": 0.1602, | |
| "mean_token_accuracy": 0.9645324110984802, | |
| "num_tokens": 745152314.0, | |
| "step": 5250 | |
| }, | |
| { | |
| "entropy": 1.4146737170219421, | |
| "epoch": 0.17302732525872483, | |
| "grad_norm": 1.375, | |
| "learning_rate": 5.97819264065664e-06, | |
| "loss": 0.1566, | |
| "mean_token_accuracy": 0.9652618932723999, | |
| "num_tokens": 752080428.0, | |
| "step": 5300 | |
| }, | |
| { | |
| "entropy": 1.41352454662323, | |
| "epoch": 0.1746596585158826, | |
| "grad_norm": 1.3515625, | |
| "learning_rate": 5.977207256979058e-06, | |
| "loss": 0.1472, | |
| "mean_token_accuracy": 0.9666665005683899, | |
| "num_tokens": 758654299.0, | |
| "step": 5350 | |
| }, | |
| { | |
| "entropy": 1.4163859128952025, | |
| "epoch": 0.1762919917730404, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 5.976200184655544e-06, | |
| "loss": 0.1646, | |
| "mean_token_accuracy": 0.9645817792415619, | |
| "num_tokens": 765762864.0, | |
| "step": 5400 | |
| }, | |
| { | |
| "entropy": 1.4168593525886535, | |
| "epoch": 0.17792432503019817, | |
| "grad_norm": 1.625, | |
| "learning_rate": 5.9751714310225135e-06, | |
| "loss": 0.1558, | |
| "mean_token_accuracy": 0.9653662145137787, | |
| "num_tokens": 772523965.0, | |
| "step": 5450 | |
| }, | |
| { | |
| "entropy": 1.4106249260902404, | |
| "epoch": 0.17955665828735595, | |
| "grad_norm": 1.46875, | |
| "learning_rate": 5.974121003574331e-06, | |
| "loss": 0.1605, | |
| "mean_token_accuracy": 0.9646015942096711, | |
| "num_tokens": 779873569.0, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.17955665828735595, | |
| "eval_entropy": 1.4159186140696207, | |
| "eval_loss": 0.17334794998168945, | |
| "eval_mean_token_accuracy": 0.961771670182546, | |
| "eval_num_tokens": 779873569.0, | |
| "eval_runtime": 752.9376, | |
| "eval_samples_per_second": 12.824, | |
| "eval_steps_per_second": 0.101, | |
| "step": 5500 | |
| }, | |
| { | |
| "entropy": 1.4060875940322877, | |
| "epoch": 0.18118899154451373, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 5.973048909963251e-06, | |
| "loss": 0.166, | |
| "mean_token_accuracy": 0.9633875727653504, | |
| "num_tokens": 787666740.0, | |
| "step": 5550 | |
| }, | |
| { | |
| "entropy": 1.4012414264678954, | |
| "epoch": 0.18282132480167151, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 5.971955157999365e-06, | |
| "loss": 0.1542, | |
| "mean_token_accuracy": 0.9654585599899292, | |
| "num_tokens": 794513657.0, | |
| "step": 5600 | |
| }, | |
| { | |
| "entropy": 1.3988840198516845, | |
| "epoch": 0.1844536580588293, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 5.970839755650541e-06, | |
| "loss": 0.1595, | |
| "mean_token_accuracy": 0.9642168319225312, | |
| "num_tokens": 801328321.0, | |
| "step": 5650 | |
| }, | |
| { | |
| "entropy": 1.4012188339233398, | |
| "epoch": 0.18608599131598708, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 5.969702711042371e-06, | |
| "loss": 0.167, | |
| "mean_token_accuracy": 0.9627443432807923, | |
| "num_tokens": 808895137.0, | |
| "step": 5700 | |
| }, | |
| { | |
| "entropy": 1.4030301642417908, | |
| "epoch": 0.18771832457314486, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 5.968544032458105e-06, | |
| "loss": 0.1518, | |
| "mean_token_accuracy": 0.9666016948223114, | |
| "num_tokens": 815564055.0, | |
| "step": 5750 | |
| }, | |
| { | |
| "entropy": 1.3949448704719543, | |
| "epoch": 0.18935065783030264, | |
| "grad_norm": 1.25, | |
| "learning_rate": 5.967363728338598e-06, | |
| "loss": 0.1542, | |
| "mean_token_accuracy": 0.9651316654682159, | |
| "num_tokens": 822772629.0, | |
| "step": 5800 | |
| }, | |
| { | |
| "entropy": 1.4053015112876892, | |
| "epoch": 0.19098299108746042, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 5.966161807282244e-06, | |
| "loss": 0.1475, | |
| "mean_token_accuracy": 0.9664002013206482, | |
| "num_tokens": 829474695.0, | |
| "step": 5850 | |
| }, | |
| { | |
| "entropy": 1.3961756944656372, | |
| "epoch": 0.1926153243446182, | |
| "grad_norm": 1.2734375, | |
| "learning_rate": 5.96493827804491e-06, | |
| "loss": 0.1636, | |
| "mean_token_accuracy": 0.9642155694961548, | |
| "num_tokens": 836807189.0, | |
| "step": 5900 | |
| }, | |
| { | |
| "entropy": 1.39143967628479, | |
| "epoch": 0.19424765760177598, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 5.963693149539883e-06, | |
| "loss": 0.1592, | |
| "mean_token_accuracy": 0.9642041945457458, | |
| "num_tokens": 843989373.0, | |
| "step": 5950 | |
| }, | |
| { | |
| "entropy": 1.395959141254425, | |
| "epoch": 0.19587999085893376, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 5.962426430837792e-06, | |
| "loss": 0.1613, | |
| "mean_token_accuracy": 0.964136803150177, | |
| "num_tokens": 851299727.0, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.19587999085893376, | |
| "eval_entropy": 1.4013945213953654, | |
| "eval_loss": 0.17208388447761536, | |
| "eval_mean_token_accuracy": 0.9621260579427083, | |
| "eval_num_tokens": 851299727.0, | |
| "eval_runtime": 753.3633, | |
| "eval_samples_per_second": 12.817, | |
| "eval_steps_per_second": 0.101, | |
| "step": 6000 | |
| }, | |
| { | |
| "entropy": 1.4012908124923706, | |
| "epoch": 0.19751232411609154, | |
| "grad_norm": 0.0113525390625, | |
| "learning_rate": 5.961138131166554e-06, | |
| "loss": 0.1554, | |
| "mean_token_accuracy": 0.9652227807044983, | |
| "num_tokens": 858064092.0, | |
| "step": 6050 | |
| }, | |
| { | |
| "entropy": 1.388651340007782, | |
| "epoch": 0.19914465737324932, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 5.959828259911295e-06, | |
| "loss": 0.1569, | |
| "mean_token_accuracy": 0.9651960909366608, | |
| "num_tokens": 865252118.0, | |
| "step": 6100 | |
| }, | |
| { | |
| "entropy": 1.4042778515815735, | |
| "epoch": 0.2007769906304071, | |
| "grad_norm": 1.375, | |
| "learning_rate": 5.958496826614294e-06, | |
| "loss": 0.1661, | |
| "mean_token_accuracy": 0.9626063787937165, | |
| "num_tokens": 872468561.0, | |
| "step": 6150 | |
| }, | |
| { | |
| "entropy": 1.401300666332245, | |
| "epoch": 0.20240932388756488, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 5.957143840974904e-06, | |
| "loss": 0.149, | |
| "mean_token_accuracy": 0.9666438174247741, | |
| "num_tokens": 879011998.0, | |
| "step": 6200 | |
| }, | |
| { | |
| "entropy": 1.4077980709075928, | |
| "epoch": 0.20404165714472267, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 5.955769312849484e-06, | |
| "loss": 0.1605, | |
| "mean_token_accuracy": 0.9650888216495513, | |
| "num_tokens": 886346540.0, | |
| "step": 6250 | |
| }, | |
| { | |
| "entropy": 1.394054229259491, | |
| "epoch": 0.20567399040188045, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 5.954373252251329e-06, | |
| "loss": 0.1537, | |
| "mean_token_accuracy": 0.9657756268978119, | |
| "num_tokens": 893393322.0, | |
| "step": 6300 | |
| }, | |
| { | |
| "entropy": 1.3978914856910705, | |
| "epoch": 0.20730632365903823, | |
| "grad_norm": 1.5, | |
| "learning_rate": 5.952955669350596e-06, | |
| "loss": 0.1515, | |
| "mean_token_accuracy": 0.9658310306072235, | |
| "num_tokens": 899920970.0, | |
| "step": 6350 | |
| }, | |
| { | |
| "entropy": 1.4074292516708373, | |
| "epoch": 0.208938656916196, | |
| "grad_norm": 1.875, | |
| "learning_rate": 5.95151657447423e-06, | |
| "loss": 0.1568, | |
| "mean_token_accuracy": 0.9654946303367615, | |
| "num_tokens": 906895264.0, | |
| "step": 6400 | |
| }, | |
| { | |
| "entropy": 1.4000064754486083, | |
| "epoch": 0.2105709901733538, | |
| "grad_norm": 1.390625, | |
| "learning_rate": 5.950055978105885e-06, | |
| "loss": 0.1495, | |
| "mean_token_accuracy": 0.9668013238906861, | |
| "num_tokens": 913671071.0, | |
| "step": 6450 | |
| }, | |
| { | |
| "entropy": 1.4307255530357361, | |
| "epoch": 0.21220332343051157, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 5.948573890885859e-06, | |
| "loss": 0.1663, | |
| "mean_token_accuracy": 0.9624395740032196, | |
| "num_tokens": 921099610.0, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.21220332343051157, | |
| "eval_entropy": 1.4208130852381389, | |
| "eval_loss": 0.16986840963363647, | |
| "eval_mean_token_accuracy": 0.962581082979838, | |
| "eval_num_tokens": 921099610.0, | |
| "eval_runtime": 746.9082, | |
| "eval_samples_per_second": 12.928, | |
| "eval_steps_per_second": 0.102, | |
| "step": 6500 | |
| }, | |
| { | |
| "entropy": 1.409722430706024, | |
| "epoch": 0.21383565668766935, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 5.947070323610999e-06, | |
| "loss": 0.1449, | |
| "mean_token_accuracy": 0.9680650508403779, | |
| "num_tokens": 928163378.0, | |
| "step": 6550 | |
| }, | |
| { | |
| "entropy": 1.4334656882286072, | |
| "epoch": 0.21546798994482713, | |
| "grad_norm": 1.3515625, | |
| "learning_rate": 5.945545287234639e-06, | |
| "loss": 0.1563, | |
| "mean_token_accuracy": 0.964540822505951, | |
| "num_tokens": 934586868.0, | |
| "step": 6600 | |
| }, | |
| { | |
| "entropy": 1.4435390138626099, | |
| "epoch": 0.2171003232019849, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 5.943998792866509e-06, | |
| "loss": 0.15, | |
| "mean_token_accuracy": 0.9669582867622375, | |
| "num_tokens": 941380079.0, | |
| "step": 6650 | |
| }, | |
| { | |
| "entropy": 1.4167057871818542, | |
| "epoch": 0.2187326564591427, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 5.942430851772662e-06, | |
| "loss": 0.1627, | |
| "mean_token_accuracy": 0.9633757710456848, | |
| "num_tokens": 949348630.0, | |
| "step": 6700 | |
| }, | |
| { | |
| "entropy": 1.4309338593482972, | |
| "epoch": 0.22036498971630047, | |
| "grad_norm": 0.89453125, | |
| "learning_rate": 5.9408414753753836e-06, | |
| "loss": 0.1546, | |
| "mean_token_accuracy": 0.9655015981197357, | |
| "num_tokens": 956312502.0, | |
| "step": 6750 | |
| }, | |
| { | |
| "entropy": 1.4343366432189941, | |
| "epoch": 0.22199732297345826, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 5.939230675253119e-06, | |
| "loss": 0.1489, | |
| "mean_token_accuracy": 0.96628955245018, | |
| "num_tokens": 963500996.0, | |
| "step": 6800 | |
| }, | |
| { | |
| "entropy": 1.4271987533569337, | |
| "epoch": 0.22362965623061604, | |
| "grad_norm": 1.6796875, | |
| "learning_rate": 5.9375984631403785e-06, | |
| "loss": 0.1616, | |
| "mean_token_accuracy": 0.9645246016979218, | |
| "num_tokens": 970915430.0, | |
| "step": 6850 | |
| }, | |
| { | |
| "entropy": 1.4355636262893676, | |
| "epoch": 0.22526198948777382, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 5.935944850927657e-06, | |
| "loss": 0.1533, | |
| "mean_token_accuracy": 0.9660027372837067, | |
| "num_tokens": 978036240.0, | |
| "step": 6900 | |
| }, | |
| { | |
| "entropy": 1.425994610786438, | |
| "epoch": 0.2268943227449316, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 5.934269850661349e-06, | |
| "loss": 0.152, | |
| "mean_token_accuracy": 0.9652906250953674, | |
| "num_tokens": 985047502.0, | |
| "step": 6950 | |
| }, | |
| { | |
| "entropy": 1.4223777842521668, | |
| "epoch": 0.22852665600208938, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 5.932573474543658e-06, | |
| "loss": 0.156, | |
| "mean_token_accuracy": 0.9658907651901245, | |
| "num_tokens": 992373619.0, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.22852665600208938, | |
| "eval_entropy": 1.4304504505793254, | |
| "eval_loss": 0.16900277137756348, | |
| "eval_mean_token_accuracy": 0.9627823217709859, | |
| "eval_num_tokens": 992373619.0, | |
| "eval_runtime": 749.871, | |
| "eval_samples_per_second": 12.877, | |
| "eval_steps_per_second": 0.101, | |
| "step": 7000 | |
| }, | |
| { | |
| "entropy": 1.4272488355636597, | |
| "epoch": 0.23015898925924716, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 5.930855734932506e-06, | |
| "loss": 0.1454, | |
| "mean_token_accuracy": 0.9678001618385315, | |
| "num_tokens": 999289508.0, | |
| "step": 7050 | |
| }, | |
| { | |
| "entropy": 1.4248918747901917, | |
| "epoch": 0.23179132251640494, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 5.92911664434145e-06, | |
| "loss": 0.1558, | |
| "mean_token_accuracy": 0.9652420032024384, | |
| "num_tokens": 1006528812.0, | |
| "step": 7100 | |
| }, | |
| { | |
| "entropy": 1.4263224506378174, | |
| "epoch": 0.23342365577356272, | |
| "grad_norm": 1.3359375, | |
| "learning_rate": 5.927356215439584e-06, | |
| "loss": 0.1494, | |
| "mean_token_accuracy": 0.9668344402313233, | |
| "num_tokens": 1013541923.0, | |
| "step": 7150 | |
| }, | |
| { | |
| "entropy": 1.4219363307952881, | |
| "epoch": 0.2350559890307205, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 5.92557446105145e-06, | |
| "loss": 0.1486, | |
| "mean_token_accuracy": 0.9674091839790344, | |
| "num_tokens": 1020511787.0, | |
| "step": 7200 | |
| }, | |
| { | |
| "entropy": 1.4293452215194702, | |
| "epoch": 0.23668832228787828, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 5.923771394156943e-06, | |
| "loss": 0.158, | |
| "mean_token_accuracy": 0.9649367642402649, | |
| "num_tokens": 1027894747.0, | |
| "step": 7250 | |
| }, | |
| { | |
| "entropy": 1.4306922936439515, | |
| "epoch": 0.23832065554503606, | |
| "grad_norm": 1.3515625, | |
| "learning_rate": 5.921947027891219e-06, | |
| "loss": 0.1528, | |
| "mean_token_accuracy": 0.9656593954563141, | |
| "num_tokens": 1035110900.0, | |
| "step": 7300 | |
| }, | |
| { | |
| "entropy": 1.4252450680732727, | |
| "epoch": 0.23995298880219385, | |
| "grad_norm": 0.9921875, | |
| "learning_rate": 5.9201013755445955e-06, | |
| "loss": 0.1535, | |
| "mean_token_accuracy": 0.966197533607483, | |
| "num_tokens": 1042230443.0, | |
| "step": 7350 | |
| }, | |
| { | |
| "entropy": 1.4307914185523987, | |
| "epoch": 0.24158532205935163, | |
| "grad_norm": 1.21875, | |
| "learning_rate": 5.91823445056246e-06, | |
| "loss": 0.147, | |
| "mean_token_accuracy": 0.9667865073680878, | |
| "num_tokens": 1049389935.0, | |
| "step": 7400 | |
| }, | |
| { | |
| "entropy": 1.4161819124221802, | |
| "epoch": 0.2432176553165094, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 5.916346266545167e-06, | |
| "loss": 0.1468, | |
| "mean_token_accuracy": 0.96632697224617, | |
| "num_tokens": 1056551888.0, | |
| "step": 7450 | |
| }, | |
| { | |
| "entropy": 1.4254303669929504, | |
| "epoch": 0.2448499885736672, | |
| "grad_norm": 1.4921875, | |
| "learning_rate": 5.914436837247941e-06, | |
| "loss": 0.1525, | |
| "mean_token_accuracy": 0.965957795381546, | |
| "num_tokens": 1063197615.0, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.2448499885736672, | |
| "eval_entropy": 1.4276245164871215, | |
| "eval_loss": 0.16787172853946686, | |
| "eval_mean_token_accuracy": 0.9633070985476176, | |
| "eval_num_tokens": 1063197615.0, | |
| "eval_runtime": 751.5202, | |
| "eval_samples_per_second": 12.849, | |
| "eval_steps_per_second": 0.101, | |
| "step": 7500 | |
| }, | |
| { | |
| "entropy": 1.4108345437049865, | |
| "epoch": 0.24648232183082497, | |
| "grad_norm": 1.3828125, | |
| "learning_rate": 5.912506176580776e-06, | |
| "loss": 0.147, | |
| "mean_token_accuracy": 0.9670144832134246, | |
| "num_tokens": 1069874223.0, | |
| "step": 7550 | |
| }, | |
| { | |
| "entropy": 1.4212487936019897, | |
| "epoch": 0.24811465508798275, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 5.910554298608335e-06, | |
| "loss": 0.1509, | |
| "mean_token_accuracy": 0.96580601811409, | |
| "num_tokens": 1076764997.0, | |
| "step": 7600 | |
| }, | |
| { | |
| "entropy": 1.4187248206138612, | |
| "epoch": 0.24974698834514053, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 5.908581217549845e-06, | |
| "loss": 0.1528, | |
| "mean_token_accuracy": 0.9664638650417328, | |
| "num_tokens": 1083894428.0, | |
| "step": 7650 | |
| }, | |
| { | |
| "entropy": 1.400410017967224, | |
| "epoch": 0.2513793216022983, | |
| "grad_norm": 0.94140625, | |
| "learning_rate": 5.906586947778998e-06, | |
| "loss": 0.1448, | |
| "mean_token_accuracy": 0.9671175360679627, | |
| "num_tokens": 1090645918.0, | |
| "step": 7700 | |
| }, | |
| { | |
| "entropy": 1.4053305006027221, | |
| "epoch": 0.2530116548594561, | |
| "grad_norm": 1.21875, | |
| "learning_rate": 5.9045715038238436e-06, | |
| "loss": 0.1509, | |
| "mean_token_accuracy": 0.9659091722965241, | |
| "num_tokens": 1097654372.0, | |
| "step": 7750 | |
| }, | |
| { | |
| "entropy": 1.3977803254127503, | |
| "epoch": 0.2546439881166139, | |
| "grad_norm": 1.25, | |
| "learning_rate": 5.902534900366681e-06, | |
| "loss": 0.1547, | |
| "mean_token_accuracy": 0.9661977970600129, | |
| "num_tokens": 1104996723.0, | |
| "step": 7800 | |
| }, | |
| { | |
| "entropy": 1.3928361082077025, | |
| "epoch": 0.25627632137377165, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 5.900477152243954e-06, | |
| "loss": 0.1467, | |
| "mean_token_accuracy": 0.9668631637096405, | |
| "num_tokens": 1111808272.0, | |
| "step": 7850 | |
| }, | |
| { | |
| "entropy": 1.3904444289207458, | |
| "epoch": 0.25790865463092943, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 5.8983982744461446e-06, | |
| "loss": 0.1523, | |
| "mean_token_accuracy": 0.9658901369571686, | |
| "num_tokens": 1118777348.0, | |
| "step": 7900 | |
| }, | |
| { | |
| "entropy": 1.4107188177108765, | |
| "epoch": 0.2595409878880872, | |
| "grad_norm": 1.25, | |
| "learning_rate": 5.896298282117662e-06, | |
| "loss": 0.1508, | |
| "mean_token_accuracy": 0.9659655904769897, | |
| "num_tokens": 1125395200.0, | |
| "step": 7950 | |
| }, | |
| { | |
| "entropy": 1.409360373020172, | |
| "epoch": 0.261173321145245, | |
| "grad_norm": 1.21875, | |
| "learning_rate": 5.894177190556733e-06, | |
| "loss": 0.1523, | |
| "mean_token_accuracy": 0.9658745980262756, | |
| "num_tokens": 1132194770.0, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.261173321145245, | |
| "eval_entropy": 1.3953218412399293, | |
| "eval_loss": 0.16749244928359985, | |
| "eval_mean_token_accuracy": 0.9630522100130717, | |
| "eval_num_tokens": 1132194770.0, | |
| "eval_runtime": 749.5556, | |
| "eval_samples_per_second": 12.882, | |
| "eval_steps_per_second": 0.101, | |
| "step": 8000 | |
| }, | |
| { | |
| "entropy": 1.395931794643402, | |
| "epoch": 0.2628056544024028, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 5.892035015215289e-06, | |
| "loss": 0.1475, | |
| "mean_token_accuracy": 0.967307710647583, | |
| "num_tokens": 1139223324.0, | |
| "step": 8050 | |
| }, | |
| { | |
| "entropy": 1.4096789264678955, | |
| "epoch": 0.26443798765956056, | |
| "grad_norm": 0.99609375, | |
| "learning_rate": 5.889871771698854e-06, | |
| "loss": 0.1512, | |
| "mean_token_accuracy": 0.9665188312530517, | |
| "num_tokens": 1146048693.0, | |
| "step": 8100 | |
| }, | |
| { | |
| "entropy": 1.3959095120429992, | |
| "epoch": 0.26607032091671834, | |
| "grad_norm": 0.98828125, | |
| "learning_rate": 5.887687475766435e-06, | |
| "loss": 0.1517, | |
| "mean_token_accuracy": 0.9670138394832611, | |
| "num_tokens": 1153416156.0, | |
| "step": 8150 | |
| }, | |
| { | |
| "entropy": 1.384766845703125, | |
| "epoch": 0.2677026541738761, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 5.8854821433303995e-06, | |
| "loss": 0.1478, | |
| "mean_token_accuracy": 0.9662396657466888, | |
| "num_tokens": 1160327310.0, | |
| "step": 8200 | |
| }, | |
| { | |
| "entropy": 1.3720842933654784, | |
| "epoch": 0.2693349874310339, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 5.883255790456365e-06, | |
| "loss": 0.1369, | |
| "mean_token_accuracy": 0.9690599977970124, | |
| "num_tokens": 1166793829.0, | |
| "step": 8250 | |
| }, | |
| { | |
| "entropy": 1.3875324892997742, | |
| "epoch": 0.2709673206881917, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 5.881008433363083e-06, | |
| "loss": 0.1484, | |
| "mean_token_accuracy": 0.9658425927162171, | |
| "num_tokens": 1173770645.0, | |
| "step": 8300 | |
| }, | |
| { | |
| "entropy": 1.382853055000305, | |
| "epoch": 0.27259965394534946, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 5.878740088422315e-06, | |
| "loss": 0.1633, | |
| "mean_token_accuracy": 0.9631330251693726, | |
| "num_tokens": 1181126599.0, | |
| "step": 8350 | |
| }, | |
| { | |
| "entropy": 1.3965485191345215, | |
| "epoch": 0.27423198720250724, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 5.87645077215872e-06, | |
| "loss": 0.1498, | |
| "mean_token_accuracy": 0.9665160596370697, | |
| "num_tokens": 1188172115.0, | |
| "step": 8400 | |
| }, | |
| { | |
| "entropy": 1.3885521602630615, | |
| "epoch": 0.275864320459665, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 5.874140501249728e-06, | |
| "loss": 0.1468, | |
| "mean_token_accuracy": 0.9669651210308075, | |
| "num_tokens": 1195102960.0, | |
| "step": 8450 | |
| }, | |
| { | |
| "entropy": 1.3970652842521667, | |
| "epoch": 0.2774966537168228, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 5.8718092925254235e-06, | |
| "loss": 0.1469, | |
| "mean_token_accuracy": 0.9666703069210052, | |
| "num_tokens": 1201982519.0, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.2774966537168228, | |
| "eval_entropy": 1.390671566327413, | |
| "eval_loss": 0.1665239781141281, | |
| "eval_mean_token_accuracy": 0.9631963141759237, | |
| "eval_num_tokens": 1201982519.0, | |
| "eval_runtime": 751.0843, | |
| "eval_samples_per_second": 12.856, | |
| "eval_steps_per_second": 0.101, | |
| "step": 8500 | |
| }, | |
| { | |
| "entropy": 1.3743389058113098, | |
| "epoch": 0.2791289869739806, | |
| "grad_norm": 1.46875, | |
| "learning_rate": 5.86945716296842e-06, | |
| "loss": 0.1413, | |
| "mean_token_accuracy": 0.9681530177593232, | |
| "num_tokens": 1208731712.0, | |
| "step": 8550 | |
| }, | |
| { | |
| "entropy": 1.3886315321922302, | |
| "epoch": 0.28076132023113837, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 5.867084129713738e-06, | |
| "loss": 0.1553, | |
| "mean_token_accuracy": 0.9659830582141876, | |
| "num_tokens": 1215816513.0, | |
| "step": 8600 | |
| }, | |
| { | |
| "entropy": 1.3884997010231017, | |
| "epoch": 0.28239365348829615, | |
| "grad_norm": 1.3515625, | |
| "learning_rate": 5.864690210048677e-06, | |
| "loss": 0.1499, | |
| "mean_token_accuracy": 0.9667926502227783, | |
| "num_tokens": 1222796740.0, | |
| "step": 8650 | |
| }, | |
| { | |
| "entropy": 1.3810113263130188, | |
| "epoch": 0.28402598674545393, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 5.862275421412695e-06, | |
| "loss": 0.1428, | |
| "mean_token_accuracy": 0.968780642747879, | |
| "num_tokens": 1229478573.0, | |
| "step": 8700 | |
| }, | |
| { | |
| "entropy": 1.3753751254081725, | |
| "epoch": 0.2856583200026117, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 5.859839781397276e-06, | |
| "loss": 0.1552, | |
| "mean_token_accuracy": 0.9648597013950347, | |
| "num_tokens": 1236888916.0, | |
| "step": 8750 | |
| }, | |
| { | |
| "entropy": 1.3797388172149658, | |
| "epoch": 0.2872906532597695, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 5.857383307745805e-06, | |
| "loss": 0.1555, | |
| "mean_token_accuracy": 0.9654771292209625, | |
| "num_tokens": 1243945893.0, | |
| "step": 8800 | |
| }, | |
| { | |
| "entropy": 1.37408056974411, | |
| "epoch": 0.28892298651692727, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 5.854906018353436e-06, | |
| "loss": 0.1531, | |
| "mean_token_accuracy": 0.9655951869487762, | |
| "num_tokens": 1250815278.0, | |
| "step": 8850 | |
| }, | |
| { | |
| "entropy": 1.3714010500907898, | |
| "epoch": 0.29055531977408505, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 5.852407931266967e-06, | |
| "loss": 0.1416, | |
| "mean_token_accuracy": 0.967999415397644, | |
| "num_tokens": 1257589618.0, | |
| "step": 8900 | |
| }, | |
| { | |
| "entropy": 1.370381121635437, | |
| "epoch": 0.29218765303124283, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 5.849889064684703e-06, | |
| "loss": 0.156, | |
| "mean_token_accuracy": 0.965356330871582, | |
| "num_tokens": 1264949457.0, | |
| "step": 8950 | |
| }, | |
| { | |
| "entropy": 1.3676560163497924, | |
| "epoch": 0.2938199862884006, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 5.847349436956325e-06, | |
| "loss": 0.1609, | |
| "mean_token_accuracy": 0.9641006827354431, | |
| "num_tokens": 1272234102.0, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.2938199862884006, | |
| "eval_entropy": 1.38709463596344, | |
| "eval_loss": 0.1650434136390686, | |
| "eval_mean_token_accuracy": 0.9634115918477376, | |
| "eval_num_tokens": 1272234102.0, | |
| "eval_runtime": 751.7405, | |
| "eval_samples_per_second": 12.845, | |
| "eval_steps_per_second": 0.101, | |
| "step": 9000 | |
| }, | |
| { | |
| "entropy": 1.3809342765808106, | |
| "epoch": 0.2954523195455584, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 5.844789066582758e-06, | |
| "loss": 0.1432, | |
| "mean_token_accuracy": 0.9673550212383271, | |
| "num_tokens": 1279098038.0, | |
| "step": 9050 | |
| }, | |
| { | |
| "entropy": 1.40355872631073, | |
| "epoch": 0.2970846528027162, | |
| "grad_norm": 1.4921875, | |
| "learning_rate": 5.842207972216034e-06, | |
| "loss": 0.1521, | |
| "mean_token_accuracy": 0.9661613535881043, | |
| "num_tokens": 1286173347.0, | |
| "step": 9100 | |
| }, | |
| { | |
| "entropy": 1.3974176049232483, | |
| "epoch": 0.29871698605987396, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 5.839606172659159e-06, | |
| "loss": 0.1521, | |
| "mean_token_accuracy": 0.9656483995914459, | |
| "num_tokens": 1293330750.0, | |
| "step": 9150 | |
| }, | |
| { | |
| "entropy": 1.3879291534423828, | |
| "epoch": 0.30034931931703174, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 5.8369836868659706e-06, | |
| "loss": 0.1553, | |
| "mean_token_accuracy": 0.9647518181800843, | |
| "num_tokens": 1300327068.0, | |
| "step": 9200 | |
| }, | |
| { | |
| "entropy": 1.3752172994613647, | |
| "epoch": 0.3019816525741896, | |
| "grad_norm": 1.125, | |
| "learning_rate": 5.8343405339410085e-06, | |
| "loss": 0.1383, | |
| "mean_token_accuracy": 0.9691135132312775, | |
| "num_tokens": 1307021605.0, | |
| "step": 9250 | |
| }, | |
| { | |
| "entropy": 1.3903837966918946, | |
| "epoch": 0.30361398583134735, | |
| "grad_norm": 1.4609375, | |
| "learning_rate": 5.831676733139364e-06, | |
| "loss": 0.1458, | |
| "mean_token_accuracy": 0.9677986741065979, | |
| "num_tokens": 1314238674.0, | |
| "step": 9300 | |
| }, | |
| { | |
| "entropy": 1.3943523359298706, | |
| "epoch": 0.30524631908850514, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 5.828992303866552e-06, | |
| "loss": 0.158, | |
| "mean_token_accuracy": 0.9646199941635132, | |
| "num_tokens": 1321715116.0, | |
| "step": 9350 | |
| }, | |
| { | |
| "entropy": 1.4028909087181092, | |
| "epoch": 0.3068786523456629, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 5.82628726567836e-06, | |
| "loss": 0.1615, | |
| "mean_token_accuracy": 0.9638377511501313, | |
| "num_tokens": 1328922385.0, | |
| "step": 9400 | |
| }, | |
| { | |
| "entropy": 1.405231008529663, | |
| "epoch": 0.3085109856028207, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 5.823561638280711e-06, | |
| "loss": 0.1621, | |
| "mean_token_accuracy": 0.9635949361324311, | |
| "num_tokens": 1336385571.0, | |
| "step": 9450 | |
| }, | |
| { | |
| "entropy": 1.370991678237915, | |
| "epoch": 0.3101433188599785, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 5.82081544152952e-06, | |
| "loss": 0.1515, | |
| "mean_token_accuracy": 0.9661216616630555, | |
| "num_tokens": 1343638497.0, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.3101433188599785, | |
| "eval_entropy": 1.3891894817352295, | |
| "eval_loss": 0.16506299376487732, | |
| "eval_mean_token_accuracy": 0.9635340309143067, | |
| "eval_num_tokens": 1343638497.0, | |
| "eval_runtime": 749.2359, | |
| "eval_samples_per_second": 12.888, | |
| "eval_steps_per_second": 0.101, | |
| "step": 9500 | |
| }, | |
| { | |
| "entropy": 1.3911400294303895, | |
| "epoch": 0.31177565211713626, | |
| "grad_norm": 0.8984375, | |
| "learning_rate": 5.818048695430541e-06, | |
| "loss": 0.1496, | |
| "mean_token_accuracy": 0.9663948690891266, | |
| "num_tokens": 1350638403.0, | |
| "step": 9550 | |
| }, | |
| { | |
| "entropy": 1.3863462066650392, | |
| "epoch": 0.31340798537429404, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 5.815261420139235e-06, | |
| "loss": 0.1495, | |
| "mean_token_accuracy": 0.9667435109615325, | |
| "num_tokens": 1357942983.0, | |
| "step": 9600 | |
| }, | |
| { | |
| "entropy": 1.3836952257156372, | |
| "epoch": 0.3150403186314518, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 5.812453635960613e-06, | |
| "loss": 0.136, | |
| "mean_token_accuracy": 0.9696350061893463, | |
| "num_tokens": 1364441123.0, | |
| "step": 9650 | |
| }, | |
| { | |
| "entropy": 1.386801562309265, | |
| "epoch": 0.3166726518886096, | |
| "grad_norm": 1.3828125, | |
| "learning_rate": 5.809625363349091e-06, | |
| "loss": 0.1537, | |
| "mean_token_accuracy": 0.9660963475704193, | |
| "num_tokens": 1371638128.0, | |
| "step": 9700 | |
| }, | |
| { | |
| "entropy": 1.4033276891708375, | |
| "epoch": 0.3183049851457674, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 5.806776622908341e-06, | |
| "loss": 0.1489, | |
| "mean_token_accuracy": 0.9672618007659912, | |
| "num_tokens": 1378797795.0, | |
| "step": 9750 | |
| }, | |
| { | |
| "entropy": 1.3945609354972839, | |
| "epoch": 0.31993731840292516, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 5.8039074353911425e-06, | |
| "loss": 0.1476, | |
| "mean_token_accuracy": 0.9665350615978241, | |
| "num_tokens": 1385958442.0, | |
| "step": 9800 | |
| }, | |
| { | |
| "entropy": 1.3897303318977356, | |
| "epoch": 0.32156965166008294, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 5.801017821699229e-06, | |
| "loss": 0.1492, | |
| "mean_token_accuracy": 0.9658246648311615, | |
| "num_tokens": 1392915332.0, | |
| "step": 9850 | |
| }, | |
| { | |
| "entropy": 1.397285952568054, | |
| "epoch": 0.3232019849172407, | |
| "grad_norm": 1.4296875, | |
| "learning_rate": 5.798107802883135e-06, | |
| "loss": 0.1538, | |
| "mean_token_accuracy": 0.9644203245639801, | |
| "num_tokens": 1399970378.0, | |
| "step": 9900 | |
| }, | |
| { | |
| "entropy": 1.396610188484192, | |
| "epoch": 0.3248343181743985, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 5.795177400142047e-06, | |
| "loss": 0.1399, | |
| "mean_token_accuracy": 0.9683949732780457, | |
| "num_tokens": 1406942412.0, | |
| "step": 9950 | |
| }, | |
| { | |
| "entropy": 1.3993594455718994, | |
| "epoch": 0.3264666514315563, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 5.792226634823645e-06, | |
| "loss": 0.166, | |
| "mean_token_accuracy": 0.9635122084617614, | |
| "num_tokens": 1414672963.0, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.3264666514315563, | |
| "eval_entropy": 1.3897196292877196, | |
| "eval_loss": 0.16665887832641602, | |
| "eval_mean_token_accuracy": 0.9631382012367249, | |
| "eval_num_tokens": 1414672963.0, | |
| "eval_runtime": 743.4472, | |
| "eval_samples_per_second": 12.988, | |
| "eval_steps_per_second": 0.102, | |
| "step": 10000 | |
| }, | |
| { | |
| "entropy": 1.3787381386756896, | |
| "epoch": 0.32809898468871407, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 5.789255528423945e-06, | |
| "loss": 0.1449, | |
| "mean_token_accuracy": 0.9675530314445495, | |
| "num_tokens": 1422037159.0, | |
| "step": 10050 | |
| }, | |
| { | |
| "entropy": 1.3835061955451966, | |
| "epoch": 0.32973131794587185, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 5.7862641025871535e-06, | |
| "loss": 0.1493, | |
| "mean_token_accuracy": 0.966714415550232, | |
| "num_tokens": 1428834412.0, | |
| "step": 10100 | |
| }, | |
| { | |
| "entropy": 1.3994423723220826, | |
| "epoch": 0.33136365120302963, | |
| "grad_norm": 1.453125, | |
| "learning_rate": 5.783252379105494e-06, | |
| "loss": 0.1478, | |
| "mean_token_accuracy": 0.9666897785663605, | |
| "num_tokens": 1435825992.0, | |
| "step": 10150 | |
| }, | |
| { | |
| "entropy": 1.4079461932182311, | |
| "epoch": 0.3329959844601874, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 5.780220379919062e-06, | |
| "loss": 0.1597, | |
| "mean_token_accuracy": 0.9649293422698975, | |
| "num_tokens": 1443057429.0, | |
| "step": 10200 | |
| }, | |
| { | |
| "entropy": 1.4087351322174073, | |
| "epoch": 0.3346283177173452, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 5.777168127115654e-06, | |
| "loss": 0.1495, | |
| "mean_token_accuracy": 0.9670829331874847, | |
| "num_tokens": 1450143525.0, | |
| "step": 10250 | |
| }, | |
| { | |
| "entropy": 1.4217532348632813, | |
| "epoch": 0.336260650974503, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 5.774095642930618e-06, | |
| "loss": 0.1538, | |
| "mean_token_accuracy": 0.9653639376163483, | |
| "num_tokens": 1456853199.0, | |
| "step": 10300 | |
| }, | |
| { | |
| "entropy": 1.4182784628868104, | |
| "epoch": 0.33789298423166075, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 5.771002949746681e-06, | |
| "loss": 0.1592, | |
| "mean_token_accuracy": 0.9639875698089599, | |
| "num_tokens": 1464260063.0, | |
| "step": 10350 | |
| }, | |
| { | |
| "entropy": 1.3954361629486085, | |
| "epoch": 0.33952531748881853, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 5.76789007009379e-06, | |
| "loss": 0.1444, | |
| "mean_token_accuracy": 0.9671654045581818, | |
| "num_tokens": 1471363389.0, | |
| "step": 10400 | |
| }, | |
| { | |
| "entropy": 1.3968884110450746, | |
| "epoch": 0.3411576507459763, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 5.7647570266489535e-06, | |
| "loss": 0.1325, | |
| "mean_token_accuracy": 0.9698529148101807, | |
| "num_tokens": 1478273376.0, | |
| "step": 10450 | |
| }, | |
| { | |
| "entropy": 1.397156729698181, | |
| "epoch": 0.3427899840031341, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 5.7616038422360674e-06, | |
| "loss": 0.1458, | |
| "mean_token_accuracy": 0.9674423885345459, | |
| "num_tokens": 1485177982.0, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.3427899840031341, | |
| "eval_entropy": 1.4138343874613444, | |
| "eval_loss": 0.1639399528503418, | |
| "eval_mean_token_accuracy": 0.9635584902763367, | |
| "eval_num_tokens": 1485177982.0, | |
| "eval_runtime": 748.9668, | |
| "eval_samples_per_second": 12.892, | |
| "eval_steps_per_second": 0.101, | |
| "step": 10500 | |
| }, | |
| { | |
| "entropy": 1.4251120805740356, | |
| "epoch": 0.3444223172602919, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 5.758430539825751e-06, | |
| "loss": 0.1423, | |
| "mean_token_accuracy": 0.968468290567398, | |
| "num_tokens": 1491740592.0, | |
| "step": 10550 | |
| }, | |
| { | |
| "entropy": 1.4330598759651183, | |
| "epoch": 0.34605465051744966, | |
| "grad_norm": 1.234375, | |
| "learning_rate": 5.755237142535185e-06, | |
| "loss": 0.1584, | |
| "mean_token_accuracy": 0.9640131962299346, | |
| "num_tokens": 1499230338.0, | |
| "step": 10600 | |
| }, | |
| { | |
| "entropy": 1.4190063452720643, | |
| "epoch": 0.34768698377460744, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 5.752023673627936e-06, | |
| "loss": 0.1549, | |
| "mean_token_accuracy": 0.9651542448997498, | |
| "num_tokens": 1506807165.0, | |
| "step": 10650 | |
| }, | |
| { | |
| "entropy": 1.398562343120575, | |
| "epoch": 0.3493193170317652, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 5.748790156513793e-06, | |
| "loss": 0.1429, | |
| "mean_token_accuracy": 0.9676708686351776, | |
| "num_tokens": 1513566074.0, | |
| "step": 10700 | |
| }, | |
| { | |
| "entropy": 1.4046012330055238, | |
| "epoch": 0.350951650288923, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 5.74553661474859e-06, | |
| "loss": 0.1475, | |
| "mean_token_accuracy": 0.9663785743713379, | |
| "num_tokens": 1520732578.0, | |
| "step": 10750 | |
| }, | |
| { | |
| "entropy": 1.4011975502967835, | |
| "epoch": 0.3525839835460808, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 5.742263072034044e-06, | |
| "loss": 0.133, | |
| "mean_token_accuracy": 0.9697633123397827, | |
| "num_tokens": 1527341695.0, | |
| "step": 10800 | |
| }, | |
| { | |
| "entropy": 1.4020631575584412, | |
| "epoch": 0.35421631680323856, | |
| "grad_norm": 1.375, | |
| "learning_rate": 5.738969552217573e-06, | |
| "loss": 0.1529, | |
| "mean_token_accuracy": 0.9653546392917634, | |
| "num_tokens": 1534441768.0, | |
| "step": 10850 | |
| }, | |
| { | |
| "entropy": 1.4063316154479981, | |
| "epoch": 0.35584865006039634, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 5.735656079292128e-06, | |
| "loss": 0.1541, | |
| "mean_token_accuracy": 0.9655212807655335, | |
| "num_tokens": 1542032259.0, | |
| "step": 10900 | |
| }, | |
| { | |
| "entropy": 1.3955928492546081, | |
| "epoch": 0.3574809833175541, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 5.732322677396013e-06, | |
| "loss": 0.1379, | |
| "mean_token_accuracy": 0.9682280993461609, | |
| "num_tokens": 1549145530.0, | |
| "step": 10950 | |
| }, | |
| { | |
| "entropy": 1.417706184387207, | |
| "epoch": 0.3591133165747119, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 5.728969370812716e-06, | |
| "loss": 0.1502, | |
| "mean_token_accuracy": 0.9673383378982544, | |
| "num_tokens": 1556005225.0, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.3591133165747119, | |
| "eval_entropy": 1.4156667073567708, | |
| "eval_loss": 0.1644313782453537, | |
| "eval_mean_token_accuracy": 0.9634674708048503, | |
| "eval_num_tokens": 1556005225.0, | |
| "eval_runtime": 749.0504, | |
| "eval_samples_per_second": 12.891, | |
| "eval_steps_per_second": 0.101, | |
| "step": 11000 | |
| }, | |
| { | |
| "entropy": 1.4165368866920471, | |
| "epoch": 0.3607456498318697, | |
| "grad_norm": 3.625, | |
| "learning_rate": 5.725596183970729e-06, | |
| "loss": 0.1491, | |
| "mean_token_accuracy": 0.9657526600360871, | |
| "num_tokens": 1563254781.0, | |
| "step": 11050 | |
| }, | |
| { | |
| "entropy": 1.4074536633491517, | |
| "epoch": 0.36237798308902747, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 5.722203141443365e-06, | |
| "loss": 0.1452, | |
| "mean_token_accuracy": 0.9683119165897369, | |
| "num_tokens": 1569838865.0, | |
| "step": 11100 | |
| }, | |
| { | |
| "entropy": 1.4159915471076965, | |
| "epoch": 0.36401031634618525, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 5.718790267948591e-06, | |
| "loss": 0.1505, | |
| "mean_token_accuracy": 0.9666063642501831, | |
| "num_tokens": 1576948931.0, | |
| "step": 11150 | |
| }, | |
| { | |
| "entropy": 1.4229880380630493, | |
| "epoch": 0.36564264960334303, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 5.715357588348832e-06, | |
| "loss": 0.1522, | |
| "mean_token_accuracy": 0.9668408262729645, | |
| "num_tokens": 1584164705.0, | |
| "step": 11200 | |
| }, | |
| { | |
| "entropy": 1.396066279411316, | |
| "epoch": 0.3672749828605008, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 5.711905127650807e-06, | |
| "loss": 0.1373, | |
| "mean_token_accuracy": 0.9693786513805389, | |
| "num_tokens": 1591058327.0, | |
| "step": 11250 | |
| }, | |
| { | |
| "entropy": 1.4097445344924926, | |
| "epoch": 0.3689073161176586, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 5.7084329110053294e-06, | |
| "loss": 0.1486, | |
| "mean_token_accuracy": 0.9671066462993622, | |
| "num_tokens": 1598091112.0, | |
| "step": 11300 | |
| }, | |
| { | |
| "entropy": 1.4326444959640503, | |
| "epoch": 0.37053964937481637, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 5.70494096370714e-06, | |
| "loss": 0.1495, | |
| "mean_token_accuracy": 0.9672650814056396, | |
| "num_tokens": 1605344616.0, | |
| "step": 11350 | |
| }, | |
| { | |
| "entropy": 1.4539379978179932, | |
| "epoch": 0.37217198263197415, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 5.701429311194713e-06, | |
| "loss": 0.1593, | |
| "mean_token_accuracy": 0.9647262859344482, | |
| "num_tokens": 1612587700.0, | |
| "step": 11400 | |
| }, | |
| { | |
| "entropy": 1.4599957299232482, | |
| "epoch": 0.37380431588913193, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 5.6978979790500695e-06, | |
| "loss": 0.1428, | |
| "mean_token_accuracy": 0.96798752784729, | |
| "num_tokens": 1619869251.0, | |
| "step": 11450 | |
| }, | |
| { | |
| "entropy": 1.456819851398468, | |
| "epoch": 0.3754366491462897, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 5.694346992998601e-06, | |
| "loss": 0.157, | |
| "mean_token_accuracy": 0.9647147190570832, | |
| "num_tokens": 1627103999.0, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.3754366491462897, | |
| "eval_entropy": 1.4559897645314535, | |
| "eval_loss": 0.16348470747470856, | |
| "eval_mean_token_accuracy": 0.9634300549825032, | |
| "eval_num_tokens": 1627103999.0, | |
| "eval_runtime": 750.9869, | |
| "eval_samples_per_second": 12.858, | |
| "eval_steps_per_second": 0.101, | |
| "step": 11500 | |
| }, | |
| { | |
| "entropy": 1.445942795276642, | |
| "epoch": 0.3770689824034475, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 5.690776378908871e-06, | |
| "loss": 0.1559, | |
| "mean_token_accuracy": 0.9648426783084869, | |
| "num_tokens": 1634135214.0, | |
| "step": 11550 | |
| }, | |
| { | |
| "entropy": 1.4337683749198913, | |
| "epoch": 0.3787013156606053, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 5.687186162792432e-06, | |
| "loss": 0.1392, | |
| "mean_token_accuracy": 0.9684048485755921, | |
| "num_tokens": 1641123881.0, | |
| "step": 11600 | |
| }, | |
| { | |
| "entropy": 1.4271648550033569, | |
| "epoch": 0.38033364891776306, | |
| "grad_norm": 1.53125, | |
| "learning_rate": 5.683576370803637e-06, | |
| "loss": 0.1442, | |
| "mean_token_accuracy": 0.9671970081329345, | |
| "num_tokens": 1648087593.0, | |
| "step": 11650 | |
| }, | |
| { | |
| "entropy": 1.4187105298042297, | |
| "epoch": 0.38196598217492084, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 5.679947029239446e-06, | |
| "loss": 0.1558, | |
| "mean_token_accuracy": 0.9652733910083771, | |
| "num_tokens": 1655675590.0, | |
| "step": 11700 | |
| }, | |
| { | |
| "entropy": 1.4136518168449401, | |
| "epoch": 0.3835983154320786, | |
| "grad_norm": 1.2421875, | |
| "learning_rate": 5.676298164539235e-06, | |
| "loss": 0.1344, | |
| "mean_token_accuracy": 0.9695696973800659, | |
| "num_tokens": 1662994220.0, | |
| "step": 11750 | |
| }, | |
| { | |
| "entropy": 1.4156992936134338, | |
| "epoch": 0.3852306486892364, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 5.672629803284603e-06, | |
| "loss": 0.1445, | |
| "mean_token_accuracy": 0.967056336402893, | |
| "num_tokens": 1670282239.0, | |
| "step": 11800 | |
| }, | |
| { | |
| "entropy": 1.381675865650177, | |
| "epoch": 0.3868629819463942, | |
| "grad_norm": 1.3046875, | |
| "learning_rate": 5.668941972199185e-06, | |
| "loss": 0.1318, | |
| "mean_token_accuracy": 0.9707480573654175, | |
| "num_tokens": 1677092297.0, | |
| "step": 11850 | |
| }, | |
| { | |
| "entropy": 1.413828341960907, | |
| "epoch": 0.38849531520355196, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 5.665234698148447e-06, | |
| "loss": 0.1398, | |
| "mean_token_accuracy": 0.9692868089675903, | |
| "num_tokens": 1683596527.0, | |
| "step": 11900 | |
| }, | |
| { | |
| "entropy": 1.4255395197868348, | |
| "epoch": 0.39012764846070974, | |
| "grad_norm": 1.25, | |
| "learning_rate": 5.661508008139494e-06, | |
| "loss": 0.1428, | |
| "mean_token_accuracy": 0.9684118723869324, | |
| "num_tokens": 1690356557.0, | |
| "step": 11950 | |
| }, | |
| { | |
| "entropy": 1.421637599468231, | |
| "epoch": 0.3917599817178675, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 5.657761929320876e-06, | |
| "loss": 0.1358, | |
| "mean_token_accuracy": 0.9686246883869171, | |
| "num_tokens": 1696780737.0, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.3917599817178675, | |
| "eval_entropy": 1.4270846033096314, | |
| "eval_loss": 0.16205987334251404, | |
| "eval_mean_token_accuracy": 0.9638035273551941, | |
| "eval_num_tokens": 1696780737.0, | |
| "eval_runtime": 744.4564, | |
| "eval_samples_per_second": 12.971, | |
| "eval_steps_per_second": 0.102, | |
| "step": 12000 | |
| }, | |
| { | |
| "entropy": 1.4344228434562682, | |
| "epoch": 0.3933923149750253, | |
| "grad_norm": 1.46875, | |
| "learning_rate": 5.65399648898239e-06, | |
| "loss": 0.1479, | |
| "mean_token_accuracy": 0.9664253509044647, | |
| "num_tokens": 1703868095.0, | |
| "step": 12050 | |
| }, | |
| { | |
| "entropy": 1.40860848903656, | |
| "epoch": 0.3950246482321831, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 5.650211714554876e-06, | |
| "loss": 0.1593, | |
| "mean_token_accuracy": 0.9646110832691193, | |
| "num_tokens": 1711471948.0, | |
| "step": 12100 | |
| }, | |
| { | |
| "entropy": 1.420129976272583, | |
| "epoch": 0.39665698148934087, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 5.6464076336100246e-06, | |
| "loss": 0.1455, | |
| "mean_token_accuracy": 0.9673341345787049, | |
| "num_tokens": 1718673490.0, | |
| "step": 12150 | |
| }, | |
| { | |
| "entropy": 1.4015104007720947, | |
| "epoch": 0.39828931474649865, | |
| "grad_norm": 1.390625, | |
| "learning_rate": 5.642584273860171e-06, | |
| "loss": 0.1518, | |
| "mean_token_accuracy": 0.966002242565155, | |
| "num_tokens": 1726275054.0, | |
| "step": 12200 | |
| }, | |
| { | |
| "entropy": 1.4261247539520263, | |
| "epoch": 0.3999216480036564, | |
| "grad_norm": 1.3828125, | |
| "learning_rate": 5.6387416631580936e-06, | |
| "loss": 0.1417, | |
| "mean_token_accuracy": 0.9675724005699158, | |
| "num_tokens": 1733190804.0, | |
| "step": 12250 | |
| }, | |
| { | |
| "entropy": 1.4047169375419617, | |
| "epoch": 0.4015539812608142, | |
| "grad_norm": 1.6796875, | |
| "learning_rate": 5.634879829496813e-06, | |
| "loss": 0.149, | |
| "mean_token_accuracy": 0.9663934874534607, | |
| "num_tokens": 1740409762.0, | |
| "step": 12300 | |
| }, | |
| { | |
| "entropy": 1.4087544870376587, | |
| "epoch": 0.403186314517972, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 5.630998801009386e-06, | |
| "loss": 0.1422, | |
| "mean_token_accuracy": 0.9677630662918091, | |
| "num_tokens": 1747425003.0, | |
| "step": 12350 | |
| }, | |
| { | |
| "entropy": 1.4205935049057006, | |
| "epoch": 0.40481864777512977, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 5.627098605968702e-06, | |
| "loss": 0.1518, | |
| "mean_token_accuracy": 0.9655173766613007, | |
| "num_tokens": 1754787565.0, | |
| "step": 12400 | |
| }, | |
| { | |
| "entropy": 1.387464382648468, | |
| "epoch": 0.40645098103228755, | |
| "grad_norm": 1.375, | |
| "learning_rate": 5.62317927278728e-06, | |
| "loss": 0.1372, | |
| "mean_token_accuracy": 0.968756947517395, | |
| "num_tokens": 1761849063.0, | |
| "step": 12450 | |
| }, | |
| { | |
| "entropy": 1.3994211435317994, | |
| "epoch": 0.40808331428944533, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 5.619240830017051e-06, | |
| "loss": 0.144, | |
| "mean_token_accuracy": 0.9678842532634735, | |
| "num_tokens": 1768859328.0, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.40808331428944533, | |
| "eval_entropy": 1.405422296524048, | |
| "eval_loss": 0.16121897101402283, | |
| "eval_mean_token_accuracy": 0.9639042560259501, | |
| "eval_num_tokens": 1768859328.0, | |
| "eval_runtime": 749.481, | |
| "eval_samples_per_second": 12.884, | |
| "eval_steps_per_second": 0.101, | |
| "step": 12500 | |
| }, | |
| { | |
| "entropy": 1.419663178920746, | |
| "epoch": 0.4097156475466031, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 5.615283306349166e-06, | |
| "loss": 0.1409, | |
| "mean_token_accuracy": 0.9682516789436341, | |
| "num_tokens": 1775694980.0, | |
| "step": 12550 | |
| }, | |
| { | |
| "entropy": 1.4085765600204467, | |
| "epoch": 0.4113479808037609, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 5.611306730613772e-06, | |
| "loss": 0.1326, | |
| "mean_token_accuracy": 0.9704544687271118, | |
| "num_tokens": 1782348927.0, | |
| "step": 12600 | |
| }, | |
| { | |
| "entropy": 1.4163624548912048, | |
| "epoch": 0.4129803140609187, | |
| "grad_norm": 2.0, | |
| "learning_rate": 5.607311131779812e-06, | |
| "loss": 0.1501, | |
| "mean_token_accuracy": 0.9661952567100525, | |
| "num_tokens": 1789610698.0, | |
| "step": 12650 | |
| }, | |
| { | |
| "entropy": 1.4280620789527894, | |
| "epoch": 0.41461264731807645, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 5.603296538954808e-06, | |
| "loss": 0.143, | |
| "mean_token_accuracy": 0.9683597815036774, | |
| "num_tokens": 1796316393.0, | |
| "step": 12700 | |
| }, | |
| { | |
| "entropy": 1.425929193496704, | |
| "epoch": 0.41624498057523424, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 5.599262981384652e-06, | |
| "loss": 0.1477, | |
| "mean_token_accuracy": 0.9663317048549652, | |
| "num_tokens": 1803325180.0, | |
| "step": 12750 | |
| }, | |
| { | |
| "entropy": 1.4301575493812562, | |
| "epoch": 0.417877313832392, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 5.595210488453392e-06, | |
| "loss": 0.1557, | |
| "mean_token_accuracy": 0.9648576879501343, | |
| "num_tokens": 1810717661.0, | |
| "step": 12800 | |
| }, | |
| { | |
| "entropy": 1.405632803440094, | |
| "epoch": 0.4195096470895498, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 5.591139089683021e-06, | |
| "loss": 0.1397, | |
| "mean_token_accuracy": 0.9687881779670715, | |
| "num_tokens": 1817611883.0, | |
| "step": 12850 | |
| }, | |
| { | |
| "entropy": 1.3980163097381593, | |
| "epoch": 0.4211419803467076, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 5.587048814733253e-06, | |
| "loss": 0.1319, | |
| "mean_token_accuracy": 0.9697498416900635, | |
| "num_tokens": 1824343282.0, | |
| "step": 12900 | |
| }, | |
| { | |
| "entropy": 1.3963736867904664, | |
| "epoch": 0.42277431360386536, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 5.582939693401319e-06, | |
| "loss": 0.1378, | |
| "mean_token_accuracy": 0.9693096280097961, | |
| "num_tokens": 1831509722.0, | |
| "step": 12950 | |
| }, | |
| { | |
| "entropy": 1.4174233818054198, | |
| "epoch": 0.42440664686102314, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 5.57881175562174e-06, | |
| "loss": 0.1565, | |
| "mean_token_accuracy": 0.9654737007617951, | |
| "num_tokens": 1838857002.0, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.42440664686102314, | |
| "eval_entropy": 1.4053240156173705, | |
| "eval_loss": 0.1606142520904541, | |
| "eval_mean_token_accuracy": 0.9640888079007467, | |
| "eval_num_tokens": 1838857002.0, | |
| "eval_runtime": 748.4765, | |
| "eval_samples_per_second": 12.901, | |
| "eval_steps_per_second": 0.102, | |
| "step": 13000 | |
| }, | |
| { | |
| "entropy": 1.4016055393218994, | |
| "epoch": 0.4260389801181809, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 5.574665031466116e-06, | |
| "loss": 0.1338, | |
| "mean_token_accuracy": 0.9696434116363526, | |
| "num_tokens": 1845457705.0, | |
| "step": 13050 | |
| }, | |
| { | |
| "entropy": 1.4049336218833923, | |
| "epoch": 0.4276713133753387, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 5.570499551142902e-06, | |
| "loss": 0.1466, | |
| "mean_token_accuracy": 0.9665655505657196, | |
| "num_tokens": 1852644618.0, | |
| "step": 13100 | |
| }, | |
| { | |
| "entropy": 1.4158594751358031, | |
| "epoch": 0.4293036466324965, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 5.566315344997188e-06, | |
| "loss": 0.1405, | |
| "mean_token_accuracy": 0.9682467067241669, | |
| "num_tokens": 1859532727.0, | |
| "step": 13150 | |
| }, | |
| { | |
| "entropy": 1.406804118156433, | |
| "epoch": 0.43093597988965426, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 5.562112443510483e-06, | |
| "loss": 0.1347, | |
| "mean_token_accuracy": 0.969345440864563, | |
| "num_tokens": 1866433523.0, | |
| "step": 13200 | |
| }, | |
| { | |
| "entropy": 1.414643156528473, | |
| "epoch": 0.43256831314681204, | |
| "grad_norm": 1.375, | |
| "learning_rate": 5.557890877300489e-06, | |
| "loss": 0.1455, | |
| "mean_token_accuracy": 0.9673810720443725, | |
| "num_tokens": 1873742139.0, | |
| "step": 13250 | |
| }, | |
| { | |
| "entropy": 1.4243081569671632, | |
| "epoch": 0.4342006464039698, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 5.553650677120876e-06, | |
| "loss": 0.14, | |
| "mean_token_accuracy": 0.9683500599861145, | |
| "num_tokens": 1881001857.0, | |
| "step": 13300 | |
| }, | |
| { | |
| "entropy": 1.4164328074455261, | |
| "epoch": 0.4358329796611276, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 5.549391873861064e-06, | |
| "loss": 0.1485, | |
| "mean_token_accuracy": 0.9669756269454957, | |
| "num_tokens": 1888442521.0, | |
| "step": 13350 | |
| }, | |
| { | |
| "entropy": 1.4102159285545348, | |
| "epoch": 0.4374653129182854, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 5.545114498545991e-06, | |
| "loss": 0.139, | |
| "mean_token_accuracy": 0.9687536859512329, | |
| "num_tokens": 1895774805.0, | |
| "step": 13400 | |
| }, | |
| { | |
| "entropy": 1.4052273893356324, | |
| "epoch": 0.43909764617544317, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 5.540818582335894e-06, | |
| "loss": 0.1442, | |
| "mean_token_accuracy": 0.9679582285881042, | |
| "num_tokens": 1902866432.0, | |
| "step": 13450 | |
| }, | |
| { | |
| "entropy": 1.4166702628135681, | |
| "epoch": 0.44072997943260095, | |
| "grad_norm": 1.2890625, | |
| "learning_rate": 5.536504156526077e-06, | |
| "loss": 0.1481, | |
| "mean_token_accuracy": 0.9664357197284699, | |
| "num_tokens": 1910343947.0, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.44072997943260095, | |
| "eval_entropy": 1.4095580498377482, | |
| "eval_loss": 0.1602867692708969, | |
| "eval_mean_token_accuracy": 0.9641507911682129, | |
| "eval_num_tokens": 1910343947.0, | |
| "eval_runtime": 748.7079, | |
| "eval_samples_per_second": 12.897, | |
| "eval_steps_per_second": 0.102, | |
| "step": 13500 | |
| }, | |
| { | |
| "entropy": 1.4171498656272887, | |
| "epoch": 0.44236231268975873, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 5.5321712525466815e-06, | |
| "loss": 0.1444, | |
| "mean_token_accuracy": 0.967768360376358, | |
| "num_tokens": 1917115548.0, | |
| "step": 13550 | |
| }, | |
| { | |
| "entropy": 1.4215400004386902, | |
| "epoch": 0.4439946459469165, | |
| "grad_norm": 1.21875, | |
| "learning_rate": 5.5278199019624665e-06, | |
| "loss": 0.1558, | |
| "mean_token_accuracy": 0.965044105052948, | |
| "num_tokens": 1924701669.0, | |
| "step": 13600 | |
| }, | |
| { | |
| "entropy": 1.4144377851486205, | |
| "epoch": 0.4456269792040743, | |
| "grad_norm": 1.390625, | |
| "learning_rate": 5.523450136472569e-06, | |
| "loss": 0.1476, | |
| "mean_token_accuracy": 0.9668766951560974, | |
| "num_tokens": 1932051913.0, | |
| "step": 13650 | |
| }, | |
| { | |
| "entropy": 1.4168913602828979, | |
| "epoch": 0.4472593124612321, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 5.519061987910276e-06, | |
| "loss": 0.1344, | |
| "mean_token_accuracy": 0.9691276931762696, | |
| "num_tokens": 1938674407.0, | |
| "step": 13700 | |
| }, | |
| { | |
| "entropy": 1.4047640895843505, | |
| "epoch": 0.44889164571838985, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 5.514655488242795e-06, | |
| "loss": 0.1503, | |
| "mean_token_accuracy": 0.9665549874305726, | |
| "num_tokens": 1945577505.0, | |
| "step": 13750 | |
| }, | |
| { | |
| "entropy": 1.4135752677917481, | |
| "epoch": 0.45052397897554763, | |
| "grad_norm": 2.125, | |
| "learning_rate": 5.510230669571018e-06, | |
| "loss": 0.1492, | |
| "mean_token_accuracy": 0.9664253723621369, | |
| "num_tokens": 1952659833.0, | |
| "step": 13800 | |
| }, | |
| { | |
| "entropy": 1.3945325517654419, | |
| "epoch": 0.4521563122327054, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 5.505787564129291e-06, | |
| "loss": 0.1376, | |
| "mean_token_accuracy": 0.968870245218277, | |
| "num_tokens": 1959360672.0, | |
| "step": 13850 | |
| }, | |
| { | |
| "entropy": 1.4091899871826172, | |
| "epoch": 0.4537886454898632, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 5.5013262042851764e-06, | |
| "loss": 0.1352, | |
| "mean_token_accuracy": 0.9697363257408143, | |
| "num_tokens": 1965923476.0, | |
| "step": 13900 | |
| }, | |
| { | |
| "entropy": 1.409596972465515, | |
| "epoch": 0.455420978747021, | |
| "grad_norm": 1.3359375, | |
| "learning_rate": 5.4968466225392165e-06, | |
| "loss": 0.147, | |
| "mean_token_accuracy": 0.966924901008606, | |
| "num_tokens": 1972776862.0, | |
| "step": 13950 | |
| }, | |
| { | |
| "entropy": 1.4293616366386415, | |
| "epoch": 0.45705331200417876, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 5.4923488515247e-06, | |
| "loss": 0.1417, | |
| "mean_token_accuracy": 0.9687818443775177, | |
| "num_tokens": 1979187620.0, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.45705331200417876, | |
| "eval_entropy": 1.4118124723434449, | |
| "eval_loss": 0.16126905381679535, | |
| "eval_mean_token_accuracy": 0.9638542596499126, | |
| "eval_num_tokens": 1979187620.0, | |
| "eval_runtime": 745.6037, | |
| "eval_samples_per_second": 12.951, | |
| "eval_steps_per_second": 0.102, | |
| "step": 14000 | |
| }, | |
| { | |
| "entropy": 1.415801317691803, | |
| "epoch": 0.45868564526133654, | |
| "grad_norm": 1.25, | |
| "learning_rate": 5.487832924007422e-06, | |
| "loss": 0.1483, | |
| "mean_token_accuracy": 0.9666338682174682, | |
| "num_tokens": 1986603476.0, | |
| "step": 14050 | |
| }, | |
| { | |
| "entropy": 1.4051340079307557, | |
| "epoch": 0.4603179785184943, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 5.4832988728854465e-06, | |
| "loss": 0.1557, | |
| "mean_token_accuracy": 0.964282066822052, | |
| "num_tokens": 1994204498.0, | |
| "step": 14100 | |
| }, | |
| { | |
| "entropy": 1.415133674144745, | |
| "epoch": 0.4619503117756521, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 5.478746731188865e-06, | |
| "loss": 0.1397, | |
| "mean_token_accuracy": 0.9680150365829467, | |
| "num_tokens": 2000941510.0, | |
| "step": 14150 | |
| }, | |
| { | |
| "entropy": 1.4023677587509156, | |
| "epoch": 0.4635826450328099, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 5.474176532079557e-06, | |
| "loss": 0.139, | |
| "mean_token_accuracy": 0.9682874810695649, | |
| "num_tokens": 2007806068.0, | |
| "step": 14200 | |
| }, | |
| { | |
| "entropy": 1.4310323596000671, | |
| "epoch": 0.46521497828996766, | |
| "grad_norm": 1.390625, | |
| "learning_rate": 5.46958830885095e-06, | |
| "loss": 0.1513, | |
| "mean_token_accuracy": 0.9669715762138367, | |
| "num_tokens": 2014807742.0, | |
| "step": 14250 | |
| }, | |
| { | |
| "entropy": 1.4305420732498169, | |
| "epoch": 0.46684731154712544, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 5.464982094927772e-06, | |
| "loss": 0.1486, | |
| "mean_token_accuracy": 0.966749495267868, | |
| "num_tokens": 2021683076.0, | |
| "step": 14300 | |
| }, | |
| { | |
| "entropy": 1.4090232229232789, | |
| "epoch": 0.4684796448042832, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 5.460357923865814e-06, | |
| "loss": 0.1379, | |
| "mean_token_accuracy": 0.9685410165786743, | |
| "num_tokens": 2028415183.0, | |
| "step": 14350 | |
| }, | |
| { | |
| "entropy": 1.4059256029129028, | |
| "epoch": 0.470111978061441, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 5.4557158293516845e-06, | |
| "loss": 0.1368, | |
| "mean_token_accuracy": 0.9691716039180756, | |
| "num_tokens": 2035435433.0, | |
| "step": 14400 | |
| }, | |
| { | |
| "entropy": 1.415133411884308, | |
| "epoch": 0.4717443113185988, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 5.451055845202559e-06, | |
| "loss": 0.1558, | |
| "mean_token_accuracy": 0.9652414166927338, | |
| "num_tokens": 2042639097.0, | |
| "step": 14450 | |
| }, | |
| { | |
| "entropy": 1.4125458192825318, | |
| "epoch": 0.47337664457575657, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 5.446378005365937e-06, | |
| "loss": 0.1465, | |
| "mean_token_accuracy": 0.9667794346809387, | |
| "num_tokens": 2049839883.0, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.47337664457575657, | |
| "eval_entropy": 1.4099786535898844, | |
| "eval_loss": 0.16086123883724213, | |
| "eval_mean_token_accuracy": 0.9639045866330465, | |
| "eval_num_tokens": 2049839883.0, | |
| "eval_runtime": 751.2639, | |
| "eval_samples_per_second": 12.853, | |
| "eval_steps_per_second": 0.101, | |
| "step": 14500 | |
| }, | |
| { | |
| "entropy": 1.4155534076690675, | |
| "epoch": 0.47500897783291435, | |
| "grad_norm": 0.97265625, | |
| "learning_rate": 5.441682343919398e-06, | |
| "loss": 0.1515, | |
| "mean_token_accuracy": 0.9656119549274444, | |
| "num_tokens": 2057493706.0, | |
| "step": 14550 | |
| }, | |
| { | |
| "entropy": 1.4232526874542237, | |
| "epoch": 0.47664131109007213, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 5.436968895070349e-06, | |
| "loss": 0.1463, | |
| "mean_token_accuracy": 0.9669792699813843, | |
| "num_tokens": 2064599903.0, | |
| "step": 14600 | |
| }, | |
| { | |
| "entropy": 1.426460826396942, | |
| "epoch": 0.4782736443472299, | |
| "grad_norm": 1.34375, | |
| "learning_rate": 5.432237693155773e-06, | |
| "loss": 0.1551, | |
| "mean_token_accuracy": 0.9656824862957001, | |
| "num_tokens": 2072278266.0, | |
| "step": 14650 | |
| }, | |
| { | |
| "entropy": 1.447964243888855, | |
| "epoch": 0.4799059776043877, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 5.427488772641989e-06, | |
| "loss": 0.1505, | |
| "mean_token_accuracy": 0.9661068272590637, | |
| "num_tokens": 2079638913.0, | |
| "step": 14700 | |
| }, | |
| { | |
| "entropy": 1.443111469745636, | |
| "epoch": 0.48153831086154547, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 5.422722168124386e-06, | |
| "loss": 0.1452, | |
| "mean_token_accuracy": 0.9680136227607727, | |
| "num_tokens": 2086742924.0, | |
| "step": 14750 | |
| }, | |
| { | |
| "entropy": 1.436655399799347, | |
| "epoch": 0.48317064411870325, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 5.417937914327187e-06, | |
| "loss": 0.1466, | |
| "mean_token_accuracy": 0.9671316587924957, | |
| "num_tokens": 2094123207.0, | |
| "step": 14800 | |
| }, | |
| { | |
| "entropy": 1.4242712426185609, | |
| "epoch": 0.48480297737586103, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 5.413136046103181e-06, | |
| "loss": 0.1471, | |
| "mean_token_accuracy": 0.9668898284435272, | |
| "num_tokens": 2101338532.0, | |
| "step": 14850 | |
| }, | |
| { | |
| "entropy": 1.4219823241233827, | |
| "epoch": 0.4864353106330188, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 5.408316598433483e-06, | |
| "loss": 0.1387, | |
| "mean_token_accuracy": 0.9691325139999389, | |
| "num_tokens": 2107835134.0, | |
| "step": 14900 | |
| }, | |
| { | |
| "entropy": 1.4221902179718018, | |
| "epoch": 0.4880676438901766, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 5.403479606427267e-06, | |
| "loss": 0.1344, | |
| "mean_token_accuracy": 0.9696690511703491, | |
| "num_tokens": 2114891465.0, | |
| "step": 14950 | |
| }, | |
| { | |
| "entropy": 1.401020920276642, | |
| "epoch": 0.4896999771473344, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 5.398625105321518e-06, | |
| "loss": 0.143, | |
| "mean_token_accuracy": 0.9679032492637635, | |
| "num_tokens": 2121999447.0, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.4896999771473344, | |
| "eval_entropy": 1.409274689356486, | |
| "eval_loss": 0.1606949120759964, | |
| "eval_mean_token_accuracy": 0.963873782157898, | |
| "eval_num_tokens": 2121999447.0, | |
| "eval_runtime": 749.6338, | |
| "eval_samples_per_second": 12.881, | |
| "eval_steps_per_second": 0.101, | |
| "step": 15000 | |
| }, | |
| { | |
| "entropy": 1.4075271391868591, | |
| "epoch": 0.49133231040449216, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 5.393753130480773e-06, | |
| "loss": 0.1422, | |
| "mean_token_accuracy": 0.967915655374527, | |
| "num_tokens": 2129364587.0, | |
| "step": 15050 | |
| }, | |
| { | |
| "entropy": 1.39142733335495, | |
| "epoch": 0.49296464366164994, | |
| "grad_norm": 1.5, | |
| "learning_rate": 5.388863717396865e-06, | |
| "loss": 0.1378, | |
| "mean_token_accuracy": 0.9694935536384582, | |
| "num_tokens": 2135978375.0, | |
| "step": 15100 | |
| }, | |
| { | |
| "entropy": 1.4059196853637694, | |
| "epoch": 0.4945969769188077, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 5.383956901688659e-06, | |
| "loss": 0.1495, | |
| "mean_token_accuracy": 0.9660707736015319, | |
| "num_tokens": 2143467430.0, | |
| "step": 15150 | |
| }, | |
| { | |
| "entropy": 1.3810991740226746, | |
| "epoch": 0.4962293101759655, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 5.3790327191017976e-06, | |
| "loss": 0.1421, | |
| "mean_token_accuracy": 0.9680761241912842, | |
| "num_tokens": 2150524540.0, | |
| "step": 15200 | |
| }, | |
| { | |
| "entropy": 1.3994912171363831, | |
| "epoch": 0.4978616434331233, | |
| "grad_norm": 1.3046875, | |
| "learning_rate": 5.374091205508442e-06, | |
| "loss": 0.1508, | |
| "mean_token_accuracy": 0.9663672626018525, | |
| "num_tokens": 2157785212.0, | |
| "step": 15250 | |
| }, | |
| { | |
| "entropy": 1.405203297138214, | |
| "epoch": 0.49949397669028106, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 5.369132396907005e-06, | |
| "loss": 0.1478, | |
| "mean_token_accuracy": 0.9665706479549407, | |
| "num_tokens": 2164998488.0, | |
| "step": 15300 | |
| }, | |
| { | |
| "entropy": 1.4063081932067871, | |
| "epoch": 0.5011263099474389, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 5.364156329421892e-06, | |
| "loss": 0.1285, | |
| "mean_token_accuracy": 0.9716930568218232, | |
| "num_tokens": 2171612673.0, | |
| "step": 15350 | |
| }, | |
| { | |
| "entropy": 1.4068853855133057, | |
| "epoch": 0.5027586432045966, | |
| "grad_norm": 1.34375, | |
| "learning_rate": 5.359163039303241e-06, | |
| "loss": 0.1465, | |
| "mean_token_accuracy": 0.9674324905872345, | |
| "num_tokens": 2179025674.0, | |
| "step": 15400 | |
| }, | |
| { | |
| "entropy": 1.4135111689567565, | |
| "epoch": 0.5043909764617545, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 5.354152562926649e-06, | |
| "loss": 0.1471, | |
| "mean_token_accuracy": 0.9673193073272706, | |
| "num_tokens": 2186052673.0, | |
| "step": 15450 | |
| }, | |
| { | |
| "entropy": 1.4252868342399596, | |
| "epoch": 0.5060233097189122, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 5.349124936792918e-06, | |
| "loss": 0.1441, | |
| "mean_token_accuracy": 0.9673807907104492, | |
| "num_tokens": 2193326163.0, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.5060233097189122, | |
| "eval_entropy": 1.409689162572225, | |
| "eval_loss": 0.16070087254047394, | |
| "eval_mean_token_accuracy": 0.9643241794904073, | |
| "eval_num_tokens": 2193326163.0, | |
| "eval_runtime": 746.9801, | |
| "eval_samples_per_second": 12.927, | |
| "eval_steps_per_second": 0.102, | |
| "step": 15500 | |
| }, | |
| { | |
| "entropy": 1.4205117511749268, | |
| "epoch": 0.50765564297607, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 5.344080197527782e-06, | |
| "loss": 0.1492, | |
| "mean_token_accuracy": 0.9669366598129272, | |
| "num_tokens": 2200727296.0, | |
| "step": 15550 | |
| }, | |
| { | |
| "entropy": 1.418469593524933, | |
| "epoch": 0.5092879762332277, | |
| "grad_norm": 1.3046875, | |
| "learning_rate": 5.339018381881644e-06, | |
| "loss": 0.1485, | |
| "mean_token_accuracy": 0.9666687226295472, | |
| "num_tokens": 2207977018.0, | |
| "step": 15600 | |
| }, | |
| { | |
| "entropy": 1.4072162437438964, | |
| "epoch": 0.5109203094903856, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 5.333939526729307e-06, | |
| "loss": 0.1393, | |
| "mean_token_accuracy": 0.968210985660553, | |
| "num_tokens": 2214966932.0, | |
| "step": 15650 | |
| }, | |
| { | |
| "entropy": 1.4076303386688231, | |
| "epoch": 0.5125526427475433, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 5.3288436690697e-06, | |
| "loss": 0.1411, | |
| "mean_token_accuracy": 0.9681597316265106, | |
| "num_tokens": 2222114305.0, | |
| "step": 15700 | |
| }, | |
| { | |
| "entropy": 1.4115266394615174, | |
| "epoch": 0.5141849760047011, | |
| "grad_norm": 1.453125, | |
| "learning_rate": 5.323730846025621e-06, | |
| "loss": 0.1436, | |
| "mean_token_accuracy": 0.9676663541793823, | |
| "num_tokens": 2229368804.0, | |
| "step": 15750 | |
| }, | |
| { | |
| "entropy": 1.3984081506729127, | |
| "epoch": 0.5158173092618589, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 5.3186010948434535e-06, | |
| "loss": 0.1277, | |
| "mean_token_accuracy": 0.9710724341869355, | |
| "num_tokens": 2235855134.0, | |
| "step": 15800 | |
| }, | |
| { | |
| "entropy": 1.4158777141571044, | |
| "epoch": 0.5174496425190167, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 5.313454452892903e-06, | |
| "loss": 0.1412, | |
| "mean_token_accuracy": 0.9679831528663635, | |
| "num_tokens": 2242658034.0, | |
| "step": 15850 | |
| }, | |
| { | |
| "entropy": 1.4000438117980958, | |
| "epoch": 0.5190819757761744, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 5.3082909576667206e-06, | |
| "loss": 0.1432, | |
| "mean_token_accuracy": 0.9681457090377807, | |
| "num_tokens": 2249786162.0, | |
| "step": 15900 | |
| }, | |
| { | |
| "entropy": 1.4153186726570128, | |
| "epoch": 0.5207143090333323, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 5.303110646780435e-06, | |
| "loss": 0.1311, | |
| "mean_token_accuracy": 0.9702495551109314, | |
| "num_tokens": 2256415467.0, | |
| "step": 15950 | |
| }, | |
| { | |
| "entropy": 1.4182570719718932, | |
| "epoch": 0.52234664229049, | |
| "grad_norm": 1.4609375, | |
| "learning_rate": 5.297913557972074e-06, | |
| "loss": 0.1444, | |
| "mean_token_accuracy": 0.9671289598941804, | |
| "num_tokens": 2263118935.0, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.52234664229049, | |
| "eval_entropy": 1.4267909447352092, | |
| "eval_loss": 0.16035768389701843, | |
| "eval_mean_token_accuracy": 0.9642941602071127, | |
| "eval_num_tokens": 2263118935.0, | |
| "eval_runtime": 746.9288, | |
| "eval_samples_per_second": 12.928, | |
| "eval_steps_per_second": 0.102, | |
| "step": 16000 | |
| }, | |
| { | |
| "entropy": 1.4244341516494752, | |
| "epoch": 0.5239789755476478, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 5.292699729101888e-06, | |
| "loss": 0.1341, | |
| "mean_token_accuracy": 0.9685980105400085, | |
| "num_tokens": 2270004148.0, | |
| "step": 16050 | |
| }, | |
| { | |
| "entropy": 1.433496663570404, | |
| "epoch": 0.5256113088048056, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 5.2874691981520814e-06, | |
| "loss": 0.1489, | |
| "mean_token_accuracy": 0.9664645993709564, | |
| "num_tokens": 2277001614.0, | |
| "step": 16100 | |
| }, | |
| { | |
| "entropy": 1.4589428210258484, | |
| "epoch": 0.5272436420619634, | |
| "grad_norm": 1.390625, | |
| "learning_rate": 5.282222003226528e-06, | |
| "loss": 0.1494, | |
| "mean_token_accuracy": 0.9667097711563111, | |
| "num_tokens": 2283969486.0, | |
| "step": 16150 | |
| }, | |
| { | |
| "entropy": 1.4311462545394897, | |
| "epoch": 0.5288759753191211, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 5.276958182550499e-06, | |
| "loss": 0.1498, | |
| "mean_token_accuracy": 0.966657601594925, | |
| "num_tokens": 2291187492.0, | |
| "step": 16200 | |
| }, | |
| { | |
| "entropy": 1.4505498099327088, | |
| "epoch": 0.530508308576279, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 5.271677774470383e-06, | |
| "loss": 0.1432, | |
| "mean_token_accuracy": 0.9675734841823578, | |
| "num_tokens": 2298489089.0, | |
| "step": 16250 | |
| }, | |
| { | |
| "entropy": 1.4610102701187133, | |
| "epoch": 0.5321406418334367, | |
| "grad_norm": 1.53125, | |
| "learning_rate": 5.2663808174534035e-06, | |
| "loss": 0.1528, | |
| "mean_token_accuracy": 0.9655195689201355, | |
| "num_tokens": 2306159999.0, | |
| "step": 16300 | |
| }, | |
| { | |
| "entropy": 1.4418502616882325, | |
| "epoch": 0.5337729750905945, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 5.261067350087342e-06, | |
| "loss": 0.1448, | |
| "mean_token_accuracy": 0.9679492330551147, | |
| "num_tokens": 2313017319.0, | |
| "step": 16350 | |
| }, | |
| { | |
| "entropy": 1.4529797434806824, | |
| "epoch": 0.5354053083477522, | |
| "grad_norm": 2.0, | |
| "learning_rate": 5.255737411080258e-06, | |
| "loss": 0.1421, | |
| "mean_token_accuracy": 0.9680870652198792, | |
| "num_tokens": 2319830047.0, | |
| "step": 16400 | |
| }, | |
| { | |
| "entropy": 1.4405932116508484, | |
| "epoch": 0.5370376416049101, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 5.250391039260203e-06, | |
| "loss": 0.1341, | |
| "mean_token_accuracy": 0.9692844843864441, | |
| "num_tokens": 2326652556.0, | |
| "step": 16450 | |
| }, | |
| { | |
| "entropy": 1.4391892647743225, | |
| "epoch": 0.5386699748620678, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 5.245028273574943e-06, | |
| "loss": 0.1455, | |
| "mean_token_accuracy": 0.9678148710727692, | |
| "num_tokens": 2333819381.0, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.5386699748620678, | |
| "eval_entropy": 1.4535431814193727, | |
| "eval_loss": 0.15979354083538055, | |
| "eval_mean_token_accuracy": 0.9644315036137899, | |
| "eval_num_tokens": 2333819381.0, | |
| "eval_runtime": 744.9612, | |
| "eval_samples_per_second": 12.962, | |
| "eval_steps_per_second": 0.102, | |
| "step": 16500 | |
| }, | |
| { | |
| "entropy": 1.4589093685150147, | |
| "epoch": 0.5403023081192256, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 5.239649153091669e-06, | |
| "loss": 0.1366, | |
| "mean_token_accuracy": 0.9689966702461242, | |
| "num_tokens": 2340621485.0, | |
| "step": 16550 | |
| }, | |
| { | |
| "entropy": 1.4447486186027527, | |
| "epoch": 0.5419346413763834, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 5.234253716996714e-06, | |
| "loss": 0.1407, | |
| "mean_token_accuracy": 0.9680201160907745, | |
| "num_tokens": 2347557447.0, | |
| "step": 16600 | |
| }, | |
| { | |
| "entropy": 1.4582811617851257, | |
| "epoch": 0.5435669746335412, | |
| "grad_norm": 1.2734375, | |
| "learning_rate": 5.228842004595271e-06, | |
| "loss": 0.1416, | |
| "mean_token_accuracy": 0.9680859756469726, | |
| "num_tokens": 2354654075.0, | |
| "step": 16650 | |
| }, | |
| { | |
| "entropy": 1.4596582174301147, | |
| "epoch": 0.5451993078906989, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 5.223414055311104e-06, | |
| "loss": 0.1456, | |
| "mean_token_accuracy": 0.9669416832923889, | |
| "num_tokens": 2361479872.0, | |
| "step": 16700 | |
| }, | |
| { | |
| "entropy": 1.455612142086029, | |
| "epoch": 0.5468316411478568, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 5.217969908686259e-06, | |
| "loss": 0.1494, | |
| "mean_token_accuracy": 0.9662820196151733, | |
| "num_tokens": 2368784116.0, | |
| "step": 16750 | |
| }, | |
| { | |
| "entropy": 1.437941801548004, | |
| "epoch": 0.5484639744050145, | |
| "grad_norm": 1.390625, | |
| "learning_rate": 5.2125096043807805e-06, | |
| "loss": 0.1351, | |
| "mean_token_accuracy": 0.969438636302948, | |
| "num_tokens": 2375699232.0, | |
| "step": 16800 | |
| }, | |
| { | |
| "entropy": 1.4474022889137268, | |
| "epoch": 0.5500963076621723, | |
| "grad_norm": 1.5390625, | |
| "learning_rate": 5.2070331821724175e-06, | |
| "loss": 0.1496, | |
| "mean_token_accuracy": 0.9666418421268463, | |
| "num_tokens": 2382571188.0, | |
| "step": 16850 | |
| }, | |
| { | |
| "entropy": 1.443734383583069, | |
| "epoch": 0.55172864091933, | |
| "grad_norm": 1.359375, | |
| "learning_rate": 5.201540681956339e-06, | |
| "loss": 0.1417, | |
| "mean_token_accuracy": 0.9685157811641694, | |
| "num_tokens": 2389499925.0, | |
| "step": 16900 | |
| }, | |
| { | |
| "entropy": 1.4289966011047364, | |
| "epoch": 0.5533609741764879, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 5.196032143744837e-06, | |
| "loss": 0.1502, | |
| "mean_token_accuracy": 0.9662255728244782, | |
| "num_tokens": 2396898569.0, | |
| "step": 16950 | |
| }, | |
| { | |
| "entropy": 1.422741391658783, | |
| "epoch": 0.5549933074336456, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 5.190507607667043e-06, | |
| "loss": 0.1362, | |
| "mean_token_accuracy": 0.9694574820995331, | |
| "num_tokens": 2403493744.0, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.5549933074336456, | |
| "eval_entropy": 1.410568381945292, | |
| "eval_loss": 0.1593320667743683, | |
| "eval_mean_token_accuracy": 0.9647167531649271, | |
| "eval_num_tokens": 2403493744.0, | |
| "eval_runtime": 749.6267, | |
| "eval_samples_per_second": 12.881, | |
| "eval_steps_per_second": 0.101, | |
| "step": 17000 | |
| }, | |
| { | |
| "entropy": 1.4179514050483704, | |
| "epoch": 0.5566256406908034, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 5.184967113968628e-06, | |
| "loss": 0.1437, | |
| "mean_token_accuracy": 0.9673176133632659, | |
| "num_tokens": 2410291888.0, | |
| "step": 17050 | |
| }, | |
| { | |
| "entropy": 1.420968849658966, | |
| "epoch": 0.5582579739479612, | |
| "grad_norm": 1.875, | |
| "learning_rate": 5.179410703011514e-06, | |
| "loss": 0.1416, | |
| "mean_token_accuracy": 0.9681920135021209, | |
| "num_tokens": 2417511664.0, | |
| "step": 17100 | |
| }, | |
| { | |
| "entropy": 1.4380729961395264, | |
| "epoch": 0.559890307205119, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 5.173838415273578e-06, | |
| "loss": 0.1381, | |
| "mean_token_accuracy": 0.9696194708347321, | |
| "num_tokens": 2424287780.0, | |
| "step": 17150 | |
| }, | |
| { | |
| "entropy": 1.418275089263916, | |
| "epoch": 0.5615226404622767, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 5.168250291348358e-06, | |
| "loss": 0.1307, | |
| "mean_token_accuracy": 0.9706324160099029, | |
| "num_tokens": 2431053313.0, | |
| "step": 17200 | |
| }, | |
| { | |
| "entropy": 1.4160829043388368, | |
| "epoch": 0.5631549737194346, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 5.162646371944757e-06, | |
| "loss": 0.139, | |
| "mean_token_accuracy": 0.9683985018730163, | |
| "num_tokens": 2438106375.0, | |
| "step": 17250 | |
| }, | |
| { | |
| "entropy": 1.400640585422516, | |
| "epoch": 0.5647873069765923, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 5.157026697886745e-06, | |
| "loss": 0.1367, | |
| "mean_token_accuracy": 0.9683994352817535, | |
| "num_tokens": 2445056771.0, | |
| "step": 17300 | |
| }, | |
| { | |
| "entropy": 1.4130174493789673, | |
| "epoch": 0.5664196402337501, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 5.151391310113067e-06, | |
| "loss": 0.1459, | |
| "mean_token_accuracy": 0.9672868931293488, | |
| "num_tokens": 2452756923.0, | |
| "step": 17350 | |
| }, | |
| { | |
| "entropy": 1.4374343585968017, | |
| "epoch": 0.5680519734909079, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 5.145740249676937e-06, | |
| "loss": 0.1405, | |
| "mean_token_accuracy": 0.9680230569839477, | |
| "num_tokens": 2459581470.0, | |
| "step": 17400 | |
| }, | |
| { | |
| "entropy": 1.4422858119010926, | |
| "epoch": 0.5696843067480657, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 5.140073557745743e-06, | |
| "loss": 0.1365, | |
| "mean_token_accuracy": 0.9697402846813202, | |
| "num_tokens": 2466422672.0, | |
| "step": 17450 | |
| }, | |
| { | |
| "entropy": 1.430338339805603, | |
| "epoch": 0.5713166400052234, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 5.134391275600748e-06, | |
| "loss": 0.1361, | |
| "mean_token_accuracy": 0.9687436437606811, | |
| "num_tokens": 2473543628.0, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.5713166400052234, | |
| "eval_entropy": 1.4239939387639364, | |
| "eval_loss": 0.15963123738765717, | |
| "eval_mean_token_accuracy": 0.9646872194608053, | |
| "eval_num_tokens": 2473543628.0, | |
| "eval_runtime": 746.9116, | |
| "eval_samples_per_second": 12.928, | |
| "eval_steps_per_second": 0.102, | |
| "step": 17500 | |
| }, | |
| { | |
| "entropy": 1.41823988199234, | |
| "epoch": 0.5729489732623813, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 5.12869344463679e-06, | |
| "loss": 0.1349, | |
| "mean_token_accuracy": 0.969758038520813, | |
| "num_tokens": 2480227038.0, | |
| "step": 17550 | |
| }, | |
| { | |
| "entropy": 1.423335657119751, | |
| "epoch": 0.574581306519539, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 5.122980106361973e-06, | |
| "loss": 0.1427, | |
| "mean_token_accuracy": 0.9678147983551025, | |
| "num_tokens": 2487475967.0, | |
| "step": 17600 | |
| }, | |
| { | |
| "entropy": 1.4069186902046205, | |
| "epoch": 0.5762136397766968, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 5.117251302397376e-06, | |
| "loss": 0.138, | |
| "mean_token_accuracy": 0.9696054446697235, | |
| "num_tokens": 2494547025.0, | |
| "step": 17650 | |
| }, | |
| { | |
| "entropy": 1.4386369252204896, | |
| "epoch": 0.5778459730338545, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 5.111507074476741e-06, | |
| "loss": 0.1677, | |
| "mean_token_accuracy": 0.9627214801311493, | |
| "num_tokens": 2502024527.0, | |
| "step": 17700 | |
| }, | |
| { | |
| "entropy": 1.4271745777130127, | |
| "epoch": 0.5794783062910124, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 5.105747464446171e-06, | |
| "loss": 0.1462, | |
| "mean_token_accuracy": 0.9673130023479461, | |
| "num_tokens": 2509136097.0, | |
| "step": 17750 | |
| }, | |
| { | |
| "entropy": 1.4218875932693482, | |
| "epoch": 0.5811106395481701, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 5.099972514263828e-06, | |
| "loss": 0.1425, | |
| "mean_token_accuracy": 0.9676541352272033, | |
| "num_tokens": 2516010870.0, | |
| "step": 17800 | |
| }, | |
| { | |
| "entropy": 1.4141360521316528, | |
| "epoch": 0.5827429728053279, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 5.094182265999625e-06, | |
| "loss": 0.144, | |
| "mean_token_accuracy": 0.9672036898136139, | |
| "num_tokens": 2523293871.0, | |
| "step": 17850 | |
| }, | |
| { | |
| "entropy": 1.4077139925956725, | |
| "epoch": 0.5843753060624857, | |
| "grad_norm": 1.375, | |
| "learning_rate": 5.0883767618349205e-06, | |
| "loss": 0.1419, | |
| "mean_token_accuracy": 0.9684661495685577, | |
| "num_tokens": 2530147747.0, | |
| "step": 17900 | |
| }, | |
| { | |
| "entropy": 1.407555968761444, | |
| "epoch": 0.5860076393196435, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 5.082556044062209e-06, | |
| "loss": 0.1277, | |
| "mean_token_accuracy": 0.9704045712947845, | |
| "num_tokens": 2536714168.0, | |
| "step": 17950 | |
| }, | |
| { | |
| "entropy": 1.4110585117340089, | |
| "epoch": 0.5876399725768012, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 5.0767201550848155e-06, | |
| "loss": 0.1402, | |
| "mean_token_accuracy": 0.9680717265605927, | |
| "num_tokens": 2543453566.0, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.5876399725768012, | |
| "eval_entropy": 1.3970834159851073, | |
| "eval_loss": 0.15939612686634064, | |
| "eval_mean_token_accuracy": 0.9646043960253398, | |
| "eval_num_tokens": 2543453566.0, | |
| "eval_runtime": 745.3104, | |
| "eval_samples_per_second": 12.956, | |
| "eval_steps_per_second": 0.102, | |
| "step": 18000 | |
| }, | |
| { | |
| "entropy": 1.405302128791809, | |
| "epoch": 0.5892723058339591, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 5.070869137416586e-06, | |
| "loss": 0.1392, | |
| "mean_token_accuracy": 0.9686900818347931, | |
| "num_tokens": 2550316340.0, | |
| "step": 18050 | |
| }, | |
| { | |
| "entropy": 1.409914915561676, | |
| "epoch": 0.5909046390911168, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 5.065003033681577e-06, | |
| "loss": 0.1464, | |
| "mean_token_accuracy": 0.9667869770526886, | |
| "num_tokens": 2557419151.0, | |
| "step": 18100 | |
| }, | |
| { | |
| "entropy": 1.4183743119239807, | |
| "epoch": 0.5925369723482746, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 5.059121886613746e-06, | |
| "loss": 0.1463, | |
| "mean_token_accuracy": 0.9668680250644683, | |
| "num_tokens": 2564861132.0, | |
| "step": 18150 | |
| }, | |
| { | |
| "entropy": 1.4198586773872375, | |
| "epoch": 0.5941693056054324, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 5.053225739056638e-06, | |
| "loss": 0.1395, | |
| "mean_token_accuracy": 0.9689326965808869, | |
| "num_tokens": 2571587295.0, | |
| "step": 18200 | |
| }, | |
| { | |
| "entropy": 1.4410333514213562, | |
| "epoch": 0.5958016388625902, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 5.047314633963077e-06, | |
| "loss": 0.142, | |
| "mean_token_accuracy": 0.9676812827587128, | |
| "num_tokens": 2578976320.0, | |
| "step": 18250 | |
| }, | |
| { | |
| "entropy": 1.4181599235534668, | |
| "epoch": 0.5974339721197479, | |
| "grad_norm": 1.46875, | |
| "learning_rate": 5.04138861439485e-06, | |
| "loss": 0.1388, | |
| "mean_token_accuracy": 0.968032683134079, | |
| "num_tokens": 2585791943.0, | |
| "step": 18300 | |
| }, | |
| { | |
| "entropy": 1.4222650122642517, | |
| "epoch": 0.5990663053769058, | |
| "grad_norm": 2.0, | |
| "learning_rate": 5.0354477235223945e-06, | |
| "loss": 0.1479, | |
| "mean_token_accuracy": 0.9662945425510406, | |
| "num_tokens": 2592969827.0, | |
| "step": 18350 | |
| }, | |
| { | |
| "entropy": 1.4107950401306153, | |
| "epoch": 0.6006986386340635, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 5.029492004624484e-06, | |
| "loss": 0.1495, | |
| "mean_token_accuracy": 0.9669906544685364, | |
| "num_tokens": 2600453000.0, | |
| "step": 18400 | |
| }, | |
| { | |
| "entropy": 1.4090588116645812, | |
| "epoch": 0.6023309718912213, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 5.023521501087913e-06, | |
| "loss": 0.142, | |
| "mean_token_accuracy": 0.9677951109409332, | |
| "num_tokens": 2607819604.0, | |
| "step": 18450 | |
| }, | |
| { | |
| "entropy": 1.4200125312805176, | |
| "epoch": 0.6039633051483791, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 5.017536256407179e-06, | |
| "loss": 0.1471, | |
| "mean_token_accuracy": 0.9666409981250763, | |
| "num_tokens": 2615067666.0, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.6039633051483791, | |
| "eval_entropy": 1.4210956970850626, | |
| "eval_loss": 0.15948741137981415, | |
| "eval_mean_token_accuracy": 0.9648204270998637, | |
| "eval_num_tokens": 2615067666.0, | |
| "eval_runtime": 748.5938, | |
| "eval_samples_per_second": 12.899, | |
| "eval_steps_per_second": 0.102, | |
| "step": 18500 | |
| }, | |
| { | |
| "entropy": 1.4314108753204347, | |
| "epoch": 0.6055956384055369, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 5.011536314184171e-06, | |
| "loss": 0.1353, | |
| "mean_token_accuracy": 0.9691914403438568, | |
| "num_tokens": 2621774246.0, | |
| "step": 18550 | |
| }, | |
| { | |
| "entropy": 1.438691704273224, | |
| "epoch": 0.6072279716626947, | |
| "grad_norm": 1.53125, | |
| "learning_rate": 5.0055217181278435e-06, | |
| "loss": 0.152, | |
| "mean_token_accuracy": 0.9660729610919953, | |
| "num_tokens": 2628968865.0, | |
| "step": 18600 | |
| }, | |
| { | |
| "entropy": 1.4327943706512452, | |
| "epoch": 0.6088603049198524, | |
| "grad_norm": 0.9453125, | |
| "learning_rate": 4.999492512053904e-06, | |
| "loss": 0.1429, | |
| "mean_token_accuracy": 0.9679563343524933, | |
| "num_tokens": 2636298361.0, | |
| "step": 18650 | |
| }, | |
| { | |
| "entropy": 1.419541118144989, | |
| "epoch": 0.6104926381770103, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 4.993448739884496e-06, | |
| "loss": 0.1365, | |
| "mean_token_accuracy": 0.9690783095359802, | |
| "num_tokens": 2643149114.0, | |
| "step": 18700 | |
| }, | |
| { | |
| "entropy": 1.433673801422119, | |
| "epoch": 0.612124971434168, | |
| "grad_norm": 1.3203125, | |
| "learning_rate": 4.98739044564787e-06, | |
| "loss": 0.1387, | |
| "mean_token_accuracy": 0.9684871184825897, | |
| "num_tokens": 2649909312.0, | |
| "step": 18750 | |
| }, | |
| { | |
| "entropy": 1.41679913520813, | |
| "epoch": 0.6137573046913258, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 4.9813176734780714e-06, | |
| "loss": 0.1351, | |
| "mean_token_accuracy": 0.9691712772846222, | |
| "num_tokens": 2656772858.0, | |
| "step": 18800 | |
| }, | |
| { | |
| "entropy": 1.4235585713386536, | |
| "epoch": 0.6153896379484836, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 4.975230467614616e-06, | |
| "loss": 0.1413, | |
| "mean_token_accuracy": 0.9680274319648743, | |
| "num_tokens": 2664404838.0, | |
| "step": 18850 | |
| }, | |
| { | |
| "entropy": 1.452963993549347, | |
| "epoch": 0.6170219712056414, | |
| "grad_norm": 1.4765625, | |
| "learning_rate": 4.969128872402166e-06, | |
| "loss": 0.1479, | |
| "mean_token_accuracy": 0.9667314755916595, | |
| "num_tokens": 2671712943.0, | |
| "step": 18900 | |
| }, | |
| { | |
| "entropy": 1.4436225032806396, | |
| "epoch": 0.6186543044627991, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 4.96301293229021e-06, | |
| "loss": 0.1499, | |
| "mean_token_accuracy": 0.9664954626560212, | |
| "num_tokens": 2678603623.0, | |
| "step": 18950 | |
| }, | |
| { | |
| "entropy": 1.4565304446220397, | |
| "epoch": 0.620286637719957, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 4.9568826918327375e-06, | |
| "loss": 0.1481, | |
| "mean_token_accuracy": 0.9671436643600464, | |
| "num_tokens": 2685887315.0, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.620286637719957, | |
| "eval_entropy": 1.4394246594111124, | |
| "eval_loss": 0.15968339145183563, | |
| "eval_mean_token_accuracy": 0.9645526838302613, | |
| "eval_num_tokens": 2685887315.0, | |
| "eval_runtime": 743.8859, | |
| "eval_samples_per_second": 12.98, | |
| "eval_steps_per_second": 0.102, | |
| "step": 19000 | |
| }, | |
| { | |
| "entropy": 1.4509250116348267, | |
| "epoch": 0.6219189709771147, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 4.950738195687914e-06, | |
| "loss": 0.1478, | |
| "mean_token_accuracy": 0.9675949096679688, | |
| "num_tokens": 2692939859.0, | |
| "step": 19050 | |
| }, | |
| { | |
| "entropy": 1.4424961733818054, | |
| "epoch": 0.6235513042342725, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 4.944579488617754e-06, | |
| "loss": 0.1342, | |
| "mean_token_accuracy": 0.9699901640415192, | |
| "num_tokens": 2699651700.0, | |
| "step": 19100 | |
| }, | |
| { | |
| "entropy": 1.43824138879776, | |
| "epoch": 0.6251836374914302, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 4.938406615487804e-06, | |
| "loss": 0.1498, | |
| "mean_token_accuracy": 0.9661287260055542, | |
| "num_tokens": 2707480098.0, | |
| "step": 19150 | |
| }, | |
| { | |
| "entropy": 1.4361098647117614, | |
| "epoch": 0.6268159707485881, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 4.9322196212668e-06, | |
| "loss": 0.1563, | |
| "mean_token_accuracy": 0.9649386501312256, | |
| "num_tokens": 2715153146.0, | |
| "step": 19200 | |
| }, | |
| { | |
| "entropy": 1.4300830745697022, | |
| "epoch": 0.6284483040057458, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 4.9260185510263546e-06, | |
| "loss": 0.1363, | |
| "mean_token_accuracy": 0.9690042114257813, | |
| "num_tokens": 2722322826.0, | |
| "step": 19250 | |
| }, | |
| { | |
| "entropy": 1.423244218826294, | |
| "epoch": 0.6300806372629036, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 4.919803449940621e-06, | |
| "loss": 0.1253, | |
| "mean_token_accuracy": 0.9720934724807739, | |
| "num_tokens": 2728961876.0, | |
| "step": 19300 | |
| }, | |
| { | |
| "entropy": 1.4357132482528687, | |
| "epoch": 0.6317129705200614, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 4.913574363285965e-06, | |
| "loss": 0.1451, | |
| "mean_token_accuracy": 0.967208684682846, | |
| "num_tokens": 2735922821.0, | |
| "step": 19350 | |
| }, | |
| { | |
| "entropy": 1.4567182993888854, | |
| "epoch": 0.6333453037772192, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 4.907331336440637e-06, | |
| "loss": 0.1533, | |
| "mean_token_accuracy": 0.9658544027805328, | |
| "num_tokens": 2743184778.0, | |
| "step": 19400 | |
| }, | |
| { | |
| "entropy": 1.4356631135940552, | |
| "epoch": 0.6349776370343769, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 4.9010744148844414e-06, | |
| "loss": 0.1392, | |
| "mean_token_accuracy": 0.9687076592445374, | |
| "num_tokens": 2750183248.0, | |
| "step": 19450 | |
| }, | |
| { | |
| "entropy": 1.4388001561164856, | |
| "epoch": 0.6366099702915348, | |
| "grad_norm": 1.4296875, | |
| "learning_rate": 4.8948036441984e-06, | |
| "loss": 0.1408, | |
| "mean_token_accuracy": 0.9690019035339356, | |
| "num_tokens": 2757183194.0, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.6366099702915348, | |
| "eval_entropy": 1.4425183471043905, | |
| "eval_loss": 0.15926624834537506, | |
| "eval_mean_token_accuracy": 0.9642885835965475, | |
| "eval_num_tokens": 2757183194.0, | |
| "eval_runtime": 747.7266, | |
| "eval_samples_per_second": 12.914, | |
| "eval_steps_per_second": 0.102, | |
| "step": 19500 | |
| }, | |
| { | |
| "entropy": 1.4483740854263305, | |
| "epoch": 0.6382423035486925, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 4.888519070064427e-06, | |
| "loss": 0.1424, | |
| "mean_token_accuracy": 0.9682291400432587, | |
| "num_tokens": 2764192867.0, | |
| "step": 19550 | |
| }, | |
| { | |
| "entropy": 1.4348323988914489, | |
| "epoch": 0.6398746368058503, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 4.882220738264994e-06, | |
| "loss": 0.1485, | |
| "mean_token_accuracy": 0.9667525351047516, | |
| "num_tokens": 2771378704.0, | |
| "step": 19600 | |
| }, | |
| { | |
| "entropy": 1.431165406703949, | |
| "epoch": 0.641506970063008, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 4.875908694682793e-06, | |
| "loss": 0.1387, | |
| "mean_token_accuracy": 0.9686442255973816, | |
| "num_tokens": 2778817459.0, | |
| "step": 19650 | |
| }, | |
| { | |
| "entropy": 1.4262248253822327, | |
| "epoch": 0.6431393033201659, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 4.869582985300409e-06, | |
| "loss": 0.1443, | |
| "mean_token_accuracy": 0.9666674077510834, | |
| "num_tokens": 2785809690.0, | |
| "step": 19700 | |
| }, | |
| { | |
| "entropy": 1.4198060846328735, | |
| "epoch": 0.6447716365773236, | |
| "grad_norm": 1.34375, | |
| "learning_rate": 4.8632436561999754e-06, | |
| "loss": 0.141, | |
| "mean_token_accuracy": 0.9685878479480743, | |
| "num_tokens": 2792972553.0, | |
| "step": 19750 | |
| }, | |
| { | |
| "entropy": 1.42301029920578, | |
| "epoch": 0.6464039698344815, | |
| "grad_norm": 1.234375, | |
| "learning_rate": 4.85689075356285e-06, | |
| "loss": 0.1323, | |
| "mean_token_accuracy": 0.9700925529003144, | |
| "num_tokens": 2799755524.0, | |
| "step": 19800 | |
| }, | |
| { | |
| "entropy": 1.407839720249176, | |
| "epoch": 0.6480363030916392, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 4.850524323669266e-06, | |
| "loss": 0.144, | |
| "mean_token_accuracy": 0.9683354413509369, | |
| "num_tokens": 2806934638.0, | |
| "step": 19850 | |
| }, | |
| { | |
| "entropy": 1.4161179232597352, | |
| "epoch": 0.649668636348797, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 4.844144412898006e-06, | |
| "loss": 0.1468, | |
| "mean_token_accuracy": 0.9669712007045745, | |
| "num_tokens": 2814052906.0, | |
| "step": 19900 | |
| }, | |
| { | |
| "entropy": 1.4124717712402344, | |
| "epoch": 0.6513009696059547, | |
| "grad_norm": 1.5390625, | |
| "learning_rate": 4.83775106772606e-06, | |
| "loss": 0.1368, | |
| "mean_token_accuracy": 0.9689941215515137, | |
| "num_tokens": 2820334438.0, | |
| "step": 19950 | |
| }, | |
| { | |
| "entropy": 1.4184478282928468, | |
| "epoch": 0.6529333028631126, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 4.8313443347282805e-06, | |
| "loss": 0.1377, | |
| "mean_token_accuracy": 0.9692059123516082, | |
| "num_tokens": 2826961099.0, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.6529333028631126, | |
| "eval_entropy": 1.4259551413853964, | |
| "eval_loss": 0.15875579416751862, | |
| "eval_mean_token_accuracy": 0.9646251447995504, | |
| "eval_num_tokens": 2826961099.0, | |
| "eval_runtime": 744.3635, | |
| "eval_samples_per_second": 12.972, | |
| "eval_steps_per_second": 0.102, | |
| "step": 20000 | |
| }, | |
| { | |
| "entropy": 1.4160712957382202, | |
| "epoch": 0.6545656361202703, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 4.824924260577056e-06, | |
| "loss": 0.1377, | |
| "mean_token_accuracy": 0.9691171681880951, | |
| "num_tokens": 2834147273.0, | |
| "step": 20050 | |
| }, | |
| { | |
| "entropy": 1.4152118015289306, | |
| "epoch": 0.6561979693774281, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 4.818490892041959e-06, | |
| "loss": 0.1347, | |
| "mean_token_accuracy": 0.9702432310581207, | |
| "num_tokens": 2840408479.0, | |
| "step": 20100 | |
| }, | |
| { | |
| "entropy": 1.4018067622184753, | |
| "epoch": 0.6578303026345859, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 4.81204427598941e-06, | |
| "loss": 0.1227, | |
| "mean_token_accuracy": 0.9716095530986786, | |
| "num_tokens": 2847100048.0, | |
| "step": 20150 | |
| }, | |
| { | |
| "entropy": 1.4273248219490051, | |
| "epoch": 0.6594626358917437, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 4.805584459382342e-06, | |
| "loss": 0.144, | |
| "mean_token_accuracy": 0.9675836896896363, | |
| "num_tokens": 2853932358.0, | |
| "step": 20200 | |
| }, | |
| { | |
| "entropy": 1.431253423690796, | |
| "epoch": 0.6610949691489014, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 4.799111489279844e-06, | |
| "loss": 0.1403, | |
| "mean_token_accuracy": 0.9687580478191375, | |
| "num_tokens": 2860669435.0, | |
| "step": 20250 | |
| }, | |
| { | |
| "entropy": 1.4074326467514038, | |
| "epoch": 0.6627273024060593, | |
| "grad_norm": 1.375, | |
| "learning_rate": 4.792625412836835e-06, | |
| "loss": 0.136, | |
| "mean_token_accuracy": 0.9695438253879547, | |
| "num_tokens": 2867505726.0, | |
| "step": 20300 | |
| }, | |
| { | |
| "entropy": 1.4165804195404053, | |
| "epoch": 0.664359635663217, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 4.786126277303707e-06, | |
| "loss": 0.1459, | |
| "mean_token_accuracy": 0.9678076064586639, | |
| "num_tokens": 2874388589.0, | |
| "step": 20350 | |
| }, | |
| { | |
| "entropy": 1.405945236682892, | |
| "epoch": 0.6659919689203748, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 4.779614130025989e-06, | |
| "loss": 0.1434, | |
| "mean_token_accuracy": 0.9678366994857788, | |
| "num_tokens": 2881624440.0, | |
| "step": 20400 | |
| }, | |
| { | |
| "entropy": 1.4089183855056762, | |
| "epoch": 0.6676243021775325, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 4.7730890184439984e-06, | |
| "loss": 0.1402, | |
| "mean_token_accuracy": 0.9682434296607971, | |
| "num_tokens": 2888818613.0, | |
| "step": 20450 | |
| }, | |
| { | |
| "entropy": 1.4186338901519775, | |
| "epoch": 0.6692566354346904, | |
| "grad_norm": 2.375, | |
| "learning_rate": 4.766550990092494e-06, | |
| "loss": 0.1501, | |
| "mean_token_accuracy": 0.9658879733085632, | |
| "num_tokens": 2895674916.0, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 0.6692566354346904, | |
| "eval_entropy": 1.4077574555079142, | |
| "eval_loss": 0.15949369966983795, | |
| "eval_mean_token_accuracy": 0.9645699612299601, | |
| "eval_num_tokens": 2895674916.0, | |
| "eval_runtime": 751.7816, | |
| "eval_samples_per_second": 12.844, | |
| "eval_steps_per_second": 0.101, | |
| "step": 20500 | |
| }, | |
| { | |
| "entropy": 1.4104346489906312, | |
| "epoch": 0.6708889686918481, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 4.760000092600337e-06, | |
| "loss": 0.1379, | |
| "mean_token_accuracy": 0.969134624004364, | |
| "num_tokens": 2902349294.0, | |
| "step": 20550 | |
| }, | |
| { | |
| "entropy": 1.4055168724060059, | |
| "epoch": 0.672521301949006, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 4.7534363736901334e-06, | |
| "loss": 0.1305, | |
| "mean_token_accuracy": 0.9709035861492157, | |
| "num_tokens": 2909212953.0, | |
| "step": 20600 | |
| }, | |
| { | |
| "entropy": 1.4179642391204834, | |
| "epoch": 0.6741536352061637, | |
| "grad_norm": 1.3515625, | |
| "learning_rate": 4.746859881177895e-06, | |
| "loss": 0.1462, | |
| "mean_token_accuracy": 0.9668932211399078, | |
| "num_tokens": 2916418343.0, | |
| "step": 20650 | |
| }, | |
| { | |
| "entropy": 1.3955682134628296, | |
| "epoch": 0.6757859684633215, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 4.7402706629726884e-06, | |
| "loss": 0.1335, | |
| "mean_token_accuracy": 0.9696724009513855, | |
| "num_tokens": 2923371196.0, | |
| "step": 20700 | |
| }, | |
| { | |
| "entropy": 1.4021626067161561, | |
| "epoch": 0.6774183017204792, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 4.733668767076282e-06, | |
| "loss": 0.1525, | |
| "mean_token_accuracy": 0.9660227704048157, | |
| "num_tokens": 2930542072.0, | |
| "step": 20750 | |
| }, | |
| { | |
| "entropy": 1.3913014960289, | |
| "epoch": 0.6790506349776371, | |
| "grad_norm": 1.453125, | |
| "learning_rate": 4.727054241582805e-06, | |
| "loss": 0.1393, | |
| "mean_token_accuracy": 0.968789451122284, | |
| "num_tokens": 2937522853.0, | |
| "step": 20800 | |
| }, | |
| { | |
| "entropy": 1.3855742335319519, | |
| "epoch": 0.6806829682347948, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 4.720427134678388e-06, | |
| "loss": 0.1295, | |
| "mean_token_accuracy": 0.9704008960723877, | |
| "num_tokens": 2944112485.0, | |
| "step": 20850 | |
| }, | |
| { | |
| "entropy": 1.3942367219924927, | |
| "epoch": 0.6823153014919526, | |
| "grad_norm": 1.3828125, | |
| "learning_rate": 4.713787494640818e-06, | |
| "loss": 0.1414, | |
| "mean_token_accuracy": 0.9685467886924743, | |
| "num_tokens": 2951083570.0, | |
| "step": 20900 | |
| }, | |
| { | |
| "entropy": 1.3912032508850098, | |
| "epoch": 0.6839476347491104, | |
| "grad_norm": 1.4609375, | |
| "learning_rate": 4.707135369839182e-06, | |
| "loss": 0.1443, | |
| "mean_token_accuracy": 0.9680955350399018, | |
| "num_tokens": 2958496819.0, | |
| "step": 20950 | |
| }, | |
| { | |
| "entropy": 1.3916519379615784, | |
| "epoch": 0.6855799680062682, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 4.70047080873352e-06, | |
| "loss": 0.1419, | |
| "mean_token_accuracy": 0.9678064894676208, | |
| "num_tokens": 2965544584.0, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.6855799680062682, | |
| "eval_entropy": 1.3711633268992107, | |
| "eval_loss": 0.15859009325504303, | |
| "eval_mean_token_accuracy": 0.9645708012580871, | |
| "eval_num_tokens": 2965544584.0, | |
| "eval_runtime": 749.4291, | |
| "eval_samples_per_second": 12.884, | |
| "eval_steps_per_second": 0.101, | |
| "step": 21000 | |
| }, | |
| { | |
| "entropy": 1.3885661768913269, | |
| "epoch": 0.6872123012634259, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 4.693793859874469e-06, | |
| "loss": 0.1389, | |
| "mean_token_accuracy": 0.968884084224701, | |
| "num_tokens": 2972445388.0, | |
| "step": 21050 | |
| }, | |
| { | |
| "entropy": 1.3977628707885743, | |
| "epoch": 0.6888446345205838, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 4.687104571902907e-06, | |
| "loss": 0.153, | |
| "mean_token_accuracy": 0.9655460596084595, | |
| "num_tokens": 2979970737.0, | |
| "step": 21100 | |
| }, | |
| { | |
| "entropy": 1.3903096175193788, | |
| "epoch": 0.6904769677777415, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 4.680402993549603e-06, | |
| "loss": 0.1434, | |
| "mean_token_accuracy": 0.9677474045753479, | |
| "num_tokens": 2986677415.0, | |
| "step": 21150 | |
| }, | |
| { | |
| "entropy": 1.404201271533966, | |
| "epoch": 0.6921093010348993, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 4.673689173634861e-06, | |
| "loss": 0.1581, | |
| "mean_token_accuracy": 0.9651612138748169, | |
| "num_tokens": 2994298485.0, | |
| "step": 21200 | |
| }, | |
| { | |
| "entropy": 1.3858560705184937, | |
| "epoch": 0.693741634292057, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 4.666963161068162e-06, | |
| "loss": 0.1459, | |
| "mean_token_accuracy": 0.967083740234375, | |
| "num_tokens": 3001141837.0, | |
| "step": 21250 | |
| }, | |
| { | |
| "entropy": 1.3804667377471924, | |
| "epoch": 0.6953739675492149, | |
| "grad_norm": 1.6796875, | |
| "learning_rate": 4.660225004847808e-06, | |
| "loss": 0.1464, | |
| "mean_token_accuracy": 0.9659059000015259, | |
| "num_tokens": 3008640955.0, | |
| "step": 21300 | |
| }, | |
| { | |
| "entropy": 1.3600660967826843, | |
| "epoch": 0.6970063008063726, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 4.65347475406057e-06, | |
| "loss": 0.1309, | |
| "mean_token_accuracy": 0.9700274157524109, | |
| "num_tokens": 3015031696.0, | |
| "step": 21350 | |
| }, | |
| { | |
| "entropy": 1.363333306312561, | |
| "epoch": 0.6986386340635304, | |
| "grad_norm": 1.4453125, | |
| "learning_rate": 4.646712457881323e-06, | |
| "loss": 0.1455, | |
| "mean_token_accuracy": 0.9672285616397858, | |
| "num_tokens": 3022491104.0, | |
| "step": 21400 | |
| }, | |
| { | |
| "entropy": 1.3792692565917968, | |
| "epoch": 0.7002709673206882, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 4.639938165572694e-06, | |
| "loss": 0.1365, | |
| "mean_token_accuracy": 0.9694170689582825, | |
| "num_tokens": 3029347619.0, | |
| "step": 21450 | |
| }, | |
| { | |
| "entropy": 1.3687973999977112, | |
| "epoch": 0.701903300577846, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 4.6331519264847e-06, | |
| "loss": 0.1425, | |
| "mean_token_accuracy": 0.9671109592914582, | |
| "num_tokens": 3036384532.0, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 0.701903300577846, | |
| "eval_entropy": 1.377278790473938, | |
| "eval_loss": 0.15810321271419525, | |
| "eval_mean_token_accuracy": 0.9647063970565796, | |
| "eval_num_tokens": 3036384532.0, | |
| "eval_runtime": 747.7284, | |
| "eval_samples_per_second": 12.914, | |
| "eval_steps_per_second": 0.102, | |
| "step": 21500 | |
| }, | |
| { | |
| "entropy": 1.3783918046951293, | |
| "epoch": 0.7035356338350037, | |
| "grad_norm": 1.625, | |
| "learning_rate": 4.626353790054387e-06, | |
| "loss": 0.1487, | |
| "mean_token_accuracy": 0.967153193950653, | |
| "num_tokens": 3043409455.0, | |
| "step": 21550 | |
| }, | |
| { | |
| "entropy": 1.3809619188308715, | |
| "epoch": 0.7051679670921616, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 4.619543805805475e-06, | |
| "loss": 0.145, | |
| "mean_token_accuracy": 0.9672031688690186, | |
| "num_tokens": 3050598353.0, | |
| "step": 21600 | |
| }, | |
| { | |
| "entropy": 1.3623022150993347, | |
| "epoch": 0.7068003003493193, | |
| "grad_norm": 1.5, | |
| "learning_rate": 4.612722023347991e-06, | |
| "loss": 0.1383, | |
| "mean_token_accuracy": 0.9682861661911011, | |
| "num_tokens": 3057976676.0, | |
| "step": 21650 | |
| }, | |
| { | |
| "entropy": 1.361666476726532, | |
| "epoch": 0.7084326336064771, | |
| "grad_norm": 1.4921875, | |
| "learning_rate": 4.6058884923779135e-06, | |
| "loss": 0.143, | |
| "mean_token_accuracy": 0.9676965260505677, | |
| "num_tokens": 3065507507.0, | |
| "step": 21700 | |
| }, | |
| { | |
| "entropy": 1.3833092284202575, | |
| "epoch": 0.7100649668636348, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 4.599043262676806e-06, | |
| "loss": 0.1443, | |
| "mean_token_accuracy": 0.9680466854572296, | |
| "num_tokens": 3072550373.0, | |
| "step": 21750 | |
| }, | |
| { | |
| "entropy": 1.386215124130249, | |
| "epoch": 0.7116973001207927, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 4.592186384111457e-06, | |
| "loss": 0.1457, | |
| "mean_token_accuracy": 0.9672110736370086, | |
| "num_tokens": 3079372014.0, | |
| "step": 21800 | |
| }, | |
| { | |
| "entropy": 1.3695436239242553, | |
| "epoch": 0.7133296333779504, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 4.585317906633516e-06, | |
| "loss": 0.1291, | |
| "mean_token_accuracy": 0.9703351008892059, | |
| "num_tokens": 3086062214.0, | |
| "step": 21850 | |
| }, | |
| { | |
| "entropy": 1.3959812355041503, | |
| "epoch": 0.7149619666351082, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 4.578437880279126e-06, | |
| "loss": 0.1269, | |
| "mean_token_accuracy": 0.9709628915786743, | |
| "num_tokens": 3092573548.0, | |
| "step": 21900 | |
| }, | |
| { | |
| "entropy": 1.3880902981758119, | |
| "epoch": 0.716594299892266, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 4.571546355168567e-06, | |
| "loss": 0.1414, | |
| "mean_token_accuracy": 0.9681686234474182, | |
| "num_tokens": 3099550809.0, | |
| "step": 21950 | |
| }, | |
| { | |
| "entropy": 1.40864750623703, | |
| "epoch": 0.7182266331494238, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 4.564643381505886e-06, | |
| "loss": 0.1468, | |
| "mean_token_accuracy": 0.9665729129314422, | |
| "num_tokens": 3106864257.0, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.7182266331494238, | |
| "eval_entropy": 1.3993131558100382, | |
| "eval_loss": 0.15842117369174957, | |
| "eval_mean_token_accuracy": 0.9647385891278585, | |
| "eval_num_tokens": 3106864257.0, | |
| "eval_runtime": 751.7197, | |
| "eval_samples_per_second": 12.845, | |
| "eval_steps_per_second": 0.101, | |
| "step": 22000 | |
| }, | |
| { | |
| "entropy": 1.3926187753677368, | |
| "epoch": 0.7198589664065815, | |
| "grad_norm": 1.5234375, | |
| "learning_rate": 4.557729009578527e-06, | |
| "loss": 0.14, | |
| "mean_token_accuracy": 0.9683624911308288, | |
| "num_tokens": 3113827777.0, | |
| "step": 22050 | |
| }, | |
| { | |
| "entropy": 1.3936706256866456, | |
| "epoch": 0.7214912996637394, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 4.550803289756973e-06, | |
| "loss": 0.1333, | |
| "mean_token_accuracy": 0.9700661396980286, | |
| "num_tokens": 3120906575.0, | |
| "step": 22100 | |
| }, | |
| { | |
| "entropy": 1.3945861506462096, | |
| "epoch": 0.7231236329208971, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 4.543866272494375e-06, | |
| "loss": 0.1535, | |
| "mean_token_accuracy": 0.9649194324016571, | |
| "num_tokens": 3128776941.0, | |
| "step": 22150 | |
| }, | |
| { | |
| "entropy": 1.4023722219467163, | |
| "epoch": 0.7247559661780549, | |
| "grad_norm": 1.125, | |
| "learning_rate": 4.536918008326183e-06, | |
| "loss": 0.137, | |
| "mean_token_accuracy": 0.9688715541362762, | |
| "num_tokens": 3135678296.0, | |
| "step": 22200 | |
| }, | |
| { | |
| "entropy": 1.4066932153701783, | |
| "epoch": 0.7263882994352127, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 4.529958547869781e-06, | |
| "loss": 0.1373, | |
| "mean_token_accuracy": 0.968953766822815, | |
| "num_tokens": 3142368713.0, | |
| "step": 22250 | |
| }, | |
| { | |
| "entropy": 1.4128772020339966, | |
| "epoch": 0.7280206326923705, | |
| "grad_norm": 2.125, | |
| "learning_rate": 4.5229879418241155e-06, | |
| "loss": 0.1431, | |
| "mean_token_accuracy": 0.9682337057590484, | |
| "num_tokens": 3149212589.0, | |
| "step": 22300 | |
| }, | |
| { | |
| "entropy": 1.4088830590248107, | |
| "epoch": 0.7296529659495282, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 4.516006240969329e-06, | |
| "loss": 0.1481, | |
| "mean_token_accuracy": 0.9669530403614044, | |
| "num_tokens": 3156794748.0, | |
| "step": 22350 | |
| }, | |
| { | |
| "entropy": 1.41916588306427, | |
| "epoch": 0.7312852992066861, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 4.509013496166387e-06, | |
| "loss": 0.1429, | |
| "mean_token_accuracy": 0.9679375386238098, | |
| "num_tokens": 3164068091.0, | |
| "step": 22400 | |
| }, | |
| { | |
| "entropy": 1.408693754673004, | |
| "epoch": 0.7329176324638438, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 4.5020097583567104e-06, | |
| "loss": 0.1324, | |
| "mean_token_accuracy": 0.9701212620735169, | |
| "num_tokens": 3170772040.0, | |
| "step": 22450 | |
| }, | |
| { | |
| "entropy": 1.4141771841049193, | |
| "epoch": 0.7345499657210016, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 4.4949950785618025e-06, | |
| "loss": 0.1394, | |
| "mean_token_accuracy": 0.9690138208866119, | |
| "num_tokens": 3177787601.0, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 0.7345499657210016, | |
| "eval_entropy": 1.405626532236735, | |
| "eval_loss": 0.1584155559539795, | |
| "eval_mean_token_accuracy": 0.9649723815917969, | |
| "eval_num_tokens": 3177787601.0, | |
| "eval_runtime": 749.0195, | |
| "eval_samples_per_second": 12.892, | |
| "eval_steps_per_second": 0.101, | |
| "step": 22500 | |
| }, | |
| { | |
| "entropy": 1.409652578830719, | |
| "epoch": 0.7361822989781593, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 4.4879695078828765e-06, | |
| "loss": 0.1447, | |
| "mean_token_accuracy": 0.9667559254169464, | |
| "num_tokens": 3184597089.0, | |
| "step": 22550 | |
| }, | |
| { | |
| "entropy": 1.3969360828399657, | |
| "epoch": 0.7378146322353172, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 4.480933097500489e-06, | |
| "loss": 0.1261, | |
| "mean_token_accuracy": 0.9719417309761047, | |
| "num_tokens": 3191019437.0, | |
| "step": 22600 | |
| }, | |
| { | |
| "entropy": 1.4107865738868712, | |
| "epoch": 0.7394469654924749, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 4.473885898674155e-06, | |
| "loss": 0.1444, | |
| "mean_token_accuracy": 0.9674191176891327, | |
| "num_tokens": 3198108407.0, | |
| "step": 22650 | |
| }, | |
| { | |
| "entropy": 1.4059437608718872, | |
| "epoch": 0.7410792987496327, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 4.4668279627419904e-06, | |
| "loss": 0.1408, | |
| "mean_token_accuracy": 0.9680229306221009, | |
| "num_tokens": 3205399672.0, | |
| "step": 22700 | |
| }, | |
| { | |
| "entropy": 1.3878255271911621, | |
| "epoch": 0.7427116320067905, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 4.459759341120323e-06, | |
| "loss": 0.1355, | |
| "mean_token_accuracy": 0.9695122539997101, | |
| "num_tokens": 3212086664.0, | |
| "step": 22750 | |
| }, | |
| { | |
| "entropy": 1.3905278396606446, | |
| "epoch": 0.7443439652639483, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 4.452680085303331e-06, | |
| "loss": 0.1335, | |
| "mean_token_accuracy": 0.9699708425998688, | |
| "num_tokens": 3219061680.0, | |
| "step": 22800 | |
| }, | |
| { | |
| "entropy": 1.392570481300354, | |
| "epoch": 0.745976298521106, | |
| "grad_norm": 2.0, | |
| "learning_rate": 4.445590246862656e-06, | |
| "loss": 0.1348, | |
| "mean_token_accuracy": 0.9696343839168549, | |
| "num_tokens": 3225449142.0, | |
| "step": 22850 | |
| }, | |
| { | |
| "entropy": 1.3988615465164185, | |
| "epoch": 0.7476086317782639, | |
| "grad_norm": 1.4296875, | |
| "learning_rate": 4.438489877447037e-06, | |
| "loss": 0.153, | |
| "mean_token_accuracy": 0.9651832461357117, | |
| "num_tokens": 3232829699.0, | |
| "step": 22900 | |
| }, | |
| { | |
| "entropy": 1.3806819915771484, | |
| "epoch": 0.7492409650354216, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 4.431379028781927e-06, | |
| "loss": 0.1391, | |
| "mean_token_accuracy": 0.9683671975135804, | |
| "num_tokens": 3239549563.0, | |
| "step": 22950 | |
| }, | |
| { | |
| "entropy": 1.3997270607948302, | |
| "epoch": 0.7508732982925794, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 4.424257752669122e-06, | |
| "loss": 0.1504, | |
| "mean_token_accuracy": 0.9661344397068024, | |
| "num_tokens": 3246842815.0, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.7508732982925794, | |
| "eval_entropy": 1.3884711440404256, | |
| "eval_loss": 0.1580066680908203, | |
| "eval_mean_token_accuracy": 0.9648262977600097, | |
| "eval_num_tokens": 3246842815.0, | |
| "eval_runtime": 751.2443, | |
| "eval_samples_per_second": 12.853, | |
| "eval_steps_per_second": 0.101, | |
| "step": 23000 | |
| }, | |
| { | |
| "entropy": 1.386196925640106, | |
| "epoch": 0.7525056315497372, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 4.417126100986378e-06, | |
| "loss": 0.1349, | |
| "mean_token_accuracy": 0.9699360942840576, | |
| "num_tokens": 3253700706.0, | |
| "step": 23050 | |
| }, | |
| { | |
| "entropy": 1.3982844924926758, | |
| "epoch": 0.754137964806895, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 4.409984125687039e-06, | |
| "loss": 0.1469, | |
| "mean_token_accuracy": 0.9673852014541626, | |
| "num_tokens": 3260955234.0, | |
| "step": 23100 | |
| }, | |
| { | |
| "entropy": 1.4037910151481627, | |
| "epoch": 0.7557702980640527, | |
| "grad_norm": 1.5, | |
| "learning_rate": 4.402831878799652e-06, | |
| "loss": 0.1442, | |
| "mean_token_accuracy": 0.9678021275997162, | |
| "num_tokens": 3267932211.0, | |
| "step": 23150 | |
| }, | |
| { | |
| "entropy": 1.4121773409843446, | |
| "epoch": 0.7574026313212106, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 4.395669412427596e-06, | |
| "loss": 0.1335, | |
| "mean_token_accuracy": 0.9696631526947022, | |
| "num_tokens": 3274869428.0, | |
| "step": 23200 | |
| }, | |
| { | |
| "entropy": 1.4119772076606751, | |
| "epoch": 0.7590349645783683, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 4.388496778748694e-06, | |
| "loss": 0.1349, | |
| "mean_token_accuracy": 0.969488970041275, | |
| "num_tokens": 3281666389.0, | |
| "step": 23250 | |
| }, | |
| { | |
| "entropy": 1.3851684546470642, | |
| "epoch": 0.7606672978355261, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 4.381314030014837e-06, | |
| "loss": 0.1419, | |
| "mean_token_accuracy": 0.96839430809021, | |
| "num_tokens": 3289075580.0, | |
| "step": 23300 | |
| }, | |
| { | |
| "entropy": 1.407478768825531, | |
| "epoch": 0.7622996310926838, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 4.374121218551606e-06, | |
| "loss": 0.158, | |
| "mean_token_accuracy": 0.9645157742500305, | |
| "num_tokens": 3296401618.0, | |
| "step": 23350 | |
| }, | |
| { | |
| "entropy": 1.4000224781036377, | |
| "epoch": 0.7639319643498417, | |
| "grad_norm": 1.25, | |
| "learning_rate": 4.366918396757886e-06, | |
| "loss": 0.139, | |
| "mean_token_accuracy": 0.9687736296653747, | |
| "num_tokens": 3303532934.0, | |
| "step": 23400 | |
| }, | |
| { | |
| "entropy": 1.4132404017448426, | |
| "epoch": 0.7655642976069994, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 4.359705617105485e-06, | |
| "loss": 0.1464, | |
| "mean_token_accuracy": 0.9665714311599731, | |
| "num_tokens": 3310772502.0, | |
| "step": 23450 | |
| }, | |
| { | |
| "entropy": 1.409326949119568, | |
| "epoch": 0.7671966308641572, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 4.352482932138756e-06, | |
| "loss": 0.1475, | |
| "mean_token_accuracy": 0.9669658172130585, | |
| "num_tokens": 3318251468.0, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 0.7671966308641572, | |
| "eval_entropy": 1.3898699220021566, | |
| "eval_loss": 0.15739725530147552, | |
| "eval_mean_token_accuracy": 0.9649832367897033, | |
| "eval_num_tokens": 3318251468.0, | |
| "eval_runtime": 743.2592, | |
| "eval_samples_per_second": 12.991, | |
| "eval_steps_per_second": 0.102, | |
| "step": 23500 | |
| }, | |
| { | |
| "entropy": 1.3853822755813598, | |
| "epoch": 0.768828964121315, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 4.345250394474207e-06, | |
| "loss": 0.1417, | |
| "mean_token_accuracy": 0.968293867111206, | |
| "num_tokens": 3325415082.0, | |
| "step": 23550 | |
| }, | |
| { | |
| "entropy": 1.3828247284889221, | |
| "epoch": 0.7704612973784728, | |
| "grad_norm": 1.3203125, | |
| "learning_rate": 4.338008056800126e-06, | |
| "loss": 0.1408, | |
| "mean_token_accuracy": 0.9685234224796295, | |
| "num_tokens": 3332121947.0, | |
| "step": 23600 | |
| }, | |
| { | |
| "entropy": 1.4080828213691712, | |
| "epoch": 0.7720936306356305, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 4.330755971876192e-06, | |
| "loss": 0.1464, | |
| "mean_token_accuracy": 0.9673770892620087, | |
| "num_tokens": 3339236872.0, | |
| "step": 23650 | |
| }, | |
| { | |
| "entropy": 1.399913146495819, | |
| "epoch": 0.7737259638927884, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 4.3234941925330915e-06, | |
| "loss": 0.144, | |
| "mean_token_accuracy": 0.9670092570781708, | |
| "num_tokens": 3346331383.0, | |
| "step": 23700 | |
| }, | |
| { | |
| "entropy": 1.391896095275879, | |
| "epoch": 0.7753582971499461, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 4.316222771672132e-06, | |
| "loss": 0.1356, | |
| "mean_token_accuracy": 0.9692227625846863, | |
| "num_tokens": 3353234181.0, | |
| "step": 23750 | |
| }, | |
| { | |
| "entropy": 1.4092794895172118, | |
| "epoch": 0.7769906304071039, | |
| "grad_norm": 1.53125, | |
| "learning_rate": 4.3089417622648605e-06, | |
| "loss": 0.1451, | |
| "mean_token_accuracy": 0.9670332086086273, | |
| "num_tokens": 3360224816.0, | |
| "step": 23800 | |
| }, | |
| { | |
| "entropy": 1.4031621408462525, | |
| "epoch": 0.7786229636642616, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 4.301651217352674e-06, | |
| "loss": 0.1412, | |
| "mean_token_accuracy": 0.9681469559669494, | |
| "num_tokens": 3367296325.0, | |
| "step": 23850 | |
| }, | |
| { | |
| "entropy": 1.4203896260261535, | |
| "epoch": 0.7802552969214195, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 4.294351190046439e-06, | |
| "loss": 0.1433, | |
| "mean_token_accuracy": 0.9682426953315735, | |
| "num_tokens": 3373747388.0, | |
| "step": 23900 | |
| }, | |
| { | |
| "entropy": 1.412433443069458, | |
| "epoch": 0.7818876301785772, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 4.2870417335260925e-06, | |
| "loss": 0.1459, | |
| "mean_token_accuracy": 0.9671615362167358, | |
| "num_tokens": 3380836126.0, | |
| "step": 23950 | |
| }, | |
| { | |
| "entropy": 1.4091150188446044, | |
| "epoch": 0.783519963435735, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 4.2797229010402695e-06, | |
| "loss": 0.1459, | |
| "mean_token_accuracy": 0.9665989732742309, | |
| "num_tokens": 3388112439.0, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.783519963435735, | |
| "eval_entropy": 1.3812917121251425, | |
| "eval_loss": 0.15769919753074646, | |
| "eval_mean_token_accuracy": 0.964633092880249, | |
| "eval_num_tokens": 3388112439.0, | |
| "eval_runtime": 743.2853, | |
| "eval_samples_per_second": 12.991, | |
| "eval_steps_per_second": 0.102, | |
| "step": 24000 | |
| }, | |
| { | |
| "entropy": 1.3875314927101134, | |
| "epoch": 0.7851522966928928, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 4.272394745905904e-06, | |
| "loss": 0.1394, | |
| "mean_token_accuracy": 0.9680357229709625, | |
| "num_tokens": 3395153920.0, | |
| "step": 24050 | |
| }, | |
| { | |
| "entropy": 1.3935593914985658, | |
| "epoch": 0.7867846299500506, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 4.265057321507848e-06, | |
| "loss": 0.1223, | |
| "mean_token_accuracy": 0.9719667887687683, | |
| "num_tokens": 3401608036.0, | |
| "step": 24100 | |
| }, | |
| { | |
| "entropy": 1.3922600531578064, | |
| "epoch": 0.7884169632072083, | |
| "grad_norm": 1.140625, | |
| "learning_rate": 4.257710681298474e-06, | |
| "loss": 0.1431, | |
| "mean_token_accuracy": 0.9678590965270996, | |
| "num_tokens": 3409048396.0, | |
| "step": 24150 | |
| }, | |
| { | |
| "entropy": 1.404571294784546, | |
| "epoch": 0.7900492964643662, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 4.250354878797295e-06, | |
| "loss": 0.1339, | |
| "mean_token_accuracy": 0.969120637178421, | |
| "num_tokens": 3415718906.0, | |
| "step": 24200 | |
| }, | |
| { | |
| "entropy": 1.4066750693321228, | |
| "epoch": 0.7916816297215239, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 4.242989967590568e-06, | |
| "loss": 0.1471, | |
| "mean_token_accuracy": 0.9678963148593902, | |
| "num_tokens": 3422879629.0, | |
| "step": 24250 | |
| }, | |
| { | |
| "entropy": 1.4055565428733825, | |
| "epoch": 0.7933139629786817, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 4.235616001330909e-06, | |
| "loss": 0.1386, | |
| "mean_token_accuracy": 0.9684436011314392, | |
| "num_tokens": 3430035861.0, | |
| "step": 24300 | |
| }, | |
| { | |
| "entropy": 1.3904715991020202, | |
| "epoch": 0.7949462962358395, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 4.228233033736894e-06, | |
| "loss": 0.1375, | |
| "mean_token_accuracy": 0.9683421933650971, | |
| "num_tokens": 3437370072.0, | |
| "step": 24350 | |
| }, | |
| { | |
| "entropy": 1.4026382374763489, | |
| "epoch": 0.7965786294929973, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 4.22084111859268e-06, | |
| "loss": 0.1447, | |
| "mean_token_accuracy": 0.967241278886795, | |
| "num_tokens": 3444922290.0, | |
| "step": 24400 | |
| }, | |
| { | |
| "entropy": 1.4011790704727174, | |
| "epoch": 0.798210962750155, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 4.213440309747597e-06, | |
| "loss": 0.1355, | |
| "mean_token_accuracy": 0.969396059513092, | |
| "num_tokens": 3451653589.0, | |
| "step": 24450 | |
| }, | |
| { | |
| "entropy": 1.410584397315979, | |
| "epoch": 0.7998432960073129, | |
| "grad_norm": 1.3046875, | |
| "learning_rate": 4.206030661115772e-06, | |
| "loss": 0.1447, | |
| "mean_token_accuracy": 0.9678661072254181, | |
| "num_tokens": 3459121557.0, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 0.7998432960073129, | |
| "eval_entropy": 1.3954780069986978, | |
| "eval_loss": 0.15847522020339966, | |
| "eval_mean_token_accuracy": 0.9648264590899149, | |
| "eval_num_tokens": 3459121557.0, | |
| "eval_runtime": 748.4306, | |
| "eval_samples_per_second": 12.902, | |
| "eval_steps_per_second": 0.102, | |
| "step": 24500 | |
| }, | |
| { | |
| "entropy": 1.4040296864509583, | |
| "epoch": 0.8014756292644706, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 4.198612226675727e-06, | |
| "loss": 0.1397, | |
| "mean_token_accuracy": 0.9695000052452087, | |
| "num_tokens": 3466079227.0, | |
| "step": 24550 | |
| }, | |
| { | |
| "entropy": 1.4071758961677552, | |
| "epoch": 0.8031079625216284, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 4.191185060469987e-06, | |
| "loss": 0.145, | |
| "mean_token_accuracy": 0.9681662321090698, | |
| "num_tokens": 3472782801.0, | |
| "step": 24600 | |
| }, | |
| { | |
| "entropy": 1.4046114492416382, | |
| "epoch": 0.8047402957787863, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 4.183749216604685e-06, | |
| "loss": 0.137, | |
| "mean_token_accuracy": 0.9698592948913575, | |
| "num_tokens": 3479515350.0, | |
| "step": 24650 | |
| }, | |
| { | |
| "entropy": 1.4098283767700195, | |
| "epoch": 0.806372629035944, | |
| "grad_norm": 1.875, | |
| "learning_rate": 4.1763047492491746e-06, | |
| "loss": 0.1386, | |
| "mean_token_accuracy": 0.9687701988220215, | |
| "num_tokens": 3486524754.0, | |
| "step": 24700 | |
| }, | |
| { | |
| "entropy": 1.39289204120636, | |
| "epoch": 0.8080049622931018, | |
| "grad_norm": 1.375, | |
| "learning_rate": 4.1688517126356256e-06, | |
| "loss": 0.1331, | |
| "mean_token_accuracy": 0.9696247518062592, | |
| "num_tokens": 3493517440.0, | |
| "step": 24750 | |
| }, | |
| { | |
| "entropy": 1.3983655071258545, | |
| "epoch": 0.8096372955502595, | |
| "grad_norm": 1.25, | |
| "learning_rate": 4.161390161058637e-06, | |
| "loss": 0.1455, | |
| "mean_token_accuracy": 0.9675554573535919, | |
| "num_tokens": 3500746527.0, | |
| "step": 24800 | |
| }, | |
| { | |
| "entropy": 1.4149045872688293, | |
| "epoch": 0.8112696288074174, | |
| "grad_norm": 0.490234375, | |
| "learning_rate": 4.153920148874839e-06, | |
| "loss": 0.1459, | |
| "mean_token_accuracy": 0.9668138778209686, | |
| "num_tokens": 3507701444.0, | |
| "step": 24850 | |
| }, | |
| { | |
| "entropy": 1.4177529954910277, | |
| "epoch": 0.8129019620645751, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 4.146441730502496e-06, | |
| "loss": 0.152, | |
| "mean_token_accuracy": 0.9660572922229766, | |
| "num_tokens": 3514490276.0, | |
| "step": 24900 | |
| }, | |
| { | |
| "entropy": 1.4248465538024901, | |
| "epoch": 0.8145342953217329, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 4.1389549604211064e-06, | |
| "loss": 0.1481, | |
| "mean_token_accuracy": 0.966586571931839, | |
| "num_tokens": 3521708251.0, | |
| "step": 24950 | |
| }, | |
| { | |
| "entropy": 1.4028679990768433, | |
| "epoch": 0.8161666285788907, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 4.131459893171016e-06, | |
| "loss": 0.1293, | |
| "mean_token_accuracy": 0.9706797707080841, | |
| "num_tokens": 3528390818.0, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 0.8161666285788907, | |
| "eval_entropy": 1.4112112029393513, | |
| "eval_loss": 0.15821218490600586, | |
| "eval_mean_token_accuracy": 0.9647614455223084, | |
| "eval_num_tokens": 3528390818.0, | |
| "eval_runtime": 747.1945, | |
| "eval_samples_per_second": 12.923, | |
| "eval_steps_per_second": 0.102, | |
| "step": 25000 | |
| }, | |
| { | |
| "entropy": 1.4167933750152588, | |
| "epoch": 0.8177989618360485, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 4.1239565833530115e-06, | |
| "loss": 0.1441, | |
| "mean_token_accuracy": 0.9679395818710327, | |
| "num_tokens": 3535142016.0, | |
| "step": 25050 | |
| }, | |
| { | |
| "entropy": 1.4142408227920533, | |
| "epoch": 0.8194312950932062, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 4.116445085627926e-06, | |
| "loss": 0.1437, | |
| "mean_token_accuracy": 0.9675932359695435, | |
| "num_tokens": 3541811208.0, | |
| "step": 25100 | |
| }, | |
| { | |
| "entropy": 1.4017269968986512, | |
| "epoch": 0.8210636283503641, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 4.108925454716242e-06, | |
| "loss": 0.1388, | |
| "mean_token_accuracy": 0.9687268888950348, | |
| "num_tokens": 3549027064.0, | |
| "step": 25150 | |
| }, | |
| { | |
| "entropy": 1.4196626782417296, | |
| "epoch": 0.8226959616075218, | |
| "grad_norm": 2.0, | |
| "learning_rate": 4.101397745397689e-06, | |
| "loss": 0.1374, | |
| "mean_token_accuracy": 0.9688492357730866, | |
| "num_tokens": 3556231008.0, | |
| "step": 25200 | |
| }, | |
| { | |
| "entropy": 1.4409023642539978, | |
| "epoch": 0.8243282948646796, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 4.093862012510847e-06, | |
| "loss": 0.1528, | |
| "mean_token_accuracy": 0.9662479484081268, | |
| "num_tokens": 3563607911.0, | |
| "step": 25250 | |
| }, | |
| { | |
| "entropy": 1.3985059690475463, | |
| "epoch": 0.8259606281218373, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 4.086318310952752e-06, | |
| "loss": 0.1363, | |
| "mean_token_accuracy": 0.969468570947647, | |
| "num_tokens": 3570544476.0, | |
| "step": 25300 | |
| }, | |
| { | |
| "entropy": 1.395876476764679, | |
| "epoch": 0.8275929613789952, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 4.078766695678484e-06, | |
| "loss": 0.141, | |
| "mean_token_accuracy": 0.9681139755249023, | |
| "num_tokens": 3577514806.0, | |
| "step": 25350 | |
| }, | |
| { | |
| "entropy": 1.3914256238937377, | |
| "epoch": 0.8292252946361529, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 4.071207221700778e-06, | |
| "loss": 0.1417, | |
| "mean_token_accuracy": 0.9679916310310364, | |
| "num_tokens": 3584710985.0, | |
| "step": 25400 | |
| }, | |
| { | |
| "entropy": 1.3804515194892883, | |
| "epoch": 0.8308576278933107, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 4.063639944089617e-06, | |
| "loss": 0.1369, | |
| "mean_token_accuracy": 0.9694907116889954, | |
| "num_tokens": 3591780461.0, | |
| "step": 25450 | |
| }, | |
| { | |
| "entropy": 1.4110288119316101, | |
| "epoch": 0.8324899611504685, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 4.0560649179718345e-06, | |
| "loss": 0.1454, | |
| "mean_token_accuracy": 0.9680348300933838, | |
| "num_tokens": 3598701756.0, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 0.8324899611504685, | |
| "eval_entropy": 1.4156104850769042, | |
| "eval_loss": 0.1576152741909027, | |
| "eval_mean_token_accuracy": 0.9650608507792154, | |
| "eval_num_tokens": 3598701756.0, | |
| "eval_runtime": 749.1988, | |
| "eval_samples_per_second": 12.888, | |
| "eval_steps_per_second": 0.101, | |
| "step": 25500 | |
| }, | |
| { | |
| "entropy": 1.40669499874115, | |
| "epoch": 0.8341222944076263, | |
| "grad_norm": 1.3359375, | |
| "learning_rate": 4.048482198530708e-06, | |
| "loss": 0.1321, | |
| "mean_token_accuracy": 0.9696508872509003, | |
| "num_tokens": 3605779866.0, | |
| "step": 25550 | |
| }, | |
| { | |
| "entropy": 1.40333979845047, | |
| "epoch": 0.835754627664784, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 4.040891841005562e-06, | |
| "loss": 0.1327, | |
| "mean_token_accuracy": 0.9699014961719513, | |
| "num_tokens": 3612587584.0, | |
| "step": 25600 | |
| }, | |
| { | |
| "entropy": 1.412612452507019, | |
| "epoch": 0.8373869609219419, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 4.033293900691364e-06, | |
| "loss": 0.1471, | |
| "mean_token_accuracy": 0.9674671077728272, | |
| "num_tokens": 3619996003.0, | |
| "step": 25650 | |
| }, | |
| { | |
| "entropy": 1.411652238368988, | |
| "epoch": 0.8390192941790996, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 4.02568843293832e-06, | |
| "loss": 0.1484, | |
| "mean_token_accuracy": 0.9667781054973602, | |
| "num_tokens": 3627359567.0, | |
| "step": 25700 | |
| }, | |
| { | |
| "entropy": 1.3998070549964905, | |
| "epoch": 0.8406516274362574, | |
| "grad_norm": 1.234375, | |
| "learning_rate": 4.0180754931514745e-06, | |
| "loss": 0.1456, | |
| "mean_token_accuracy": 0.9677145159244538, | |
| "num_tokens": 3634637886.0, | |
| "step": 25750 | |
| }, | |
| { | |
| "entropy": 1.3955191278457642, | |
| "epoch": 0.8422839606934152, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 4.010455136790304e-06, | |
| "loss": 0.1469, | |
| "mean_token_accuracy": 0.9672901368141175, | |
| "num_tokens": 3642177370.0, | |
| "step": 25800 | |
| }, | |
| { | |
| "entropy": 1.3989661598205567, | |
| "epoch": 0.843916293950573, | |
| "grad_norm": 1.2734375, | |
| "learning_rate": 4.0028274193683124e-06, | |
| "loss": 0.1401, | |
| "mean_token_accuracy": 0.9687706243991852, | |
| "num_tokens": 3649058563.0, | |
| "step": 25850 | |
| }, | |
| { | |
| "entropy": 1.4021561121940613, | |
| "epoch": 0.8455486272077307, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 3.995192396452631e-06, | |
| "loss": 0.1411, | |
| "mean_token_accuracy": 0.9682626259326935, | |
| "num_tokens": 3655854303.0, | |
| "step": 25900 | |
| }, | |
| { | |
| "entropy": 1.382779130935669, | |
| "epoch": 0.8471809604648886, | |
| "grad_norm": 2.25, | |
| "learning_rate": 3.987550123663608e-06, | |
| "loss": 0.1462, | |
| "mean_token_accuracy": 0.9676120269298554, | |
| "num_tokens": 3662624685.0, | |
| "step": 25950 | |
| }, | |
| { | |
| "entropy": 1.3848181200027465, | |
| "epoch": 0.8488132937220463, | |
| "grad_norm": 2.25, | |
| "learning_rate": 3.97990065667441e-06, | |
| "loss": 0.14, | |
| "mean_token_accuracy": 0.9676021826267243, | |
| "num_tokens": 3669341852.0, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.8488132937220463, | |
| "eval_entropy": 1.3808231941858928, | |
| "eval_loss": 0.15642417967319489, | |
| "eval_mean_token_accuracy": 0.9652521824836731, | |
| "eval_num_tokens": 3669341852.0, | |
| "eval_runtime": 753.4968, | |
| "eval_samples_per_second": 12.815, | |
| "eval_steps_per_second": 0.101, | |
| "step": 26000 | |
| }, | |
| { | |
| "entropy": 1.388938684463501, | |
| "epoch": 0.8504456269792041, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 3.972244051210608e-06, | |
| "loss": 0.1499, | |
| "mean_token_accuracy": 0.9663873422145843, | |
| "num_tokens": 3676639962.0, | |
| "step": 26050 | |
| }, | |
| { | |
| "entropy": 1.3947418189048768, | |
| "epoch": 0.8520779602363618, | |
| "grad_norm": 1.2421875, | |
| "learning_rate": 3.964580363049779e-06, | |
| "loss": 0.1446, | |
| "mean_token_accuracy": 0.9676153147220612, | |
| "num_tokens": 3683543595.0, | |
| "step": 26100 | |
| }, | |
| { | |
| "entropy": 1.3950307440757752, | |
| "epoch": 0.8537102934935197, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 3.956909648021096e-06, | |
| "loss": 0.136, | |
| "mean_token_accuracy": 0.9690661346912384, | |
| "num_tokens": 3690286971.0, | |
| "step": 26150 | |
| }, | |
| { | |
| "entropy": 1.402916703224182, | |
| "epoch": 0.8553426267506774, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 3.949231962004922e-06, | |
| "loss": 0.1541, | |
| "mean_token_accuracy": 0.9650176286697387, | |
| "num_tokens": 3697865663.0, | |
| "step": 26200 | |
| }, | |
| { | |
| "entropy": 1.408561556339264, | |
| "epoch": 0.8569749600078352, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 3.941547360932404e-06, | |
| "loss": 0.1375, | |
| "mean_token_accuracy": 0.9691826546192169, | |
| "num_tokens": 3704856688.0, | |
| "step": 26250 | |
| }, | |
| { | |
| "entropy": 1.3971546220779418, | |
| "epoch": 0.858607293264993, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 3.933855900785063e-06, | |
| "loss": 0.1349, | |
| "mean_token_accuracy": 0.9694166851043701, | |
| "num_tokens": 3711350984.0, | |
| "step": 26300 | |
| }, | |
| { | |
| "entropy": 1.403090295791626, | |
| "epoch": 0.8602396265221508, | |
| "grad_norm": 1.34375, | |
| "learning_rate": 3.926157637594387e-06, | |
| "loss": 0.1427, | |
| "mean_token_accuracy": 0.9675269103050232, | |
| "num_tokens": 3718476089.0, | |
| "step": 26350 | |
| }, | |
| { | |
| "entropy": 1.3978805589675902, | |
| "epoch": 0.8618719597793085, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 3.918452627441425e-06, | |
| "loss": 0.1342, | |
| "mean_token_accuracy": 0.9688924837112427, | |
| "num_tokens": 3725594290.0, | |
| "step": 26400 | |
| }, | |
| { | |
| "entropy": 1.4037820672988892, | |
| "epoch": 0.8635042930364664, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 3.910740926456376e-06, | |
| "loss": 0.1356, | |
| "mean_token_accuracy": 0.9694356083869934, | |
| "num_tokens": 3731938265.0, | |
| "step": 26450 | |
| }, | |
| { | |
| "entropy": 1.4001559686660767, | |
| "epoch": 0.8651366262936241, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 3.903022590818183e-06, | |
| "loss": 0.1364, | |
| "mean_token_accuracy": 0.9691901934146882, | |
| "num_tokens": 3738816334.0, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 0.8651366262936241, | |
| "eval_entropy": 1.3956389427185059, | |
| "eval_loss": 0.1563226878643036, | |
| "eval_mean_token_accuracy": 0.965215961933136, | |
| "eval_num_tokens": 3738816334.0, | |
| "eval_runtime": 749.9014, | |
| "eval_samples_per_second": 12.876, | |
| "eval_steps_per_second": 0.101, | |
| "step": 26500 | |
| }, | |
| { | |
| "entropy": 1.3984951615333556, | |
| "epoch": 0.8667689595507819, | |
| "grad_norm": 1.4765625, | |
| "learning_rate": 3.895297676754119e-06, | |
| "loss": 0.1413, | |
| "mean_token_accuracy": 0.9679640185832977, | |
| "num_tokens": 3745955192.0, | |
| "step": 26550 | |
| }, | |
| { | |
| "entropy": 1.3817971444129944, | |
| "epoch": 0.8684012928079397, | |
| "grad_norm": 1.390625, | |
| "learning_rate": 3.887566240539381e-06, | |
| "loss": 0.1445, | |
| "mean_token_accuracy": 0.9670935535430908, | |
| "num_tokens": 3753010501.0, | |
| "step": 26600 | |
| }, | |
| { | |
| "entropy": 1.3948706936836244, | |
| "epoch": 0.8700336260650975, | |
| "grad_norm": 1.5234375, | |
| "learning_rate": 3.87982833849668e-06, | |
| "loss": 0.1325, | |
| "mean_token_accuracy": 0.9705722856521607, | |
| "num_tokens": 3759798351.0, | |
| "step": 26650 | |
| }, | |
| { | |
| "entropy": 1.393795645236969, | |
| "epoch": 0.8716659593222552, | |
| "grad_norm": 1.0, | |
| "learning_rate": 3.87208402699583e-06, | |
| "loss": 0.1317, | |
| "mean_token_accuracy": 0.9698431146144867, | |
| "num_tokens": 3766781169.0, | |
| "step": 26700 | |
| }, | |
| { | |
| "entropy": 1.3873535466194153, | |
| "epoch": 0.873298292579413, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 3.864333362453337e-06, | |
| "loss": 0.1387, | |
| "mean_token_accuracy": 0.9688050973415375, | |
| "num_tokens": 3773957478.0, | |
| "step": 26750 | |
| }, | |
| { | |
| "entropy": 1.3917481398582459, | |
| "epoch": 0.8749306258365708, | |
| "grad_norm": 1.21875, | |
| "learning_rate": 3.856576401331988e-06, | |
| "loss": 0.1394, | |
| "mean_token_accuracy": 0.9683778524398804, | |
| "num_tokens": 3780762633.0, | |
| "step": 26800 | |
| }, | |
| { | |
| "entropy": 1.3908830904960632, | |
| "epoch": 0.8765629590937286, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 3.848813200140437e-06, | |
| "loss": 0.1547, | |
| "mean_token_accuracy": 0.9653457498550415, | |
| "num_tokens": 3788278563.0, | |
| "step": 26850 | |
| }, | |
| { | |
| "entropy": 1.4087229990959167, | |
| "epoch": 0.8781952923508863, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 3.841043815432803e-06, | |
| "loss": 0.1401, | |
| "mean_token_accuracy": 0.9690191769599914, | |
| "num_tokens": 3794646188.0, | |
| "step": 26900 | |
| }, | |
| { | |
| "entropy": 1.3997572946548462, | |
| "epoch": 0.8798276256080442, | |
| "grad_norm": 1.359375, | |
| "learning_rate": 3.833268303808244e-06, | |
| "loss": 0.1366, | |
| "mean_token_accuracy": 0.9689948236942292, | |
| "num_tokens": 3801360008.0, | |
| "step": 26950 | |
| }, | |
| { | |
| "entropy": 1.3976869773864746, | |
| "epoch": 0.8814599588652019, | |
| "grad_norm": 1.3984375, | |
| "learning_rate": 3.8254867219105575e-06, | |
| "loss": 0.1307, | |
| "mean_token_accuracy": 0.970016497373581, | |
| "num_tokens": 3808022349.0, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 0.8814599588652019, | |
| "eval_entropy": 1.3932032998402912, | |
| "eval_loss": 0.15805409848690033, | |
| "eval_mean_token_accuracy": 0.9647916714350383, | |
| "eval_num_tokens": 3808022349.0, | |
| "eval_runtime": 750.2779, | |
| "eval_samples_per_second": 12.87, | |
| "eval_steps_per_second": 0.101, | |
| "step": 27000 | |
| }, | |
| { | |
| "entropy": 1.406122510433197, | |
| "epoch": 0.8830922921223597, | |
| "grad_norm": 1.625, | |
| "learning_rate": 3.8176991264277604e-06, | |
| "loss": 0.1555, | |
| "mean_token_accuracy": 0.9653628063201904, | |
| "num_tokens": 3814977879.0, | |
| "step": 27050 | |
| }, | |
| { | |
| "entropy": 1.3947002291679382, | |
| "epoch": 0.8847246253795175, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 3.809905574091676e-06, | |
| "loss": 0.1439, | |
| "mean_token_accuracy": 0.9674938654899597, | |
| "num_tokens": 3822114331.0, | |
| "step": 27100 | |
| }, | |
| { | |
| "entropy": 1.3995588779449464, | |
| "epoch": 0.8863569586366753, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 3.802106121677525e-06, | |
| "loss": 0.1389, | |
| "mean_token_accuracy": 0.9679549074172974, | |
| "num_tokens": 3829036413.0, | |
| "step": 27150 | |
| }, | |
| { | |
| "entropy": 1.397332239151001, | |
| "epoch": 0.887989291893833, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 3.7943008260035106e-06, | |
| "loss": 0.1394, | |
| "mean_token_accuracy": 0.9689336049556733, | |
| "num_tokens": 3835541715.0, | |
| "step": 27200 | |
| }, | |
| { | |
| "entropy": 1.398532338142395, | |
| "epoch": 0.8896216251509909, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 3.7864897439304e-06, | |
| "loss": 0.1328, | |
| "mean_token_accuracy": 0.970375450849533, | |
| "num_tokens": 3842178219.0, | |
| "step": 27250 | |
| }, | |
| { | |
| "entropy": 1.3905003619194032, | |
| "epoch": 0.8912539584081486, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 3.778672932361116e-06, | |
| "loss": 0.138, | |
| "mean_token_accuracy": 0.9678446400165558, | |
| "num_tokens": 3849481284.0, | |
| "step": 27300 | |
| }, | |
| { | |
| "entropy": 1.4009161067008973, | |
| "epoch": 0.8928862916653064, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 3.7708504482403198e-06, | |
| "loss": 0.1369, | |
| "mean_token_accuracy": 0.9693617796897889, | |
| "num_tokens": 3855937939.0, | |
| "step": 27350 | |
| }, | |
| { | |
| "entropy": 1.3991939783096314, | |
| "epoch": 0.8945186249224641, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 3.7630223485539955e-06, | |
| "loss": 0.1391, | |
| "mean_token_accuracy": 0.9692135906219482, | |
| "num_tokens": 3863063108.0, | |
| "step": 27400 | |
| }, | |
| { | |
| "entropy": 1.4059844970703126, | |
| "epoch": 0.896150958179622, | |
| "grad_norm": 1.3046875, | |
| "learning_rate": 3.755188690329039e-06, | |
| "loss": 0.1387, | |
| "mean_token_accuracy": 0.9687328970432282, | |
| "num_tokens": 3870522891.0, | |
| "step": 27450 | |
| }, | |
| { | |
| "entropy": 1.4190341973304748, | |
| "epoch": 0.8977832914367797, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 3.747349530632837e-06, | |
| "loss": 0.137, | |
| "mean_token_accuracy": 0.9695413172245025, | |
| "num_tokens": 3877480147.0, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 0.8977832914367797, | |
| "eval_entropy": 1.407424882253011, | |
| "eval_loss": 0.1556614637374878, | |
| "eval_mean_token_accuracy": 0.9652939391136169, | |
| "eval_num_tokens": 3877480147.0, | |
| "eval_runtime": 746.039, | |
| "eval_samples_per_second": 12.943, | |
| "eval_steps_per_second": 0.102, | |
| "step": 27500 | |
| }, | |
| { | |
| "entropy": 1.4175709581375122, | |
| "epoch": 0.8994156246939375, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 3.7395049265728537e-06, | |
| "loss": 0.1574, | |
| "mean_token_accuracy": 0.964717469215393, | |
| "num_tokens": 3884856009.0, | |
| "step": 27550 | |
| }, | |
| { | |
| "entropy": 1.4064431715011596, | |
| "epoch": 0.9010479579510953, | |
| "grad_norm": 1.3828125, | |
| "learning_rate": 3.7316549352962154e-06, | |
| "loss": 0.139, | |
| "mean_token_accuracy": 0.9685567510128021, | |
| "num_tokens": 3891927392.0, | |
| "step": 27600 | |
| }, | |
| { | |
| "entropy": 1.4132932043075561, | |
| "epoch": 0.9026802912082531, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 3.7237996139892955e-06, | |
| "loss": 0.1441, | |
| "mean_token_accuracy": 0.9683604872226715, | |
| "num_tokens": 3899174699.0, | |
| "step": 27650 | |
| }, | |
| { | |
| "entropy": 1.3990925669670105, | |
| "epoch": 0.9043126244654108, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 3.7159390198772933e-06, | |
| "loss": 0.1453, | |
| "mean_token_accuracy": 0.9674279451370239, | |
| "num_tokens": 3906648455.0, | |
| "step": 27700 | |
| }, | |
| { | |
| "entropy": 1.3980679297447205, | |
| "epoch": 0.9059449577225687, | |
| "grad_norm": 0.9453125, | |
| "learning_rate": 3.7080732102238214e-06, | |
| "loss": 0.13, | |
| "mean_token_accuracy": 0.9701244246959686, | |
| "num_tokens": 3913624677.0, | |
| "step": 27750 | |
| }, | |
| { | |
| "entropy": 1.4017365527153016, | |
| "epoch": 0.9075772909797264, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 3.700202242330488e-06, | |
| "loss": 0.1335, | |
| "mean_token_accuracy": 0.9692477977275848, | |
| "num_tokens": 3920103211.0, | |
| "step": 27800 | |
| }, | |
| { | |
| "entropy": 1.3995830225944519, | |
| "epoch": 0.9092096242368842, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 3.6923261735364753e-06, | |
| "loss": 0.1343, | |
| "mean_token_accuracy": 0.9691221857070923, | |
| "num_tokens": 3927208526.0, | |
| "step": 27850 | |
| }, | |
| { | |
| "entropy": 1.3959479594230653, | |
| "epoch": 0.910841957494042, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 3.6844450612181293e-06, | |
| "loss": 0.1442, | |
| "mean_token_accuracy": 0.967134929895401, | |
| "num_tokens": 3934542379.0, | |
| "step": 27900 | |
| }, | |
| { | |
| "entropy": 1.391806445121765, | |
| "epoch": 0.9124742907511998, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 3.6765589627885352e-06, | |
| "loss": 0.1313, | |
| "mean_token_accuracy": 0.9697517728805543, | |
| "num_tokens": 3941546757.0, | |
| "step": 27950 | |
| }, | |
| { | |
| "entropy": 1.3797322702407837, | |
| "epoch": 0.9141066240083575, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 3.6686679356971017e-06, | |
| "loss": 0.1352, | |
| "mean_token_accuracy": 0.9690727829933167, | |
| "num_tokens": 3948336251.0, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 0.9141066240083575, | |
| "eval_entropy": 1.3829597409566243, | |
| "eval_loss": 0.15241551399230957, | |
| "eval_mean_token_accuracy": 0.9657313092549642, | |
| "eval_num_tokens": 3948336251.0, | |
| "eval_runtime": 750.4693, | |
| "eval_samples_per_second": 12.867, | |
| "eval_steps_per_second": 0.101, | |
| "step": 28000 | |
| }, | |
| { | |
| "entropy": 1.3851616716384887, | |
| "epoch": 0.9157389572655154, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 3.660772037429141e-06, | |
| "loss": 0.1329, | |
| "mean_token_accuracy": 0.9694375658035278, | |
| "num_tokens": 3955743379.0, | |
| "step": 28050 | |
| }, | |
| { | |
| "entropy": 1.3846897101402282, | |
| "epoch": 0.9173712905226731, | |
| "grad_norm": 1.5390625, | |
| "learning_rate": 3.652871325505453e-06, | |
| "loss": 0.1396, | |
| "mean_token_accuracy": 0.9685408413410187, | |
| "num_tokens": 3962634196.0, | |
| "step": 28100 | |
| }, | |
| { | |
| "entropy": 1.3823327445983886, | |
| "epoch": 0.9190036237798309, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 3.6449658574819062e-06, | |
| "loss": 0.1342, | |
| "mean_token_accuracy": 0.9699604260921478, | |
| "num_tokens": 3969575759.0, | |
| "step": 28150 | |
| }, | |
| { | |
| "entropy": 1.3885199642181396, | |
| "epoch": 0.9206359570369886, | |
| "grad_norm": 1.21875, | |
| "learning_rate": 3.637055690949012e-06, | |
| "loss": 0.1365, | |
| "mean_token_accuracy": 0.9687310314178467, | |
| "num_tokens": 3976309312.0, | |
| "step": 28200 | |
| }, | |
| { | |
| "entropy": 1.3812410712242127, | |
| "epoch": 0.9222682902941465, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 3.629140883531515e-06, | |
| "loss": 0.1322, | |
| "mean_token_accuracy": 0.9697493410110474, | |
| "num_tokens": 3983034311.0, | |
| "step": 28250 | |
| }, | |
| { | |
| "entropy": 1.3719140005111694, | |
| "epoch": 0.9239006235513042, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 3.6212214928879643e-06, | |
| "loss": 0.1311, | |
| "mean_token_accuracy": 0.9697664487361908, | |
| "num_tokens": 3990038956.0, | |
| "step": 28300 | |
| }, | |
| { | |
| "entropy": 1.3883194899559022, | |
| "epoch": 0.925532956808462, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 3.6132975767103e-06, | |
| "loss": 0.127, | |
| "mean_token_accuracy": 0.9704679012298584, | |
| "num_tokens": 3997170550.0, | |
| "step": 28350 | |
| }, | |
| { | |
| "entropy": 1.3769879937171936, | |
| "epoch": 0.9271652900656198, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 3.6053691927234304e-06, | |
| "loss": 0.1318, | |
| "mean_token_accuracy": 0.9701116299629211, | |
| "num_tokens": 4003683105.0, | |
| "step": 28400 | |
| }, | |
| { | |
| "entropy": 1.36924959897995, | |
| "epoch": 0.9287976233227776, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 3.5974363986848077e-06, | |
| "loss": 0.13, | |
| "mean_token_accuracy": 0.9698223459720612, | |
| "num_tokens": 4010540640.0, | |
| "step": 28450 | |
| }, | |
| { | |
| "entropy": 1.3812626338005065, | |
| "epoch": 0.9304299565799353, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 3.5894992523840146e-06, | |
| "loss": 0.1395, | |
| "mean_token_accuracy": 0.9686692810058594, | |
| "num_tokens": 4017705171.0, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 0.9304299565799353, | |
| "eval_entropy": 1.3810707855224609, | |
| "eval_loss": 0.15061478316783905, | |
| "eval_mean_token_accuracy": 0.9661375037829081, | |
| "eval_num_tokens": 4017705171.0, | |
| "eval_runtime": 742.3916, | |
| "eval_samples_per_second": 13.007, | |
| "eval_steps_per_second": 0.102, | |
| "step": 28500 | |
| }, | |
| { | |
| "entropy": 1.3842783665657044, | |
| "epoch": 0.9320622898370932, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 3.581557811642338e-06, | |
| "loss": 0.1411, | |
| "mean_token_accuracy": 0.9674799299240112, | |
| "num_tokens": 4025097590.0, | |
| "step": 28550 | |
| }, | |
| { | |
| "entropy": 1.374339952468872, | |
| "epoch": 0.9336946230942509, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 3.57361213431235e-06, | |
| "loss": 0.1421, | |
| "mean_token_accuracy": 0.96745934009552, | |
| "num_tokens": 4032859225.0, | |
| "step": 28600 | |
| }, | |
| { | |
| "entropy": 1.3793137764930725, | |
| "epoch": 0.9353269563514087, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 3.565662278277484e-06, | |
| "loss": 0.1371, | |
| "mean_token_accuracy": 0.9678510630130768, | |
| "num_tokens": 4040008030.0, | |
| "step": 28650 | |
| }, | |
| { | |
| "entropy": 1.3788374090194702, | |
| "epoch": 0.9369592896085664, | |
| "grad_norm": 1.3515625, | |
| "learning_rate": 3.5577083014516183e-06, | |
| "loss": 0.1261, | |
| "mean_token_accuracy": 0.9712175786495209, | |
| "num_tokens": 4046560521.0, | |
| "step": 28700 | |
| }, | |
| { | |
| "entropy": 1.3812962436676026, | |
| "epoch": 0.9385916228657243, | |
| "grad_norm": 1.4765625, | |
| "learning_rate": 3.549750261778648e-06, | |
| "loss": 0.1378, | |
| "mean_token_accuracy": 0.9689911651611328, | |
| "num_tokens": 4053568855.0, | |
| "step": 28750 | |
| }, | |
| { | |
| "entropy": 1.384900779724121, | |
| "epoch": 0.940223956122882, | |
| "grad_norm": 3.3125, | |
| "learning_rate": 3.5417882172320663e-06, | |
| "loss": 0.1418, | |
| "mean_token_accuracy": 0.9679834198951721, | |
| "num_tokens": 4060892652.0, | |
| "step": 28800 | |
| }, | |
| { | |
| "entropy": 1.3779156827926635, | |
| "epoch": 0.9418562893800398, | |
| "grad_norm": 1.3515625, | |
| "learning_rate": 3.5338222258145408e-06, | |
| "loss": 0.1246, | |
| "mean_token_accuracy": 0.9717523455619812, | |
| "num_tokens": 4067536985.0, | |
| "step": 28850 | |
| }, | |
| { | |
| "entropy": 1.3658223152160645, | |
| "epoch": 0.9434886226371976, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 3.525852345557493e-06, | |
| "loss": 0.1344, | |
| "mean_token_accuracy": 0.9694867217540741, | |
| "num_tokens": 4075107141.0, | |
| "step": 28900 | |
| }, | |
| { | |
| "entropy": 1.3954302740097047, | |
| "epoch": 0.9451209558943554, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 3.5178786345206746e-06, | |
| "loss": 0.1341, | |
| "mean_token_accuracy": 0.9694812285900116, | |
| "num_tokens": 4082057215.0, | |
| "step": 28950 | |
| }, | |
| { | |
| "entropy": 1.3778202867507934, | |
| "epoch": 0.9467532891515131, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 3.509901150791742e-06, | |
| "loss": 0.1394, | |
| "mean_token_accuracy": 0.9685069918632507, | |
| "num_tokens": 4089240268.0, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 0.9467532891515131, | |
| "eval_entropy": 1.3786554765701293, | |
| "eval_loss": 0.14917373657226562, | |
| "eval_mean_token_accuracy": 0.9661656268437704, | |
| "eval_num_tokens": 4089240268.0, | |
| "eval_runtime": 744.7112, | |
| "eval_samples_per_second": 12.966, | |
| "eval_steps_per_second": 0.102, | |
| "step": 29000 | |
| }, | |
| { | |
| "entropy": 1.3767950320243836, | |
| "epoch": 0.948385622408671, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 3.5019199524858355e-06, | |
| "loss": 0.1245, | |
| "mean_token_accuracy": 0.9718893337249755, | |
| "num_tokens": 4096140756.0, | |
| "step": 29050 | |
| }, | |
| { | |
| "entropy": 1.375689399242401, | |
| "epoch": 0.9500179556658287, | |
| "grad_norm": 1.2890625, | |
| "learning_rate": 3.493935097745158e-06, | |
| "loss": 0.1242, | |
| "mean_token_accuracy": 0.9714548885822296, | |
| "num_tokens": 4102843838.0, | |
| "step": 29100 | |
| }, | |
| { | |
| "entropy": 1.386333782672882, | |
| "epoch": 0.9516502889229865, | |
| "grad_norm": 1.25, | |
| "learning_rate": 3.4859466447385477e-06, | |
| "loss": 0.1364, | |
| "mean_token_accuracy": 0.9688875234127045, | |
| "num_tokens": 4109572295.0, | |
| "step": 29150 | |
| }, | |
| { | |
| "entropy": 1.3935841035842895, | |
| "epoch": 0.9532826221801443, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 3.477954651661055e-06, | |
| "loss": 0.1389, | |
| "mean_token_accuracy": 0.9683682763576508, | |
| "num_tokens": 4116355893.0, | |
| "step": 29200 | |
| }, | |
| { | |
| "entropy": 1.3900840377807617, | |
| "epoch": 0.9549149554373021, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 3.4699591767335203e-06, | |
| "loss": 0.1461, | |
| "mean_token_accuracy": 0.9672015142440796, | |
| "num_tokens": 4123980186.0, | |
| "step": 29250 | |
| }, | |
| { | |
| "entropy": 1.3854497838020325, | |
| "epoch": 0.9565472886944598, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 3.4619602782021497e-06, | |
| "loss": 0.127, | |
| "mean_token_accuracy": 0.971345556974411, | |
| "num_tokens": 4130458823.0, | |
| "step": 29300 | |
| }, | |
| { | |
| "entropy": 1.3904473185539246, | |
| "epoch": 0.9581796219516177, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 3.4539580143380884e-06, | |
| "loss": 0.1345, | |
| "mean_token_accuracy": 0.9702473485469818, | |
| "num_tokens": 4137313489.0, | |
| "step": 29350 | |
| }, | |
| { | |
| "entropy": 1.393708050251007, | |
| "epoch": 0.9598119552087754, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 3.4459524434369967e-06, | |
| "loss": 0.1393, | |
| "mean_token_accuracy": 0.9680514478683472, | |
| "num_tokens": 4144869499.0, | |
| "step": 29400 | |
| }, | |
| { | |
| "entropy": 1.392387228012085, | |
| "epoch": 0.9614442884659332, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 3.437943623818631e-06, | |
| "loss": 0.1244, | |
| "mean_token_accuracy": 0.9713895416259766, | |
| "num_tokens": 4151722923.0, | |
| "step": 29450 | |
| }, | |
| { | |
| "entropy": 1.3936436820030211, | |
| "epoch": 0.9630766217230909, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 3.4299316138264096e-06, | |
| "loss": 0.1435, | |
| "mean_token_accuracy": 0.9673401594161988, | |
| "num_tokens": 4159401264.0, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 0.9630766217230909, | |
| "eval_entropy": 1.377136646906535, | |
| "eval_loss": 0.1480059176683426, | |
| "eval_mean_token_accuracy": 0.9664394434293111, | |
| "eval_num_tokens": 4159401264.0, | |
| "eval_runtime": 746.3254, | |
| "eval_samples_per_second": 12.938, | |
| "eval_steps_per_second": 0.102, | |
| "step": 29500 | |
| }, | |
| { | |
| "entropy": 1.3720522713661194, | |
| "epoch": 0.9647089549802488, | |
| "grad_norm": 0.00274658203125, | |
| "learning_rate": 3.4219164718269925e-06, | |
| "loss": 0.1237, | |
| "mean_token_accuracy": 0.9710344398021697, | |
| "num_tokens": 4166633451.0, | |
| "step": 29550 | |
| }, | |
| { | |
| "entropy": 1.3591685533523559, | |
| "epoch": 0.9663412882374065, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 3.41389825620986e-06, | |
| "loss": 0.1271, | |
| "mean_token_accuracy": 0.9711633479595184, | |
| "num_tokens": 4173854933.0, | |
| "step": 29600 | |
| }, | |
| { | |
| "entropy": 1.3691086530685426, | |
| "epoch": 0.9679736214945643, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 3.405877025386879e-06, | |
| "loss": 0.1329, | |
| "mean_token_accuracy": 0.969690408706665, | |
| "num_tokens": 4180650471.0, | |
| "step": 29650 | |
| }, | |
| { | |
| "entropy": 1.3538868117332459, | |
| "epoch": 0.9696059547517221, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 3.397852837791885e-06, | |
| "loss": 0.1193, | |
| "mean_token_accuracy": 0.9719677448272706, | |
| "num_tokens": 4187582242.0, | |
| "step": 29700 | |
| }, | |
| { | |
| "entropy": 1.3645092558860779, | |
| "epoch": 0.9712382880088799, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 3.389825751880252e-06, | |
| "loss": 0.1333, | |
| "mean_token_accuracy": 0.9694900810718536, | |
| "num_tokens": 4194210247.0, | |
| "step": 29750 | |
| }, | |
| { | |
| "entropy": 1.3627040433883666, | |
| "epoch": 0.9728706212660376, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 3.381795826128467e-06, | |
| "loss": 0.1332, | |
| "mean_token_accuracy": 0.9694800686836242, | |
| "num_tokens": 4201506868.0, | |
| "step": 29800 | |
| }, | |
| { | |
| "entropy": 1.3740008974075317, | |
| "epoch": 0.9745029545231955, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 3.373763119033706e-06, | |
| "loss": 0.132, | |
| "mean_token_accuracy": 0.969896445274353, | |
| "num_tokens": 4208636691.0, | |
| "step": 29850 | |
| }, | |
| { | |
| "entropy": 1.3614243865013123, | |
| "epoch": 0.9761352877803532, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 3.365727689113406e-06, | |
| "loss": 0.1333, | |
| "mean_token_accuracy": 0.9692292737960816, | |
| "num_tokens": 4215943871.0, | |
| "step": 29900 | |
| }, | |
| { | |
| "entropy": 1.3712631511688231, | |
| "epoch": 0.977767621037511, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 3.3576895949048423e-06, | |
| "loss": 0.1314, | |
| "mean_token_accuracy": 0.970370488166809, | |
| "num_tokens": 4222890357.0, | |
| "step": 29950 | |
| }, | |
| { | |
| "entropy": 1.3694654393196106, | |
| "epoch": 0.9793999542946688, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 3.3496488949646945e-06, | |
| "loss": 0.132, | |
| "mean_token_accuracy": 0.9699479579925537, | |
| "num_tokens": 4229911312.0, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.9793999542946688, | |
| "eval_entropy": 1.3661458206176758, | |
| "eval_loss": 0.14751291275024414, | |
| "eval_mean_token_accuracy": 0.9666498748461405, | |
| "eval_num_tokens": 4229911312.0, | |
| "eval_runtime": 747.9793, | |
| "eval_samples_per_second": 12.909, | |
| "eval_steps_per_second": 0.102, | |
| "step": 30000 | |
| }, | |
| { | |
| "entropy": 1.3588277745246886, | |
| "epoch": 0.9810322875518266, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 3.34160564786863e-06, | |
| "loss": 0.1369, | |
| "mean_token_accuracy": 0.968973708152771, | |
| "num_tokens": 4237375692.0, | |
| "step": 30050 | |
| }, | |
| { | |
| "entropy": 1.3632358622550964, | |
| "epoch": 0.9826646208089843, | |
| "grad_norm": 1.46875, | |
| "learning_rate": 3.3335599122108676e-06, | |
| "loss": 0.1282, | |
| "mean_token_accuracy": 0.9707362723350524, | |
| "num_tokens": 4244434501.0, | |
| "step": 30100 | |
| }, | |
| { | |
| "entropy": 1.3589469051361085, | |
| "epoch": 0.9842969540661421, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 3.3255117466037573e-06, | |
| "loss": 0.133, | |
| "mean_token_accuracy": 0.9700418126583099, | |
| "num_tokens": 4251615278.0, | |
| "step": 30150 | |
| }, | |
| { | |
| "entropy": 1.3744734477996827, | |
| "epoch": 0.9859292873232999, | |
| "grad_norm": 1.53125, | |
| "learning_rate": 3.3174612096773496e-06, | |
| "loss": 0.1455, | |
| "mean_token_accuracy": 0.9672206926345825, | |
| "num_tokens": 4259009585.0, | |
| "step": 30200 | |
| }, | |
| { | |
| "entropy": 1.3715810680389404, | |
| "epoch": 0.9875616205804577, | |
| "grad_norm": 1.5, | |
| "learning_rate": 3.3094083600789717e-06, | |
| "loss": 0.1328, | |
| "mean_token_accuracy": 0.969623521566391, | |
| "num_tokens": 4266185168.0, | |
| "step": 30250 | |
| }, | |
| { | |
| "entropy": 1.3768983268737793, | |
| "epoch": 0.9891939538376154, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 3.3013532564727965e-06, | |
| "loss": 0.1399, | |
| "mean_token_accuracy": 0.9680708968639373, | |
| "num_tokens": 4273427348.0, | |
| "step": 30300 | |
| }, | |
| { | |
| "entropy": 1.3672343015670776, | |
| "epoch": 0.9908262870947733, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 3.293295957539418e-06, | |
| "loss": 0.1295, | |
| "mean_token_accuracy": 0.9705338907241822, | |
| "num_tokens": 4280054609.0, | |
| "step": 30350 | |
| }, | |
| { | |
| "entropy": 1.3644690942764282, | |
| "epoch": 0.992458620351931, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 3.2852365219754234e-06, | |
| "loss": 0.1298, | |
| "mean_token_accuracy": 0.9702000212669373, | |
| "num_tokens": 4287354435.0, | |
| "step": 30400 | |
| }, | |
| { | |
| "entropy": 1.3673407602310181, | |
| "epoch": 0.9940909536090888, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 3.2771750084929644e-06, | |
| "loss": 0.1388, | |
| "mean_token_accuracy": 0.9686336624622345, | |
| "num_tokens": 4294494938.0, | |
| "step": 30450 | |
| }, | |
| { | |
| "entropy": 1.3702644801139832, | |
| "epoch": 0.9957232868662466, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 3.26911147581933e-06, | |
| "loss": 0.1348, | |
| "mean_token_accuracy": 0.9689600837230682, | |
| "num_tokens": 4302306234.0, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 0.9957232868662466, | |
| "eval_entropy": 1.3691168228785198, | |
| "eval_loss": 0.14723782241344452, | |
| "eval_mean_token_accuracy": 0.9666363048553467, | |
| "eval_num_tokens": 4302306234.0, | |
| "eval_runtime": 744.3054, | |
| "eval_samples_per_second": 12.973, | |
| "eval_steps_per_second": 0.102, | |
| "step": 30500 | |
| }, | |
| { | |
| "entropy": 1.3547360873222352, | |
| "epoch": 0.9973556201234044, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 3.2610459826965177e-06, | |
| "loss": 0.1191, | |
| "mean_token_accuracy": 0.9728741991519928, | |
| "num_tokens": 4309003875.0, | |
| "step": 30550 | |
| }, | |
| { | |
| "entropy": 1.3677322697639465, | |
| "epoch": 0.9989879533805621, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 3.2529785878808105e-06, | |
| "loss": 0.1382, | |
| "mean_token_accuracy": 0.9686129570007325, | |
| "num_tokens": 4316663353.0, | |
| "step": 30600 | |
| }, | |
| { | |
| "entropy": 1.366257793903351, | |
| "epoch": 1.0006202866377198, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 3.244909350142341e-06, | |
| "loss": 0.1326, | |
| "mean_token_accuracy": 0.9687590861320495, | |
| "num_tokens": 4324200010.0, | |
| "step": 30650 | |
| }, | |
| { | |
| "entropy": 1.3630602145195008, | |
| "epoch": 1.0022526198948778, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 3.2368383282646688e-06, | |
| "loss": 0.1329, | |
| "mean_token_accuracy": 0.9694711458683014, | |
| "num_tokens": 4331478206.0, | |
| "step": 30700 | |
| }, | |
| { | |
| "entropy": 1.366960186958313, | |
| "epoch": 1.0038849531520355, | |
| "grad_norm": 1.5390625, | |
| "learning_rate": 3.2287655810443514e-06, | |
| "loss": 0.1382, | |
| "mean_token_accuracy": 0.9686524891853332, | |
| "num_tokens": 4338814315.0, | |
| "step": 30750 | |
| }, | |
| { | |
| "entropy": 1.3614837670326232, | |
| "epoch": 1.0055172864091932, | |
| "grad_norm": 1.234375, | |
| "learning_rate": 3.220691167290514e-06, | |
| "loss": 0.1387, | |
| "mean_token_accuracy": 0.968714509010315, | |
| "num_tokens": 4346125127.0, | |
| "step": 30800 | |
| }, | |
| { | |
| "entropy": 1.3573535728454589, | |
| "epoch": 1.0071496196663512, | |
| "grad_norm": 1.5390625, | |
| "learning_rate": 3.2126151458244233e-06, | |
| "loss": 0.1204, | |
| "mean_token_accuracy": 0.9723792004585267, | |
| "num_tokens": 4352939968.0, | |
| "step": 30850 | |
| }, | |
| { | |
| "entropy": 1.3520698595046996, | |
| "epoch": 1.008781952923509, | |
| "grad_norm": 1.875, | |
| "learning_rate": 3.2045375754790577e-06, | |
| "loss": 0.1231, | |
| "mean_token_accuracy": 0.9714046669006348, | |
| "num_tokens": 4359619212.0, | |
| "step": 30900 | |
| }, | |
| { | |
| "entropy": 1.3647415375709533, | |
| "epoch": 1.0104142861806666, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 3.196458515098679e-06, | |
| "loss": 0.1372, | |
| "mean_token_accuracy": 0.9691076791286468, | |
| "num_tokens": 4366790518.0, | |
| "step": 30950 | |
| }, | |
| { | |
| "entropy": 1.3572459483146668, | |
| "epoch": 1.0120466194378244, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 3.1883780235384036e-06, | |
| "loss": 0.13, | |
| "mean_token_accuracy": 0.9705017244815827, | |
| "num_tokens": 4373881040.0, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 1.0120466194378244, | |
| "eval_entropy": 1.3610133997599283, | |
| "eval_loss": 0.1467311531305313, | |
| "eval_mean_token_accuracy": 0.9667138489087422, | |
| "eval_num_tokens": 4373881040.0, | |
| "eval_runtime": 749.5562, | |
| "eval_samples_per_second": 12.882, | |
| "eval_steps_per_second": 0.101, | |
| "step": 31000 | |
| }, | |
| { | |
| "entropy": 1.3617827844619752, | |
| "epoch": 1.0136789526949823, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 3.180296159663773e-06, | |
| "loss": 0.1256, | |
| "mean_token_accuracy": 0.9706788539886475, | |
| "num_tokens": 4380849198.0, | |
| "step": 31050 | |
| }, | |
| { | |
| "entropy": 1.35469162940979, | |
| "epoch": 1.01531128595214, | |
| "grad_norm": 1.140625, | |
| "learning_rate": 3.1722129823503283e-06, | |
| "loss": 0.1265, | |
| "mean_token_accuracy": 0.9702699911594391, | |
| "num_tokens": 4388053942.0, | |
| "step": 31100 | |
| }, | |
| { | |
| "entropy": 1.3688899064064026, | |
| "epoch": 1.0169436192092978, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 3.1641285504831776e-06, | |
| "loss": 0.1379, | |
| "mean_token_accuracy": 0.9682463228702545, | |
| "num_tokens": 4395442873.0, | |
| "step": 31150 | |
| }, | |
| { | |
| "entropy": 1.3647472047805786, | |
| "epoch": 1.0185759524664555, | |
| "grad_norm": 1.4453125, | |
| "learning_rate": 3.156042922956568e-06, | |
| "loss": 0.1285, | |
| "mean_token_accuracy": 0.9707048869132996, | |
| "num_tokens": 4402556775.0, | |
| "step": 31200 | |
| }, | |
| { | |
| "entropy": 1.3655140924453735, | |
| "epoch": 1.0202082857236134, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 3.1479561586734553e-06, | |
| "loss": 0.1273, | |
| "mean_token_accuracy": 0.9710086095333099, | |
| "num_tokens": 4409440789.0, | |
| "step": 31250 | |
| }, | |
| { | |
| "entropy": 1.378363606929779, | |
| "epoch": 1.0218406189807712, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 3.139868316545081e-06, | |
| "loss": 0.1413, | |
| "mean_token_accuracy": 0.9676133573055268, | |
| "num_tokens": 4416910867.0, | |
| "step": 31300 | |
| }, | |
| { | |
| "entropy": 1.377373902797699, | |
| "epoch": 1.023472952237929, | |
| "grad_norm": 1.1171875, | |
| "learning_rate": 3.131779455490534e-06, | |
| "loss": 0.1353, | |
| "mean_token_accuracy": 0.9695135807991028, | |
| "num_tokens": 4424153945.0, | |
| "step": 31350 | |
| }, | |
| { | |
| "entropy": 1.3685814261436462, | |
| "epoch": 1.0251052854950866, | |
| "grad_norm": 1.5234375, | |
| "learning_rate": 3.1236896344363276e-06, | |
| "loss": 0.1261, | |
| "mean_token_accuracy": 0.9713682627677918, | |
| "num_tokens": 4431041238.0, | |
| "step": 31400 | |
| }, | |
| { | |
| "entropy": 1.3680060362815858, | |
| "epoch": 1.0267376187522446, | |
| "grad_norm": 1.3984375, | |
| "learning_rate": 3.1155989123159693e-06, | |
| "loss": 0.1308, | |
| "mean_token_accuracy": 0.9699837076663971, | |
| "num_tokens": 4438354536.0, | |
| "step": 31450 | |
| }, | |
| { | |
| "entropy": 1.3511241865158081, | |
| "epoch": 1.0283699520094023, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 3.1075073480695303e-06, | |
| "loss": 0.12, | |
| "mean_token_accuracy": 0.9721428179740905, | |
| "num_tokens": 4445073229.0, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 1.0283699520094023, | |
| "eval_entropy": 1.3678371334075927, | |
| "eval_loss": 0.14646433293819427, | |
| "eval_mean_token_accuracy": 0.9666456254323323, | |
| "eval_num_tokens": 4445073229.0, | |
| "eval_runtime": 748.9165, | |
| "eval_samples_per_second": 12.893, | |
| "eval_steps_per_second": 0.101, | |
| "step": 31500 | |
| }, | |
| { | |
| "entropy": 1.3680097246170044, | |
| "epoch": 1.03000228526656, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 3.099415000643216e-06, | |
| "loss": 0.125, | |
| "mean_token_accuracy": 0.9707134962081909, | |
| "num_tokens": 4452143066.0, | |
| "step": 31550 | |
| }, | |
| { | |
| "entropy": 1.3657612824440002, | |
| "epoch": 1.0316346185237177, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 3.0913219289889375e-06, | |
| "loss": 0.1399, | |
| "mean_token_accuracy": 0.9681135547161103, | |
| "num_tokens": 4459565328.0, | |
| "step": 31600 | |
| }, | |
| { | |
| "entropy": 1.3669775104522706, | |
| "epoch": 1.0332669517808757, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 3.083228192063883e-06, | |
| "loss": 0.1296, | |
| "mean_token_accuracy": 0.9709093308448792, | |
| "num_tokens": 4466669853.0, | |
| "step": 31650 | |
| }, | |
| { | |
| "entropy": 1.373291413784027, | |
| "epoch": 1.0348992850380334, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 3.0751338488300846e-06, | |
| "loss": 0.1319, | |
| "mean_token_accuracy": 0.969772047996521, | |
| "num_tokens": 4473341935.0, | |
| "step": 31700 | |
| }, | |
| { | |
| "entropy": 1.3762017822265624, | |
| "epoch": 1.0365316182951911, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 3.0670389582539956e-06, | |
| "loss": 0.138, | |
| "mean_token_accuracy": 0.968094003200531, | |
| "num_tokens": 4480827308.0, | |
| "step": 31750 | |
| }, | |
| { | |
| "entropy": 1.3670384407043457, | |
| "epoch": 1.0381639515523489, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 3.0589435793060506e-06, | |
| "loss": 0.1257, | |
| "mean_token_accuracy": 0.9709041547775269, | |
| "num_tokens": 4487979209.0, | |
| "step": 31800 | |
| }, | |
| { | |
| "entropy": 1.3619010615348817, | |
| "epoch": 1.0397962848095068, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 3.050847770960248e-06, | |
| "loss": 0.1182, | |
| "mean_token_accuracy": 0.9726350855827331, | |
| "num_tokens": 4494810042.0, | |
| "step": 31850 | |
| }, | |
| { | |
| "entropy": 1.3843055248260498, | |
| "epoch": 1.0414286180666645, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 3.0427515921937097e-06, | |
| "loss": 0.1398, | |
| "mean_token_accuracy": 0.9685020220279693, | |
| "num_tokens": 4502522702.0, | |
| "step": 31900 | |
| }, | |
| { | |
| "entropy": 1.3764786529541015, | |
| "epoch": 1.0430609513238223, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 3.034655101986258e-06, | |
| "loss": 0.1399, | |
| "mean_token_accuracy": 0.9681815671920776, | |
| "num_tokens": 4509891561.0, | |
| "step": 31950 | |
| }, | |
| { | |
| "entropy": 1.3768756079673767, | |
| "epoch": 1.04469328458098, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 3.026558359319985e-06, | |
| "loss": 0.1378, | |
| "mean_token_accuracy": 0.9689823544025421, | |
| "num_tokens": 4517228622.0, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 1.04469328458098, | |
| "eval_entropy": 1.3686085001627604, | |
| "eval_loss": 0.14597955346107483, | |
| "eval_mean_token_accuracy": 0.9667494138081868, | |
| "eval_num_tokens": 4517228622.0, | |
| "eval_runtime": 756.2048, | |
| "eval_samples_per_second": 12.769, | |
| "eval_steps_per_second": 0.101, | |
| "step": 32000 | |
| }, | |
| { | |
| "entropy": 1.3622459721565248, | |
| "epoch": 1.046325617838138, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 3.01846142317882e-06, | |
| "loss": 0.1246, | |
| "mean_token_accuracy": 0.971124712228775, | |
| "num_tokens": 4524548751.0, | |
| "step": 32050 | |
| }, | |
| { | |
| "entropy": 1.360032732486725, | |
| "epoch": 1.0479579510952957, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 3.0103643525481026e-06, | |
| "loss": 0.1299, | |
| "mean_token_accuracy": 0.970292786359787, | |
| "num_tokens": 4531455869.0, | |
| "step": 32100 | |
| }, | |
| { | |
| "entropy": 1.3518124055862426, | |
| "epoch": 1.0495902843524534, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 3.0022672064141524e-06, | |
| "loss": 0.1269, | |
| "mean_token_accuracy": 0.971394385099411, | |
| "num_tokens": 4538509160.0, | |
| "step": 32150 | |
| }, | |
| { | |
| "entropy": 1.3580170464515686, | |
| "epoch": 1.051222617609611, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 2.9941700437638386e-06, | |
| "loss": 0.1289, | |
| "mean_token_accuracy": 0.9704779148101806, | |
| "num_tokens": 4545863027.0, | |
| "step": 32200 | |
| }, | |
| { | |
| "entropy": 1.3661806869506836, | |
| "epoch": 1.052854950866769, | |
| "grad_norm": 1.3359375, | |
| "learning_rate": 2.986072923584151e-06, | |
| "loss": 0.1374, | |
| "mean_token_accuracy": 0.9684454727172852, | |
| "num_tokens": 4553360974.0, | |
| "step": 32250 | |
| }, | |
| { | |
| "entropy": 1.3564281272888183, | |
| "epoch": 1.0544872841239268, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 2.9779759048617704e-06, | |
| "loss": 0.1416, | |
| "mean_token_accuracy": 0.9682377851009369, | |
| "num_tokens": 4560701368.0, | |
| "step": 32300 | |
| }, | |
| { | |
| "entropy": 1.3551061296463012, | |
| "epoch": 1.0561196173810845, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 2.9698790465826377e-06, | |
| "loss": 0.1241, | |
| "mean_token_accuracy": 0.9714620614051819, | |
| "num_tokens": 4567111828.0, | |
| "step": 32350 | |
| }, | |
| { | |
| "entropy": 1.3527950978279113, | |
| "epoch": 1.0577519506382422, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 2.961782407731525e-06, | |
| "loss": 0.1337, | |
| "mean_token_accuracy": 0.9692088150978089, | |
| "num_tokens": 4574301428.0, | |
| "step": 32400 | |
| }, | |
| { | |
| "entropy": 1.3572787022590638, | |
| "epoch": 1.0593842838954002, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 2.953686047291606e-06, | |
| "loss": 0.1286, | |
| "mean_token_accuracy": 0.9706799817085267, | |
| "num_tokens": 4581456456.0, | |
| "step": 32450 | |
| }, | |
| { | |
| "entropy": 1.3538258695602416, | |
| "epoch": 1.061016617152558, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 2.945590024244026e-06, | |
| "loss": 0.1297, | |
| "mean_token_accuracy": 0.9701016509532928, | |
| "num_tokens": 4588137631.0, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 1.061016617152558, | |
| "eval_entropy": 1.3580620272954305, | |
| "eval_loss": 0.14589445292949677, | |
| "eval_mean_token_accuracy": 0.9668857765197754, | |
| "eval_num_tokens": 4588137631.0, | |
| "eval_runtime": 748.9222, | |
| "eval_samples_per_second": 12.893, | |
| "eval_steps_per_second": 0.101, | |
| "step": 32500 | |
| }, | |
| { | |
| "entropy": 1.361629192829132, | |
| "epoch": 1.0626489504097156, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 2.9374943975674745e-06, | |
| "loss": 0.138, | |
| "mean_token_accuracy": 0.9691119182109833, | |
| "num_tokens": 4595619130.0, | |
| "step": 32550 | |
| }, | |
| { | |
| "entropy": 1.3559014773368836, | |
| "epoch": 1.0642812836668734, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 2.92939922623775e-06, | |
| "loss": 0.1239, | |
| "mean_token_accuracy": 0.9716914188861847, | |
| "num_tokens": 4602722754.0, | |
| "step": 32600 | |
| }, | |
| { | |
| "entropy": 1.3606642532348632, | |
| "epoch": 1.0659136169240313, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 2.921304569227337e-06, | |
| "loss": 0.1308, | |
| "mean_token_accuracy": 0.9696120321750641, | |
| "num_tokens": 4609681202.0, | |
| "step": 32650 | |
| }, | |
| { | |
| "entropy": 1.3540519714355468, | |
| "epoch": 1.067545950181189, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 2.913210485504971e-06, | |
| "loss": 0.1191, | |
| "mean_token_accuracy": 0.972172474861145, | |
| "num_tokens": 4616745205.0, | |
| "step": 32700 | |
| }, | |
| { | |
| "entropy": 1.3715915560722352, | |
| "epoch": 1.0691782834383468, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 2.9051170340352125e-06, | |
| "loss": 0.1348, | |
| "mean_token_accuracy": 0.9693544006347656, | |
| "num_tokens": 4624203423.0, | |
| "step": 32750 | |
| }, | |
| { | |
| "entropy": 1.3620137906074523, | |
| "epoch": 1.0708106166955045, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 2.8970242737780152e-06, | |
| "loss": 0.1319, | |
| "mean_token_accuracy": 0.9693595457077027, | |
| "num_tokens": 4631098385.0, | |
| "step": 32800 | |
| }, | |
| { | |
| "entropy": 1.3573533582687378, | |
| "epoch": 1.0724429499526624, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 2.8889322636882975e-06, | |
| "loss": 0.1149, | |
| "mean_token_accuracy": 0.9735664069652558, | |
| "num_tokens": 4637689978.0, | |
| "step": 32850 | |
| }, | |
| { | |
| "entropy": 1.3642809319496154, | |
| "epoch": 1.0740752832098202, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 2.8808410627155142e-06, | |
| "loss": 0.1288, | |
| "mean_token_accuracy": 0.9694396567344665, | |
| "num_tokens": 4644751687.0, | |
| "step": 32900 | |
| }, | |
| { | |
| "entropy": 1.34871666431427, | |
| "epoch": 1.0757076164669779, | |
| "grad_norm": 1.234375, | |
| "learning_rate": 2.8727507298032246e-06, | |
| "loss": 0.1298, | |
| "mean_token_accuracy": 0.9703471696376801, | |
| "num_tokens": 4651717972.0, | |
| "step": 32950 | |
| }, | |
| { | |
| "entropy": 1.3684996843338013, | |
| "epoch": 1.0773399497241356, | |
| "grad_norm": 1.390625, | |
| "learning_rate": 2.864661323888664e-06, | |
| "loss": 0.1341, | |
| "mean_token_accuracy": 0.9694813418388367, | |
| "num_tokens": 4659247374.0, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 1.0773399497241356, | |
| "eval_entropy": 1.3517811473210652, | |
| "eval_loss": 0.14623871445655823, | |
| "eval_mean_token_accuracy": 0.9667705456415813, | |
| "eval_num_tokens": 4659247374.0, | |
| "eval_runtime": 756.2099, | |
| "eval_samples_per_second": 12.769, | |
| "eval_steps_per_second": 0.101, | |
| "step": 33000 | |
| }, | |
| { | |
| "entropy": 1.3631637930870055, | |
| "epoch": 1.0789722829812936, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 2.8565729039023154e-06, | |
| "loss": 0.134, | |
| "mean_token_accuracy": 0.9690206825733185, | |
| "num_tokens": 4666487708.0, | |
| "step": 33050 | |
| }, | |
| { | |
| "entropy": 1.3550728225708009, | |
| "epoch": 1.0806046162384513, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 2.8484855287674787e-06, | |
| "loss": 0.139, | |
| "mean_token_accuracy": 0.9683072865009308, | |
| "num_tokens": 4673679571.0, | |
| "step": 33100 | |
| }, | |
| { | |
| "entropy": 1.3468902921676635, | |
| "epoch": 1.082236949495609, | |
| "grad_norm": 0.95703125, | |
| "learning_rate": 2.8403992573998416e-06, | |
| "loss": 0.1287, | |
| "mean_token_accuracy": 0.9701869285106659, | |
| "num_tokens": 4680648568.0, | |
| "step": 33150 | |
| }, | |
| { | |
| "entropy": 1.3574704766273498, | |
| "epoch": 1.0838692827527667, | |
| "grad_norm": 2.0, | |
| "learning_rate": 2.8323141487070544e-06, | |
| "loss": 0.1252, | |
| "mean_token_accuracy": 0.9709026992321015, | |
| "num_tokens": 4687624851.0, | |
| "step": 33200 | |
| }, | |
| { | |
| "entropy": 1.3406636595726014, | |
| "epoch": 1.0855016160099247, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 2.824230261588294e-06, | |
| "loss": 0.123, | |
| "mean_token_accuracy": 0.9712547302246094, | |
| "num_tokens": 4694450430.0, | |
| "step": 33250 | |
| }, | |
| { | |
| "entropy": 1.3555320692062378, | |
| "epoch": 1.0871339492670824, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 2.816147654933839e-06, | |
| "loss": 0.137, | |
| "mean_token_accuracy": 0.9693058180809021, | |
| "num_tokens": 4701897095.0, | |
| "step": 33300 | |
| }, | |
| { | |
| "entropy": 1.3499479746818543, | |
| "epoch": 1.0887662825242401, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 2.8080663876246394e-06, | |
| "loss": 0.1231, | |
| "mean_token_accuracy": 0.9722630488872528, | |
| "num_tokens": 4708881122.0, | |
| "step": 33350 | |
| }, | |
| { | |
| "entropy": 1.3455316138267517, | |
| "epoch": 1.0903986157813979, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 2.79998651853189e-06, | |
| "loss": 0.1199, | |
| "mean_token_accuracy": 0.9716216671466827, | |
| "num_tokens": 4716074845.0, | |
| "step": 33400 | |
| }, | |
| { | |
| "entropy": 1.359624376296997, | |
| "epoch": 1.0920309490385558, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 2.7919081065165985e-06, | |
| "loss": 0.1257, | |
| "mean_token_accuracy": 0.9711248898506164, | |
| "num_tokens": 4723316565.0, | |
| "step": 33450 | |
| }, | |
| { | |
| "entropy": 1.3494191646575928, | |
| "epoch": 1.0936632822957135, | |
| "grad_norm": 1.53125, | |
| "learning_rate": 2.7838312104291584e-06, | |
| "loss": 0.1323, | |
| "mean_token_accuracy": 0.9694008147716522, | |
| "num_tokens": 4730650888.0, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 1.0936632822957135, | |
| "eval_entropy": 1.353581156730652, | |
| "eval_loss": 0.14585214853286743, | |
| "eval_mean_token_accuracy": 0.9667929395039876, | |
| "eval_num_tokens": 4730650888.0, | |
| "eval_runtime": 754.4564, | |
| "eval_samples_per_second": 12.799, | |
| "eval_steps_per_second": 0.101, | |
| "step": 33500 | |
| }, | |
| { | |
| "entropy": 1.3704795885086059, | |
| "epoch": 1.0952956155528712, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 2.775755889108919e-06, | |
| "loss": 0.134, | |
| "mean_token_accuracy": 0.9687422275543213, | |
| "num_tokens": 4737974169.0, | |
| "step": 33550 | |
| }, | |
| { | |
| "entropy": 1.3608631157875062, | |
| "epoch": 1.096927948810029, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 2.7676822013837588e-06, | |
| "loss": 0.1299, | |
| "mean_token_accuracy": 0.9702245342731476, | |
| "num_tokens": 4745212589.0, | |
| "step": 33600 | |
| }, | |
| { | |
| "entropy": 1.3445592832565307, | |
| "epoch": 1.098560282067187, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 2.7596102060696543e-06, | |
| "loss": 0.123, | |
| "mean_token_accuracy": 0.9716692876815796, | |
| "num_tokens": 4752104456.0, | |
| "step": 33650 | |
| }, | |
| { | |
| "entropy": 1.3646861577033997, | |
| "epoch": 1.1001926153243446, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 2.7515399619702545e-06, | |
| "loss": 0.1286, | |
| "mean_token_accuracy": 0.9707652199268341, | |
| "num_tokens": 4759026284.0, | |
| "step": 33700 | |
| }, | |
| { | |
| "entropy": 1.3495210075378419, | |
| "epoch": 1.1018249485815024, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 2.7434715278764494e-06, | |
| "loss": 0.1298, | |
| "mean_token_accuracy": 0.9711257350444794, | |
| "num_tokens": 4765987412.0, | |
| "step": 33750 | |
| }, | |
| { | |
| "entropy": 1.3825481986999513, | |
| "epoch": 1.10345728183866, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 2.735404962565945e-06, | |
| "loss": 0.1417, | |
| "mean_token_accuracy": 0.9679384648799896, | |
| "num_tokens": 4773475530.0, | |
| "step": 33800 | |
| }, | |
| { | |
| "entropy": 1.3517045164108277, | |
| "epoch": 1.105089615095818, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 2.7273403248028325e-06, | |
| "loss": 0.1183, | |
| "mean_token_accuracy": 0.9723455941677094, | |
| "num_tokens": 4780068847.0, | |
| "step": 33850 | |
| }, | |
| { | |
| "entropy": 1.3678963851928712, | |
| "epoch": 1.1067219483529758, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 2.7192776733371608e-06, | |
| "loss": 0.1354, | |
| "mean_token_accuracy": 0.9694834208488464, | |
| "num_tokens": 4786993711.0, | |
| "step": 33900 | |
| }, | |
| { | |
| "entropy": 1.3571431303024293, | |
| "epoch": 1.1083542816101335, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 2.711217066904509e-06, | |
| "loss": 0.1212, | |
| "mean_token_accuracy": 0.9715266978740692, | |
| "num_tokens": 4793644363.0, | |
| "step": 33950 | |
| }, | |
| { | |
| "entropy": 1.3623171138763428, | |
| "epoch": 1.1099866148672912, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 2.7031585642255596e-06, | |
| "loss": 0.1279, | |
| "mean_token_accuracy": 0.970818110704422, | |
| "num_tokens": 4800772089.0, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 1.1099866148672912, | |
| "eval_entropy": 1.3600939814249675, | |
| "eval_loss": 0.14549146592617035, | |
| "eval_mean_token_accuracy": 0.9668245681126912, | |
| "eval_num_tokens": 4800772089.0, | |
| "eval_runtime": 753.1399, | |
| "eval_samples_per_second": 12.821, | |
| "eval_steps_per_second": 0.101, | |
| "step": 34000 | |
| }, | |
| { | |
| "entropy": 1.3657611656188964, | |
| "epoch": 1.1116189481244492, | |
| "grad_norm": 1.375, | |
| "learning_rate": 2.695102224005667e-06, | |
| "loss": 0.1312, | |
| "mean_token_accuracy": 0.9701169800758361, | |
| "num_tokens": 4808026786.0, | |
| "step": 34050 | |
| }, | |
| { | |
| "entropy": 1.3706095337867736, | |
| "epoch": 1.113251281381607, | |
| "grad_norm": 0.002044677734375, | |
| "learning_rate": 2.687048104934434e-06, | |
| "loss": 0.1344, | |
| "mean_token_accuracy": 0.9694833195209503, | |
| "num_tokens": 4815351616.0, | |
| "step": 34100 | |
| }, | |
| { | |
| "entropy": 1.3567158889770508, | |
| "epoch": 1.1148836146387646, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 2.6789962656852835e-06, | |
| "loss": 0.1273, | |
| "mean_token_accuracy": 0.9710471928119659, | |
| "num_tokens": 4822489624.0, | |
| "step": 34150 | |
| }, | |
| { | |
| "entropy": 1.3655909848213197, | |
| "epoch": 1.1165159478959223, | |
| "grad_norm": 0.3984375, | |
| "learning_rate": 2.6709467649150276e-06, | |
| "loss": 0.138, | |
| "mean_token_accuracy": 0.9684849452972412, | |
| "num_tokens": 4830165631.0, | |
| "step": 34200 | |
| }, | |
| { | |
| "entropy": 1.3546900820732117, | |
| "epoch": 1.1181482811530803, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 2.662899661263445e-06, | |
| "loss": 0.1259, | |
| "mean_token_accuracy": 0.9711956691741943, | |
| "num_tokens": 4836745329.0, | |
| "step": 34250 | |
| }, | |
| { | |
| "entropy": 1.361421148777008, | |
| "epoch": 1.119780614410238, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 2.654855013352849e-06, | |
| "loss": 0.1297, | |
| "mean_token_accuracy": 0.9703176605701447, | |
| "num_tokens": 4843917385.0, | |
| "step": 34300 | |
| }, | |
| { | |
| "entropy": 1.3671817374229431, | |
| "epoch": 1.1214129476673957, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 2.646812879787668e-06, | |
| "loss": 0.1262, | |
| "mean_token_accuracy": 0.9710332584381104, | |
| "num_tokens": 4850936707.0, | |
| "step": 34350 | |
| }, | |
| { | |
| "entropy": 1.368682358264923, | |
| "epoch": 1.1230452809245535, | |
| "grad_norm": 1.3046875, | |
| "learning_rate": 2.6387733191540083e-06, | |
| "loss": 0.1271, | |
| "mean_token_accuracy": 0.9702812135219574, | |
| "num_tokens": 4857774583.0, | |
| "step": 34400 | |
| }, | |
| { | |
| "entropy": 1.357949526309967, | |
| "epoch": 1.1246776141817114, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 2.6307363900192354e-06, | |
| "loss": 0.1344, | |
| "mean_token_accuracy": 0.9693097794055938, | |
| "num_tokens": 4864886795.0, | |
| "step": 34450 | |
| }, | |
| { | |
| "entropy": 1.367612702846527, | |
| "epoch": 1.1263099474388691, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 2.6227021509315442e-06, | |
| "loss": 0.1312, | |
| "mean_token_accuracy": 0.9697531294822693, | |
| "num_tokens": 4872140576.0, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 1.1263099474388691, | |
| "eval_entropy": 1.3573125632603964, | |
| "eval_loss": 0.14544960856437683, | |
| "eval_mean_token_accuracy": 0.9669329651196797, | |
| "eval_num_tokens": 4872140576.0, | |
| "eval_runtime": 753.8369, | |
| "eval_samples_per_second": 12.809, | |
| "eval_steps_per_second": 0.101, | |
| "step": 34500 | |
| }, | |
| { | |
| "entropy": 1.3483564281463623, | |
| "epoch": 1.1279422806960269, | |
| "grad_norm": 1.5, | |
| "learning_rate": 2.614670660419533e-06, | |
| "loss": 0.1174, | |
| "mean_token_accuracy": 0.9726065421104431, | |
| "num_tokens": 4879225657.0, | |
| "step": 34550 | |
| }, | |
| { | |
| "entropy": 1.3606638669967652, | |
| "epoch": 1.1295746139531846, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 2.606641976991775e-06, | |
| "loss": 0.1254, | |
| "mean_token_accuracy": 0.9706631207466125, | |
| "num_tokens": 4886242099.0, | |
| "step": 34600 | |
| }, | |
| { | |
| "entropy": 1.3547830367088318, | |
| "epoch": 1.1312069472103425, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 2.5986161591363984e-06, | |
| "loss": 0.1294, | |
| "mean_token_accuracy": 0.9702156925201416, | |
| "num_tokens": 4892983817.0, | |
| "step": 34650 | |
| }, | |
| { | |
| "entropy": 1.3496627926826477, | |
| "epoch": 1.1328392804675003, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 2.590593265320652e-06, | |
| "loss": 0.1236, | |
| "mean_token_accuracy": 0.9711934244632721, | |
| "num_tokens": 4900048536.0, | |
| "step": 34700 | |
| }, | |
| { | |
| "entropy": 1.3515707707405091, | |
| "epoch": 1.134471613724658, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 2.582573353990486e-06, | |
| "loss": 0.1279, | |
| "mean_token_accuracy": 0.9710315072536468, | |
| "num_tokens": 4906893370.0, | |
| "step": 34750 | |
| }, | |
| { | |
| "entropy": 1.3535165977478028, | |
| "epoch": 1.1361039469818157, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 2.5745564835701206e-06, | |
| "loss": 0.1204, | |
| "mean_token_accuracy": 0.9720440351963043, | |
| "num_tokens": 4913536928.0, | |
| "step": 34800 | |
| }, | |
| { | |
| "entropy": 1.3644448471069337, | |
| "epoch": 1.1377362802389737, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 2.5665427124616256e-06, | |
| "loss": 0.1317, | |
| "mean_token_accuracy": 0.970216943025589, | |
| "num_tokens": 4920499397.0, | |
| "step": 34850 | |
| }, | |
| { | |
| "entropy": 1.3579574704170227, | |
| "epoch": 1.1393686134961314, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 2.5585320990444923e-06, | |
| "loss": 0.1277, | |
| "mean_token_accuracy": 0.970561819076538, | |
| "num_tokens": 4928000813.0, | |
| "step": 34900 | |
| }, | |
| { | |
| "entropy": 1.3763416075706483, | |
| "epoch": 1.1410009467532891, | |
| "grad_norm": 1.3046875, | |
| "learning_rate": 2.550524701675208e-06, | |
| "loss": 0.1359, | |
| "mean_token_accuracy": 0.9684149813652039, | |
| "num_tokens": 4935032419.0, | |
| "step": 34950 | |
| }, | |
| { | |
| "entropy": 1.3512770438194275, | |
| "epoch": 1.1426332800104468, | |
| "grad_norm": 1.2890625, | |
| "learning_rate": 2.542520578686831e-06, | |
| "loss": 0.1211, | |
| "mean_token_accuracy": 0.971741670370102, | |
| "num_tokens": 4941410711.0, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 1.1426332800104468, | |
| "eval_entropy": 1.3627102088928222, | |
| "eval_loss": 0.14536549150943756, | |
| "eval_mean_token_accuracy": 0.9668985676765441, | |
| "eval_num_tokens": 4941410711.0, | |
| "eval_runtime": 752.233, | |
| "eval_samples_per_second": 12.836, | |
| "eval_steps_per_second": 0.101, | |
| "step": 35000 | |
| }, | |
| { | |
| "entropy": 1.3602009153366088, | |
| "epoch": 1.1442656132676048, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 2.5345197883885677e-06, | |
| "loss": 0.1279, | |
| "mean_token_accuracy": 0.9709439516067505, | |
| "num_tokens": 4948340334.0, | |
| "step": 35050 | |
| }, | |
| { | |
| "entropy": 1.3565789103507995, | |
| "epoch": 1.1458979465247625, | |
| "grad_norm": 1.3515625, | |
| "learning_rate": 2.526522389065345e-06, | |
| "loss": 0.1356, | |
| "mean_token_accuracy": 0.9685303854942322, | |
| "num_tokens": 4956282307.0, | |
| "step": 35100 | |
| }, | |
| { | |
| "entropy": 1.3583333039283751, | |
| "epoch": 1.1475302797819202, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 2.518528438977387e-06, | |
| "loss": 0.1214, | |
| "mean_token_accuracy": 0.9718796277046203, | |
| "num_tokens": 4963542094.0, | |
| "step": 35150 | |
| }, | |
| { | |
| "entropy": 1.35888774394989, | |
| "epoch": 1.149162613039078, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 2.51053799635979e-06, | |
| "loss": 0.1341, | |
| "mean_token_accuracy": 0.9693203794956208, | |
| "num_tokens": 4970941813.0, | |
| "step": 35200 | |
| }, | |
| { | |
| "entropy": 1.3589514350891114, | |
| "epoch": 1.150794946296236, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 2.5025511194221e-06, | |
| "loss": 0.1371, | |
| "mean_token_accuracy": 0.9692372989654541, | |
| "num_tokens": 4978422565.0, | |
| "step": 35250 | |
| }, | |
| { | |
| "entropy": 1.3571251654624938, | |
| "epoch": 1.1524272795533936, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 2.494567866347887e-06, | |
| "loss": 0.1301, | |
| "mean_token_accuracy": 0.9706821513175964, | |
| "num_tokens": 4985066771.0, | |
| "step": 35300 | |
| }, | |
| { | |
| "entropy": 1.3489887595176697, | |
| "epoch": 1.1540596128105514, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 2.4865882952943194e-06, | |
| "loss": 0.1179, | |
| "mean_token_accuracy": 0.9729493832588196, | |
| "num_tokens": 4991808794.0, | |
| "step": 35350 | |
| }, | |
| { | |
| "entropy": 1.3580448365211486, | |
| "epoch": 1.155691946067709, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 2.478612464391746e-06, | |
| "loss": 0.1267, | |
| "mean_token_accuracy": 0.9707785761356353, | |
| "num_tokens": 4998905781.0, | |
| "step": 35400 | |
| }, | |
| { | |
| "entropy": 1.35804701089859, | |
| "epoch": 1.157324279324867, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 2.470640431743268e-06, | |
| "loss": 0.1381, | |
| "mean_token_accuracy": 0.9687949836254119, | |
| "num_tokens": 5006205777.0, | |
| "step": 35450 | |
| }, | |
| { | |
| "entropy": 1.361290261745453, | |
| "epoch": 1.1589566125820248, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 2.4626722554243144e-06, | |
| "loss": 0.1272, | |
| "mean_token_accuracy": 0.9700055694580079, | |
| "num_tokens": 5013371710.0, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 1.1589566125820248, | |
| "eval_entropy": 1.359473959604899, | |
| "eval_loss": 0.14513231813907623, | |
| "eval_mean_token_accuracy": 0.9669572798411051, | |
| "eval_num_tokens": 5013371710.0, | |
| "eval_runtime": 750.7651, | |
| "eval_samples_per_second": 12.862, | |
| "eval_steps_per_second": 0.101, | |
| "step": 35500 | |
| }, | |
| { | |
| "entropy": 1.3512133407592772, | |
| "epoch": 1.1605889458391825, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 2.454707993482224e-06, | |
| "loss": 0.1272, | |
| "mean_token_accuracy": 0.9703232657909393, | |
| "num_tokens": 5020188118.0, | |
| "step": 35550 | |
| }, | |
| { | |
| "entropy": 1.3578423738479615, | |
| "epoch": 1.1622212790963404, | |
| "grad_norm": 1.5, | |
| "learning_rate": 2.446747703935818e-06, | |
| "loss": 0.1222, | |
| "mean_token_accuracy": 0.9719510304927826, | |
| "num_tokens": 5026929276.0, | |
| "step": 35600 | |
| }, | |
| { | |
| "entropy": 1.3523945426940918, | |
| "epoch": 1.1638536123534982, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 2.4387914447749802e-06, | |
| "loss": 0.131, | |
| "mean_token_accuracy": 0.9701538634300232, | |
| "num_tokens": 5034103966.0, | |
| "step": 35650 | |
| }, | |
| { | |
| "entropy": 1.3755294966697693, | |
| "epoch": 1.1654859456106559, | |
| "grad_norm": 1.359375, | |
| "learning_rate": 2.4308392739602323e-06, | |
| "loss": 0.138, | |
| "mean_token_accuracy": 0.968312075138092, | |
| "num_tokens": 5041450508.0, | |
| "step": 35700 | |
| }, | |
| { | |
| "entropy": 1.3669376826286317, | |
| "epoch": 1.1671182788678136, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 2.4228912494223137e-06, | |
| "loss": 0.1333, | |
| "mean_token_accuracy": 0.9692367768287659, | |
| "num_tokens": 5048332405.0, | |
| "step": 35750 | |
| }, | |
| { | |
| "entropy": 1.371087245941162, | |
| "epoch": 1.1687506121249713, | |
| "grad_norm": 2.375, | |
| "learning_rate": 2.414947429061759e-06, | |
| "loss": 0.1322, | |
| "mean_token_accuracy": 0.9700936663150788, | |
| "num_tokens": 5055257003.0, | |
| "step": 35800 | |
| }, | |
| { | |
| "entropy": 1.374721155166626, | |
| "epoch": 1.1703829453821293, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 2.4070078707484743e-06, | |
| "loss": 0.1387, | |
| "mean_token_accuracy": 0.9685248970985413, | |
| "num_tokens": 5062113906.0, | |
| "step": 35850 | |
| }, | |
| { | |
| "entropy": 1.373306679725647, | |
| "epoch": 1.172015278639287, | |
| "grad_norm": 1.34375, | |
| "learning_rate": 2.399072632321319e-06, | |
| "loss": 0.1278, | |
| "mean_token_accuracy": 0.9704372000694275, | |
| "num_tokens": 5069153056.0, | |
| "step": 35900 | |
| }, | |
| { | |
| "entropy": 1.362773072719574, | |
| "epoch": 1.1736476118964447, | |
| "grad_norm": 3.0, | |
| "learning_rate": 2.3911417715876806e-06, | |
| "loss": 0.1175, | |
| "mean_token_accuracy": 0.9727724301815033, | |
| "num_tokens": 5075547365.0, | |
| "step": 35950 | |
| }, | |
| { | |
| "entropy": 1.361773042678833, | |
| "epoch": 1.1752799451536027, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 2.383215346323058e-06, | |
| "loss": 0.1318, | |
| "mean_token_accuracy": 0.9693261981010437, | |
| "num_tokens": 5082553584.0, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 1.1752799451536027, | |
| "eval_entropy": 1.3677510404586792, | |
| "eval_loss": 0.1451091319322586, | |
| "eval_mean_token_accuracy": 0.9669363768895467, | |
| "eval_num_tokens": 5082553584.0, | |
| "eval_runtime": 752.6099, | |
| "eval_samples_per_second": 12.83, | |
| "eval_steps_per_second": 0.101, | |
| "step": 36000 | |
| }, | |
| { | |
| "entropy": 1.367734661102295, | |
| "epoch": 1.1769122784107604, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 2.3752934142706355e-06, | |
| "loss": 0.1292, | |
| "mean_token_accuracy": 0.970736186504364, | |
| "num_tokens": 5089460622.0, | |
| "step": 36050 | |
| }, | |
| { | |
| "entropy": 1.3702288055419922, | |
| "epoch": 1.1785446116679181, | |
| "grad_norm": 1.6796875, | |
| "learning_rate": 2.3673760331408664e-06, | |
| "loss": 0.1245, | |
| "mean_token_accuracy": 0.9706618010997772, | |
| "num_tokens": 5096477290.0, | |
| "step": 36100 | |
| }, | |
| { | |
| "entropy": 1.3666929292678833, | |
| "epoch": 1.1801769449250759, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 2.3594632606110514e-06, | |
| "loss": 0.1348, | |
| "mean_token_accuracy": 0.9692357456684113, | |
| "num_tokens": 5103888303.0, | |
| "step": 36150 | |
| }, | |
| { | |
| "entropy": 1.3703182339668274, | |
| "epoch": 1.1818092781822336, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 2.351555154324916e-06, | |
| "loss": 0.1352, | |
| "mean_token_accuracy": 0.9687790739536285, | |
| "num_tokens": 5111598482.0, | |
| "step": 36200 | |
| }, | |
| { | |
| "entropy": 1.3647811126708984, | |
| "epoch": 1.1834416114393915, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 2.3436517718921944e-06, | |
| "loss": 0.123, | |
| "mean_token_accuracy": 0.9715970456600189, | |
| "num_tokens": 5118205523.0, | |
| "step": 36250 | |
| }, | |
| { | |
| "entropy": 1.3724281644821168, | |
| "epoch": 1.1850739446965493, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 2.3357531708882084e-06, | |
| "loss": 0.1351, | |
| "mean_token_accuracy": 0.9688549792766571, | |
| "num_tokens": 5125390728.0, | |
| "step": 36300 | |
| }, | |
| { | |
| "entropy": 1.3688122749328613, | |
| "epoch": 1.186706277953707, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 2.3278594088534453e-06, | |
| "loss": 0.1248, | |
| "mean_token_accuracy": 0.9709884691238403, | |
| "num_tokens": 5132430170.0, | |
| "step": 36350 | |
| }, | |
| { | |
| "entropy": 1.371947205066681, | |
| "epoch": 1.188338611210865, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 2.319970543293144e-06, | |
| "loss": 0.1299, | |
| "mean_token_accuracy": 0.9705728948116302, | |
| "num_tokens": 5139505776.0, | |
| "step": 36400 | |
| }, | |
| { | |
| "entropy": 1.3706382060050963, | |
| "epoch": 1.1899709444680227, | |
| "grad_norm": 1.34375, | |
| "learning_rate": 2.3120866316768705e-06, | |
| "loss": 0.1288, | |
| "mean_token_accuracy": 0.9700265216827393, | |
| "num_tokens": 5146615876.0, | |
| "step": 36450 | |
| }, | |
| { | |
| "entropy": 1.3712089610099794, | |
| "epoch": 1.1916032777251804, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 2.3042077314381025e-06, | |
| "loss": 0.1227, | |
| "mean_token_accuracy": 0.9712138116359711, | |
| "num_tokens": 5153410420.0, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 1.1916032777251804, | |
| "eval_entropy": 1.369815084139506, | |
| "eval_loss": 0.14498043060302734, | |
| "eval_mean_token_accuracy": 0.9671436421076457, | |
| "eval_num_tokens": 5153410420.0, | |
| "eval_runtime": 750.4787, | |
| "eval_samples_per_second": 12.866, | |
| "eval_steps_per_second": 0.101, | |
| "step": 36500 | |
| }, | |
| { | |
| "entropy": 1.3633040881156921, | |
| "epoch": 1.193235610982338, | |
| "grad_norm": 1.34375, | |
| "learning_rate": 2.2963338999738103e-06, | |
| "loss": 0.1321, | |
| "mean_token_accuracy": 0.9702033531665802, | |
| "num_tokens": 5160782174.0, | |
| "step": 36550 | |
| }, | |
| { | |
| "entropy": 1.3697287273406982, | |
| "epoch": 1.1948679442394958, | |
| "grad_norm": 1.625, | |
| "learning_rate": 2.288465194644041e-06, | |
| "loss": 0.1346, | |
| "mean_token_accuracy": 0.9692202270030975, | |
| "num_tokens": 5168040473.0, | |
| "step": 36600 | |
| }, | |
| { | |
| "entropy": 1.3875136041641236, | |
| "epoch": 1.1965002774966538, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 2.2806016727714953e-06, | |
| "loss": 0.1355, | |
| "mean_token_accuracy": 0.9688812565803527, | |
| "num_tokens": 5175213322.0, | |
| "step": 36650 | |
| }, | |
| { | |
| "entropy": 1.3730654883384705, | |
| "epoch": 1.1981326107538115, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 2.272743391641114e-06, | |
| "loss": 0.123, | |
| "mean_token_accuracy": 0.9711851370334625, | |
| "num_tokens": 5182005797.0, | |
| "step": 36700 | |
| }, | |
| { | |
| "entropy": 1.3739676403999328, | |
| "epoch": 1.1997649440109692, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 2.2648904084996593e-06, | |
| "loss": 0.1329, | |
| "mean_token_accuracy": 0.9690020906925202, | |
| "num_tokens": 5188915363.0, | |
| "step": 36750 | |
| }, | |
| { | |
| "entropy": 1.3796244549751282, | |
| "epoch": 1.2013972772681272, | |
| "grad_norm": 1.75, | |
| "learning_rate": 2.2570427805553e-06, | |
| "loss": 0.135, | |
| "mean_token_accuracy": 0.969414986371994, | |
| "num_tokens": 5196503416.0, | |
| "step": 36800 | |
| }, | |
| { | |
| "entropy": 1.3724525594711303, | |
| "epoch": 1.203029610525285, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 2.24920056497719e-06, | |
| "loss": 0.1277, | |
| "mean_token_accuracy": 0.9707681620121003, | |
| "num_tokens": 5203249529.0, | |
| "step": 36850 | |
| }, | |
| { | |
| "entropy": 1.3727708411216737, | |
| "epoch": 1.2046619437824426, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 2.2413638188950564e-06, | |
| "loss": 0.1301, | |
| "mean_token_accuracy": 0.9704027915000916, | |
| "num_tokens": 5210234077.0, | |
| "step": 36900 | |
| }, | |
| { | |
| "entropy": 1.3519327425956726, | |
| "epoch": 1.2062942770396003, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 2.2335325993987815e-06, | |
| "loss": 0.1196, | |
| "mean_token_accuracy": 0.9722448754310608, | |
| "num_tokens": 5216853099.0, | |
| "step": 36950 | |
| }, | |
| { | |
| "entropy": 1.365940923690796, | |
| "epoch": 1.207926610296758, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 2.2257069635379863e-06, | |
| "loss": 0.1328, | |
| "mean_token_accuracy": 0.9699783003330231, | |
| "num_tokens": 5223781418.0, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 1.207926610296758, | |
| "eval_entropy": 1.360196549097697, | |
| "eval_loss": 0.1449788212776184, | |
| "eval_mean_token_accuracy": 0.9670122480392456, | |
| "eval_num_tokens": 5223781418.0, | |
| "eval_runtime": 751.1493, | |
| "eval_samples_per_second": 12.855, | |
| "eval_steps_per_second": 0.101, | |
| "step": 37000 | |
| }, | |
| { | |
| "entropy": 1.360548312664032, | |
| "epoch": 1.209558943553916, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 2.2178869683216164e-06, | |
| "loss": 0.1262, | |
| "mean_token_accuracy": 0.9713475477695465, | |
| "num_tokens": 5230659746.0, | |
| "step": 37050 | |
| }, | |
| { | |
| "entropy": 1.357577109336853, | |
| "epoch": 1.2111912768110737, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 2.2100726707175246e-06, | |
| "loss": 0.1313, | |
| "mean_token_accuracy": 0.969699913263321, | |
| "num_tokens": 5237707649.0, | |
| "step": 37100 | |
| }, | |
| { | |
| "entropy": 1.3651654267311095, | |
| "epoch": 1.2128236100682315, | |
| "grad_norm": 1.3359375, | |
| "learning_rate": 2.202264127652059e-06, | |
| "loss": 0.1295, | |
| "mean_token_accuracy": 0.9710112833976745, | |
| "num_tokens": 5244771475.0, | |
| "step": 37150 | |
| }, | |
| { | |
| "entropy": 1.3489612317085267, | |
| "epoch": 1.2144559433253894, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 2.1944613960096456e-06, | |
| "loss": 0.1213, | |
| "mean_token_accuracy": 0.9717336785793305, | |
| "num_tokens": 5251743181.0, | |
| "step": 37200 | |
| }, | |
| { | |
| "entropy": 1.355490939617157, | |
| "epoch": 1.2160882765825471, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 2.1866645326323743e-06, | |
| "loss": 0.1271, | |
| "mean_token_accuracy": 0.9707163834571838, | |
| "num_tokens": 5258283419.0, | |
| "step": 37250 | |
| }, | |
| { | |
| "entropy": 1.3655565786361694, | |
| "epoch": 1.2177206098397049, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 2.1788735943195865e-06, | |
| "loss": 0.1367, | |
| "mean_token_accuracy": 0.9688389587402344, | |
| "num_tokens": 5265823807.0, | |
| "step": 37300 | |
| }, | |
| { | |
| "entropy": 1.349189965724945, | |
| "epoch": 1.2193529430968626, | |
| "grad_norm": 1.3515625, | |
| "learning_rate": 2.171088637827458e-06, | |
| "loss": 0.1318, | |
| "mean_token_accuracy": 0.9697683715820312, | |
| "num_tokens": 5273206881.0, | |
| "step": 37350 | |
| }, | |
| { | |
| "entropy": 1.3544545078277588, | |
| "epoch": 1.2209852763540203, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 2.16330971986859e-06, | |
| "loss": 0.1216, | |
| "mean_token_accuracy": 0.9722874271869659, | |
| "num_tokens": 5280217004.0, | |
| "step": 37400 | |
| }, | |
| { | |
| "entropy": 1.3607108736038207, | |
| "epoch": 1.2226176096111783, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 2.1555368971115926e-06, | |
| "loss": 0.1353, | |
| "mean_token_accuracy": 0.9696134865283966, | |
| "num_tokens": 5287498948.0, | |
| "step": 37450 | |
| }, | |
| { | |
| "entropy": 1.3576147866249084, | |
| "epoch": 1.224249942868336, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 2.147770226180672e-06, | |
| "loss": 0.1276, | |
| "mean_token_accuracy": 0.9706488907337188, | |
| "num_tokens": 5294869687.0, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 1.224249942868336, | |
| "eval_entropy": 1.3614781061808268, | |
| "eval_loss": 0.14473138749599457, | |
| "eval_mean_token_accuracy": 0.9672889757156372, | |
| "eval_num_tokens": 5294869687.0, | |
| "eval_runtime": 751.3614, | |
| "eval_samples_per_second": 12.851, | |
| "eval_steps_per_second": 0.101, | |
| "step": 37500 | |
| }, | |
| { | |
| "entropy": 1.376727077960968, | |
| "epoch": 1.2258822761254937, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 2.1400097636552217e-06, | |
| "loss": 0.1308, | |
| "mean_token_accuracy": 0.9701304376125336, | |
| "num_tokens": 5301938154.0, | |
| "step": 37550 | |
| }, | |
| { | |
| "entropy": 1.3636725115776063, | |
| "epoch": 1.2275146093826517, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 2.1322555660694053e-06, | |
| "loss": 0.1301, | |
| "mean_token_accuracy": 0.9700271189212799, | |
| "num_tokens": 5308968407.0, | |
| "step": 37600 | |
| }, | |
| { | |
| "entropy": 1.3606876850128173, | |
| "epoch": 1.2291469426398094, | |
| "grad_norm": 1.21875, | |
| "learning_rate": 2.124507689911747e-06, | |
| "loss": 0.1232, | |
| "mean_token_accuracy": 0.9719198334217072, | |
| "num_tokens": 5316027114.0, | |
| "step": 37650 | |
| }, | |
| { | |
| "entropy": 1.3608359265327454, | |
| "epoch": 1.2307792758969671, | |
| "grad_norm": 1.2421875, | |
| "learning_rate": 2.1167661916247203e-06, | |
| "loss": 0.1352, | |
| "mean_token_accuracy": 0.9692904555797577, | |
| "num_tokens": 5323480081.0, | |
| "step": 37700 | |
| }, | |
| { | |
| "entropy": 1.3596609687805177, | |
| "epoch": 1.2324116091541248, | |
| "grad_norm": 1.453125, | |
| "learning_rate": 2.109031127604339e-06, | |
| "loss": 0.1171, | |
| "mean_token_accuracy": 0.9726130926609039, | |
| "num_tokens": 5330170561.0, | |
| "step": 37750 | |
| }, | |
| { | |
| "entropy": 1.36385005235672, | |
| "epoch": 1.2340439424112826, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 2.10130255419974e-06, | |
| "loss": 0.1271, | |
| "mean_token_accuracy": 0.9708395230770112, | |
| "num_tokens": 5337270272.0, | |
| "step": 37800 | |
| }, | |
| { | |
| "entropy": 1.3592445206642152, | |
| "epoch": 1.2356762756684405, | |
| "grad_norm": 1.75, | |
| "learning_rate": 2.0935805277127794e-06, | |
| "loss": 0.1307, | |
| "mean_token_accuracy": 0.9704990041255951, | |
| "num_tokens": 5344402468.0, | |
| "step": 37850 | |
| }, | |
| { | |
| "entropy": 1.369277856349945, | |
| "epoch": 1.2373086089255982, | |
| "grad_norm": 1.4609375, | |
| "learning_rate": 2.0858651043976183e-06, | |
| "loss": 0.1373, | |
| "mean_token_accuracy": 0.9686707425117492, | |
| "num_tokens": 5351952870.0, | |
| "step": 37900 | |
| }, | |
| { | |
| "entropy": 1.3526391363143921, | |
| "epoch": 1.238940942182756, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 2.0781563404603153e-06, | |
| "loss": 0.1266, | |
| "mean_token_accuracy": 0.9708797204494476, | |
| "num_tokens": 5359011853.0, | |
| "step": 37950 | |
| }, | |
| { | |
| "entropy": 1.358784899711609, | |
| "epoch": 1.240573275439914, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 2.0704542920584153e-06, | |
| "loss": 0.1342, | |
| "mean_token_accuracy": 0.9693182837963105, | |
| "num_tokens": 5366436177.0, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 1.240573275439914, | |
| "eval_entropy": 1.3544676637649535, | |
| "eval_loss": 0.14443761110305786, | |
| "eval_mean_token_accuracy": 0.9673179856936137, | |
| "eval_num_tokens": 5366436177.0, | |
| "eval_runtime": 752.6655, | |
| "eval_samples_per_second": 12.829, | |
| "eval_steps_per_second": 0.101, | |
| "step": 38000 | |
| }, | |
| { | |
| "entropy": 1.344338595867157, | |
| "epoch": 1.2422056086970716, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 2.0627590153005426e-06, | |
| "loss": 0.1146, | |
| "mean_token_accuracy": 0.9736293482780457, | |
| "num_tokens": 5373144912.0, | |
| "step": 38050 | |
| }, | |
| { | |
| "entropy": 1.3542405033111573, | |
| "epoch": 1.2438379419542294, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 2.0550705662459896e-06, | |
| "loss": 0.1276, | |
| "mean_token_accuracy": 0.9702631950378418, | |
| "num_tokens": 5380308640.0, | |
| "step": 38100 | |
| }, | |
| { | |
| "entropy": 1.3496335220336915, | |
| "epoch": 1.245470275211387, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 2.047389000904309e-06, | |
| "loss": 0.1233, | |
| "mean_token_accuracy": 0.9710812640190124, | |
| "num_tokens": 5386799272.0, | |
| "step": 38150 | |
| }, | |
| { | |
| "entropy": 1.3436848521232605, | |
| "epoch": 1.2471026084685448, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 2.0397143752349084e-06, | |
| "loss": 0.1313, | |
| "mean_token_accuracy": 0.9700379192829132, | |
| "num_tokens": 5393957806.0, | |
| "step": 38200 | |
| }, | |
| { | |
| "entropy": 1.3535588788986206, | |
| "epoch": 1.2487349417257028, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 2.032046745146638e-06, | |
| "loss": 0.1256, | |
| "mean_token_accuracy": 0.9713840389251709, | |
| "num_tokens": 5400892138.0, | |
| "step": 38250 | |
| }, | |
| { | |
| "entropy": 1.35460533618927, | |
| "epoch": 1.2503672749828605, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 2.0243861664973897e-06, | |
| "loss": 0.1271, | |
| "mean_token_accuracy": 0.9702360582351685, | |
| "num_tokens": 5407837431.0, | |
| "step": 38300 | |
| }, | |
| { | |
| "entropy": 1.350141739845276, | |
| "epoch": 1.2519996082400182, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 2.016732695093681e-06, | |
| "loss": 0.1195, | |
| "mean_token_accuracy": 0.9717020273208619, | |
| "num_tokens": 5414369851.0, | |
| "step": 38350 | |
| }, | |
| { | |
| "entropy": 1.3572440361976623, | |
| "epoch": 1.2536319414971762, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 2.009086386690259e-06, | |
| "loss": 0.1321, | |
| "mean_token_accuracy": 0.9700393569469452, | |
| "num_tokens": 5421779114.0, | |
| "step": 38400 | |
| }, | |
| { | |
| "entropy": 1.3585280227661132, | |
| "epoch": 1.2552642747543339, | |
| "grad_norm": 0.002349853515625, | |
| "learning_rate": 2.001447296989687e-06, | |
| "loss": 0.1236, | |
| "mean_token_accuracy": 0.9713066399097443, | |
| "num_tokens": 5428733100.0, | |
| "step": 38450 | |
| }, | |
| { | |
| "entropy": 1.3632621765136719, | |
| "epoch": 1.2568966080114916, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 1.993815481641939e-06, | |
| "loss": 0.1282, | |
| "mean_token_accuracy": 0.9696168947219849, | |
| "num_tokens": 5435549385.0, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 1.2568966080114916, | |
| "eval_entropy": 1.3491881497701008, | |
| "eval_loss": 0.14436551928520203, | |
| "eval_mean_token_accuracy": 0.9672579844792684, | |
| "eval_num_tokens": 5435549385.0, | |
| "eval_runtime": 753.3503, | |
| "eval_samples_per_second": 12.817, | |
| "eval_steps_per_second": 0.101, | |
| "step": 38500 | |
| }, | |
| { | |
| "entropy": 1.3552922701835632, | |
| "epoch": 1.2585289412686493, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 1.9861909962440006e-06, | |
| "loss": 0.1325, | |
| "mean_token_accuracy": 0.9696126735210419, | |
| "num_tokens": 5442577839.0, | |
| "step": 38550 | |
| }, | |
| { | |
| "entropy": 1.3554336166381835, | |
| "epoch": 1.260161274525807, | |
| "grad_norm": 1.875, | |
| "learning_rate": 1.978573896339455e-06, | |
| "loss": 0.1203, | |
| "mean_token_accuracy": 0.9720760262012482, | |
| "num_tokens": 5449083615.0, | |
| "step": 38600 | |
| }, | |
| { | |
| "entropy": 1.3641015887260437, | |
| "epoch": 1.261793607782965, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.9709642374180845e-06, | |
| "loss": 0.1304, | |
| "mean_token_accuracy": 0.9704764342308044, | |
| "num_tokens": 5456277556.0, | |
| "step": 38650 | |
| }, | |
| { | |
| "entropy": 1.3642727065086364, | |
| "epoch": 1.2634259410401227, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 1.9633620749154656e-06, | |
| "loss": 0.129, | |
| "mean_token_accuracy": 0.9703558135032654, | |
| "num_tokens": 5462943399.0, | |
| "step": 38700 | |
| }, | |
| { | |
| "entropy": 1.3643340587615966, | |
| "epoch": 1.2650582742972805, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 1.9557674642125618e-06, | |
| "loss": 0.1276, | |
| "mean_token_accuracy": 0.9705350911617279, | |
| "num_tokens": 5470083333.0, | |
| "step": 38750 | |
| }, | |
| { | |
| "entropy": 1.35529381275177, | |
| "epoch": 1.2666906075544384, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 1.9481804606353256e-06, | |
| "loss": 0.1207, | |
| "mean_token_accuracy": 0.9720316576957703, | |
| "num_tokens": 5477264167.0, | |
| "step": 38800 | |
| }, | |
| { | |
| "entropy": 1.347254078388214, | |
| "epoch": 1.2683229408115961, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 1.9406011194542896e-06, | |
| "loss": 0.1256, | |
| "mean_token_accuracy": 0.9704374611377716, | |
| "num_tokens": 5484197390.0, | |
| "step": 38850 | |
| }, | |
| { | |
| "entropy": 1.3472388553619385, | |
| "epoch": 1.2699552740687539, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.933029495884169e-06, | |
| "loss": 0.1166, | |
| "mean_token_accuracy": 0.9729468870162964, | |
| "num_tokens": 5490515143.0, | |
| "step": 38900 | |
| }, | |
| { | |
| "entropy": 1.3586375880241395, | |
| "epoch": 1.2715876073259116, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 1.925465645083455e-06, | |
| "loss": 0.1342, | |
| "mean_token_accuracy": 0.9688045310974122, | |
| "num_tokens": 5497798023.0, | |
| "step": 38950 | |
| }, | |
| { | |
| "entropy": 1.3617107224464418, | |
| "epoch": 1.2732199405830693, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 1.9179096221540163e-06, | |
| "loss": 0.1337, | |
| "mean_token_accuracy": 0.969189600944519, | |
| "num_tokens": 5504996504.0, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 1.2732199405830693, | |
| "eval_entropy": 1.3547975174585978, | |
| "eval_loss": 0.14426733553409576, | |
| "eval_mean_token_accuracy": 0.9672306942939758, | |
| "eval_num_tokens": 5504996504.0, | |
| "eval_runtime": 747.051, | |
| "eval_samples_per_second": 12.925, | |
| "eval_steps_per_second": 0.102, | |
| "step": 39000 | |
| }, | |
| { | |
| "entropy": 1.3557351016998291, | |
| "epoch": 1.2748522738402273, | |
| "grad_norm": 1.3828125, | |
| "learning_rate": 1.910361482140696e-06, | |
| "loss": 0.1248, | |
| "mean_token_accuracy": 0.9716817474365235, | |
| "num_tokens": 5512011287.0, | |
| "step": 39050 | |
| }, | |
| { | |
| "entropy": 1.353394594192505, | |
| "epoch": 1.276484607097385, | |
| "grad_norm": 1.3203125, | |
| "learning_rate": 1.90282128003091e-06, | |
| "loss": 0.12, | |
| "mean_token_accuracy": 0.9717885673046112, | |
| "num_tokens": 5518924203.0, | |
| "step": 39100 | |
| }, | |
| { | |
| "entropy": 1.3534292221069335, | |
| "epoch": 1.2781169403545427, | |
| "grad_norm": 1.234375, | |
| "learning_rate": 1.895289070754249e-06, | |
| "loss": 0.131, | |
| "mean_token_accuracy": 0.969032096862793, | |
| "num_tokens": 5526036302.0, | |
| "step": 39150 | |
| }, | |
| { | |
| "entropy": 1.3387615489959717, | |
| "epoch": 1.2797492736117007, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 1.887764909182076e-06, | |
| "loss": 0.1203, | |
| "mean_token_accuracy": 0.9723163688182831, | |
| "num_tokens": 5532560928.0, | |
| "step": 39200 | |
| }, | |
| { | |
| "entropy": 1.3516933727264404, | |
| "epoch": 1.2813816068688584, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.8802488501271259e-06, | |
| "loss": 0.1325, | |
| "mean_token_accuracy": 0.9697006630897522, | |
| "num_tokens": 5539749961.0, | |
| "step": 39250 | |
| }, | |
| { | |
| "entropy": 1.358582181930542, | |
| "epoch": 1.283013940126016, | |
| "grad_norm": 1.2421875, | |
| "learning_rate": 1.8727409483431112e-06, | |
| "loss": 0.1276, | |
| "mean_token_accuracy": 0.9706891429424286, | |
| "num_tokens": 5546991771.0, | |
| "step": 39300 | |
| }, | |
| { | |
| "entropy": 1.3414494490623474, | |
| "epoch": 1.2846462733831738, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 1.8652412585243158e-06, | |
| "loss": 0.1205, | |
| "mean_token_accuracy": 0.9722245049476623, | |
| "num_tokens": 5553461638.0, | |
| "step": 39350 | |
| }, | |
| { | |
| "entropy": 1.3569713354110717, | |
| "epoch": 1.2862786066403316, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.8577498353052025e-06, | |
| "loss": 0.1257, | |
| "mean_token_accuracy": 0.9703078508377075, | |
| "num_tokens": 5560423404.0, | |
| "step": 39400 | |
| }, | |
| { | |
| "entropy": 1.3759264373779296, | |
| "epoch": 1.2879109398974895, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 1.850266733260012e-06, | |
| "loss": 0.1489, | |
| "mean_token_accuracy": 0.9663536393642426, | |
| "num_tokens": 5568028795.0, | |
| "step": 39450 | |
| }, | |
| { | |
| "entropy": 1.3500820732116698, | |
| "epoch": 1.2895432731546472, | |
| "grad_norm": 1.390625, | |
| "learning_rate": 1.8427920069023658e-06, | |
| "loss": 0.1198, | |
| "mean_token_accuracy": 0.9720592284202576, | |
| "num_tokens": 5574696411.0, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 1.2895432731546472, | |
| "eval_entropy": 1.3591709502538045, | |
| "eval_loss": 0.14416085183620453, | |
| "eval_mean_token_accuracy": 0.9672661876678467, | |
| "eval_num_tokens": 5574696411.0, | |
| "eval_runtime": 754.3873, | |
| "eval_samples_per_second": 12.8, | |
| "eval_steps_per_second": 0.101, | |
| "step": 39500 | |
| }, | |
| { | |
| "entropy": 1.363064079284668, | |
| "epoch": 1.291175606411805, | |
| "grad_norm": 1.25, | |
| "learning_rate": 1.8353257106848703e-06, | |
| "loss": 0.1242, | |
| "mean_token_accuracy": 0.9711923873424531, | |
| "num_tokens": 5581679975.0, | |
| "step": 39550 | |
| }, | |
| { | |
| "entropy": 1.356317241191864, | |
| "epoch": 1.292807939668963, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 1.8278678989987178e-06, | |
| "loss": 0.1335, | |
| "mean_token_accuracy": 0.9687803375720978, | |
| "num_tokens": 5589014993.0, | |
| "step": 39600 | |
| }, | |
| { | |
| "entropy": 1.3506816673278808, | |
| "epoch": 1.2944402729261206, | |
| "grad_norm": 1.3046875, | |
| "learning_rate": 1.8204186261732938e-06, | |
| "loss": 0.1313, | |
| "mean_token_accuracy": 0.9694370079040527, | |
| "num_tokens": 5595958511.0, | |
| "step": 39650 | |
| }, | |
| { | |
| "entropy": 1.3653843665122987, | |
| "epoch": 1.2960726061832784, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.8129779464757774e-06, | |
| "loss": 0.1327, | |
| "mean_token_accuracy": 0.9690759527683258, | |
| "num_tokens": 5603031399.0, | |
| "step": 39700 | |
| }, | |
| { | |
| "entropy": 1.3516422724723816, | |
| "epoch": 1.297704939440436, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 1.8055459141107477e-06, | |
| "loss": 0.1184, | |
| "mean_token_accuracy": 0.9726696240901948, | |
| "num_tokens": 5610046160.0, | |
| "step": 39750 | |
| }, | |
| { | |
| "entropy": 1.376489179134369, | |
| "epoch": 1.2993372726975938, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 1.7981225832197894e-06, | |
| "loss": 0.1399, | |
| "mean_token_accuracy": 0.9676700031757355, | |
| "num_tokens": 5617399853.0, | |
| "step": 39800 | |
| }, | |
| { | |
| "entropy": 1.345590648651123, | |
| "epoch": 1.3009696059547518, | |
| "grad_norm": 1.46875, | |
| "learning_rate": 1.7907080078810983e-06, | |
| "loss": 0.1242, | |
| "mean_token_accuracy": 0.9713241112232208, | |
| "num_tokens": 5624175713.0, | |
| "step": 39850 | |
| }, | |
| { | |
| "entropy": 1.3524721622467042, | |
| "epoch": 1.3026019392119095, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 1.7833022421090858e-06, | |
| "loss": 0.1226, | |
| "mean_token_accuracy": 0.9718897187709808, | |
| "num_tokens": 5631014007.0, | |
| "step": 39900 | |
| }, | |
| { | |
| "entropy": 1.3629893851280213, | |
| "epoch": 1.3042342724690672, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 1.7759053398539873e-06, | |
| "loss": 0.1312, | |
| "mean_token_accuracy": 0.970218130350113, | |
| "num_tokens": 5638370611.0, | |
| "step": 39950 | |
| }, | |
| { | |
| "entropy": 1.3576352548599244, | |
| "epoch": 1.3058666057262251, | |
| "grad_norm": 1.2890625, | |
| "learning_rate": 1.7685173550014671e-06, | |
| "loss": 0.134, | |
| "mean_token_accuracy": 0.9691074180603028, | |
| "num_tokens": 5645778490.0, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 1.3058666057262251, | |
| "eval_entropy": 1.355252981185913, | |
| "eval_loss": 0.14380384981632233, | |
| "eval_mean_token_accuracy": 0.9673471585909525, | |
| "eval_num_tokens": 5645778490.0, | |
| "eval_runtime": 745.8824, | |
| "eval_samples_per_second": 12.946, | |
| "eval_steps_per_second": 0.102, | |
| "step": 40000 | |
| }, | |
| { | |
| "entropy": 1.3658280086517334, | |
| "epoch": 1.3074989389833829, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 1.7611383413722303e-06, | |
| "loss": 0.1408, | |
| "mean_token_accuracy": 0.9673460054397583, | |
| "num_tokens": 5653261891.0, | |
| "step": 40050 | |
| }, | |
| { | |
| "entropy": 1.3569263100624085, | |
| "epoch": 1.3091312722405406, | |
| "grad_norm": 1.3984375, | |
| "learning_rate": 1.7537683527216242e-06, | |
| "loss": 0.1303, | |
| "mean_token_accuracy": 0.9697596049308777, | |
| "num_tokens": 5660316880.0, | |
| "step": 40100 | |
| }, | |
| { | |
| "entropy": 1.3518936610221863, | |
| "epoch": 1.3107636054976983, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 1.7464074427392512e-06, | |
| "loss": 0.1291, | |
| "mean_token_accuracy": 0.9700168156623841, | |
| "num_tokens": 5667680932.0, | |
| "step": 40150 | |
| }, | |
| { | |
| "entropy": 1.3594465851783752, | |
| "epoch": 1.3123959387548563, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 1.7390556650485782e-06, | |
| "loss": 0.13, | |
| "mean_token_accuracy": 0.9701092898845672, | |
| "num_tokens": 5674932027.0, | |
| "step": 40200 | |
| }, | |
| { | |
| "entropy": 1.3511109519004822, | |
| "epoch": 1.314028272012014, | |
| "grad_norm": 1.2890625, | |
| "learning_rate": 1.7317130732065411e-06, | |
| "loss": 0.1236, | |
| "mean_token_accuracy": 0.9719385504722595, | |
| "num_tokens": 5681669046.0, | |
| "step": 40250 | |
| }, | |
| { | |
| "entropy": 1.3469841027259826, | |
| "epoch": 1.3156606052691717, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 1.7243797207031596e-06, | |
| "loss": 0.1204, | |
| "mean_token_accuracy": 0.9718147456645966, | |
| "num_tokens": 5688464363.0, | |
| "step": 40300 | |
| }, | |
| { | |
| "entropy": 1.3597651433944702, | |
| "epoch": 1.3172929385263297, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 1.7170556609611477e-06, | |
| "loss": 0.1321, | |
| "mean_token_accuracy": 0.9700502359867096, | |
| "num_tokens": 5695659707.0, | |
| "step": 40350 | |
| }, | |
| { | |
| "entropy": 1.359071056842804, | |
| "epoch": 1.3189252717834874, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.709740947335518e-06, | |
| "loss": 0.1232, | |
| "mean_token_accuracy": 0.9715420746803284, | |
| "num_tokens": 5702611675.0, | |
| "step": 40400 | |
| }, | |
| { | |
| "entropy": 1.3486275053024293, | |
| "epoch": 1.3205576050406451, | |
| "grad_norm": 0.9609375, | |
| "learning_rate": 1.7024356331132025e-06, | |
| "loss": 0.1197, | |
| "mean_token_accuracy": 0.9719861805438995, | |
| "num_tokens": 5709648249.0, | |
| "step": 40450 | |
| }, | |
| { | |
| "entropy": 1.366611111164093, | |
| "epoch": 1.3221899382978028, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 1.695139771512655e-06, | |
| "loss": 0.1363, | |
| "mean_token_accuracy": 0.96794402718544, | |
| "num_tokens": 5716864381.0, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 1.3221899382978028, | |
| "eval_entropy": 1.3547792641321819, | |
| "eval_loss": 0.14348579943180084, | |
| "eval_mean_token_accuracy": 0.9674063356717427, | |
| "eval_num_tokens": 5716864381.0, | |
| "eval_runtime": 751.715, | |
| "eval_samples_per_second": 12.845, | |
| "eval_steps_per_second": 0.101, | |
| "step": 40500 | |
| }, | |
| { | |
| "entropy": 1.3567886662483215, | |
| "epoch": 1.3238222715549606, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 1.687853415683473e-06, | |
| "loss": 0.118, | |
| "mean_token_accuracy": 0.9730365478992462, | |
| "num_tokens": 5723690263.0, | |
| "step": 40550 | |
| }, | |
| { | |
| "entropy": 1.3625458979606628, | |
| "epoch": 1.3254546048121185, | |
| "grad_norm": 1.140625, | |
| "learning_rate": 1.6805766187059998e-06, | |
| "loss": 0.1359, | |
| "mean_token_accuracy": 0.9688236701488495, | |
| "num_tokens": 5731246956.0, | |
| "step": 40600 | |
| }, | |
| { | |
| "entropy": 1.3589171147346497, | |
| "epoch": 1.3270869380692762, | |
| "grad_norm": 1.4296875, | |
| "learning_rate": 1.6733094335909486e-06, | |
| "loss": 0.1366, | |
| "mean_token_accuracy": 0.968874671459198, | |
| "num_tokens": 5738659112.0, | |
| "step": 40650 | |
| }, | |
| { | |
| "entropy": 1.3442611718177795, | |
| "epoch": 1.328719271326434, | |
| "grad_norm": 1.25, | |
| "learning_rate": 1.666051913279007e-06, | |
| "loss": 0.1259, | |
| "mean_token_accuracy": 0.9707275819778443, | |
| "num_tokens": 5745995037.0, | |
| "step": 40700 | |
| }, | |
| { | |
| "entropy": 1.3413966584205628, | |
| "epoch": 1.330351604583592, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 1.658804110640458e-06, | |
| "loss": 0.1199, | |
| "mean_token_accuracy": 0.972169394493103, | |
| "num_tokens": 5752781216.0, | |
| "step": 40750 | |
| }, | |
| { | |
| "entropy": 1.3544580388069152, | |
| "epoch": 1.3319839378407496, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 1.6515660784747933e-06, | |
| "loss": 0.1287, | |
| "mean_token_accuracy": 0.9699071037769318, | |
| "num_tokens": 5759681433.0, | |
| "step": 40800 | |
| }, | |
| { | |
| "entropy": 1.3634159827232362, | |
| "epoch": 1.3336162710979074, | |
| "grad_norm": 1.2421875, | |
| "learning_rate": 1.6443378695103233e-06, | |
| "loss": 0.1422, | |
| "mean_token_accuracy": 0.9680401980876923, | |
| "num_tokens": 5767127976.0, | |
| "step": 40850 | |
| }, | |
| { | |
| "entropy": 1.3604375696182252, | |
| "epoch": 1.335248604355065, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 1.6371195364038034e-06, | |
| "loss": 0.1281, | |
| "mean_token_accuracy": 0.9703556621074676, | |
| "num_tokens": 5773885662.0, | |
| "step": 40900 | |
| }, | |
| { | |
| "entropy": 1.367766616344452, | |
| "epoch": 1.3368809376122228, | |
| "grad_norm": 1.3203125, | |
| "learning_rate": 1.6299111317400382e-06, | |
| "loss": 0.1311, | |
| "mean_token_accuracy": 0.9696219575405121, | |
| "num_tokens": 5780884429.0, | |
| "step": 40950 | |
| }, | |
| { | |
| "entropy": 1.3579600000381469, | |
| "epoch": 1.3385132708693808, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 1.6227127080315103e-06, | |
| "loss": 0.1271, | |
| "mean_token_accuracy": 0.9701367461681366, | |
| "num_tokens": 5787949031.0, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 1.3385132708693808, | |
| "eval_entropy": 1.3570391607284547, | |
| "eval_loss": 0.14324620366096497, | |
| "eval_mean_token_accuracy": 0.9674420754114786, | |
| "eval_num_tokens": 5787949031.0, | |
| "eval_runtime": 754.7744, | |
| "eval_samples_per_second": 12.793, | |
| "eval_steps_per_second": 0.101, | |
| "step": 41000 | |
| }, | |
| { | |
| "entropy": 1.3567052793502807, | |
| "epoch": 1.3401456041265385, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 1.6155243177179873e-06, | |
| "loss": 0.125, | |
| "mean_token_accuracy": 0.9710537779331208, | |
| "num_tokens": 5795334113.0, | |
| "step": 41050 | |
| }, | |
| { | |
| "entropy": 1.3398216152191162, | |
| "epoch": 1.3417779373836962, | |
| "grad_norm": 0.91796875, | |
| "learning_rate": 1.6083460131661477e-06, | |
| "loss": 0.1133, | |
| "mean_token_accuracy": 0.9733657622337342, | |
| "num_tokens": 5802143914.0, | |
| "step": 41100 | |
| }, | |
| { | |
| "entropy": 1.3610263323783875, | |
| "epoch": 1.3434102706408542, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 1.6011778466691951e-06, | |
| "loss": 0.1231, | |
| "mean_token_accuracy": 0.9718475365638732, | |
| "num_tokens": 5808928463.0, | |
| "step": 41150 | |
| }, | |
| { | |
| "entropy": 1.3601333618164062, | |
| "epoch": 1.345042603898012, | |
| "grad_norm": 1.234375, | |
| "learning_rate": 1.5940198704464793e-06, | |
| "loss": 0.1302, | |
| "mean_token_accuracy": 0.969777444601059, | |
| "num_tokens": 5816047556.0, | |
| "step": 41200 | |
| }, | |
| { | |
| "entropy": 1.36517019033432, | |
| "epoch": 1.3466749371551696, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 1.5868721366431148e-06, | |
| "loss": 0.1311, | |
| "mean_token_accuracy": 0.9697459650039673, | |
| "num_tokens": 5823417847.0, | |
| "step": 41250 | |
| }, | |
| { | |
| "entropy": 1.3562990355491638, | |
| "epoch": 1.3483072704123273, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 1.5797346973295984e-06, | |
| "loss": 0.1294, | |
| "mean_token_accuracy": 0.9702081382274628, | |
| "num_tokens": 5830494397.0, | |
| "step": 41300 | |
| }, | |
| { | |
| "entropy": 1.3457439398765565, | |
| "epoch": 1.349939603669485, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 1.5726076045014376e-06, | |
| "loss": 0.1206, | |
| "mean_token_accuracy": 0.9718204891681671, | |
| "num_tokens": 5837675412.0, | |
| "step": 41350 | |
| }, | |
| { | |
| "entropy": 1.3478499841690064, | |
| "epoch": 1.351571936926643, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 1.565490910078761e-06, | |
| "loss": 0.1224, | |
| "mean_token_accuracy": 0.971153804063797, | |
| "num_tokens": 5844543024.0, | |
| "step": 41400 | |
| }, | |
| { | |
| "entropy": 1.3584490442276, | |
| "epoch": 1.3532042701838007, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 1.5583846659059525e-06, | |
| "loss": 0.12, | |
| "mean_token_accuracy": 0.9715865242481232, | |
| "num_tokens": 5851449870.0, | |
| "step": 41450 | |
| }, | |
| { | |
| "entropy": 1.3518057036399842, | |
| "epoch": 1.3548366034409585, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 1.5512889237512604e-06, | |
| "loss": 0.13, | |
| "mean_token_accuracy": 0.9702707767486572, | |
| "num_tokens": 5858314021.0, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 1.3548366034409585, | |
| "eval_entropy": 1.3525403213500977, | |
| "eval_loss": 0.14299461245536804, | |
| "eval_mean_token_accuracy": 0.9675085457166036, | |
| "eval_num_tokens": 5858314021.0, | |
| "eval_runtime": 751.2489, | |
| "eval_samples_per_second": 12.853, | |
| "eval_steps_per_second": 0.101, | |
| "step": 41500 | |
| }, | |
| { | |
| "entropy": 1.3594915199279785, | |
| "epoch": 1.3564689366981164, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 1.5442037353064314e-06, | |
| "loss": 0.1252, | |
| "mean_token_accuracy": 0.97132617354393, | |
| "num_tokens": 5865594325.0, | |
| "step": 41550 | |
| }, | |
| { | |
| "entropy": 1.352486503124237, | |
| "epoch": 1.3581012699552741, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 1.537129152186329e-06, | |
| "loss": 0.1254, | |
| "mean_token_accuracy": 0.9710965490341187, | |
| "num_tokens": 5872751335.0, | |
| "step": 41600 | |
| }, | |
| { | |
| "entropy": 1.3547274160385132, | |
| "epoch": 1.3597336032124319, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 1.530065225928555e-06, | |
| "loss": 0.13, | |
| "mean_token_accuracy": 0.9703376948833465, | |
| "num_tokens": 5880128383.0, | |
| "step": 41650 | |
| }, | |
| { | |
| "entropy": 1.3553478455543517, | |
| "epoch": 1.3613659364695896, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 1.5230120079930814e-06, | |
| "loss": 0.1216, | |
| "mean_token_accuracy": 0.9711664152145386, | |
| "num_tokens": 5887029878.0, | |
| "step": 41700 | |
| }, | |
| { | |
| "entropy": 1.3534876823425293, | |
| "epoch": 1.3629982697267473, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 1.515969549761867e-06, | |
| "loss": 0.1274, | |
| "mean_token_accuracy": 0.9711965453624726, | |
| "num_tokens": 5893747154.0, | |
| "step": 41750 | |
| }, | |
| { | |
| "entropy": 1.360564501285553, | |
| "epoch": 1.3646306029839053, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 1.5089379025384912e-06, | |
| "loss": 0.1351, | |
| "mean_token_accuracy": 0.969272004365921, | |
| "num_tokens": 5901094204.0, | |
| "step": 41800 | |
| }, | |
| { | |
| "entropy": 1.342657322883606, | |
| "epoch": 1.366262936241063, | |
| "grad_norm": 0.020263671875, | |
| "learning_rate": 1.501917117547772e-06, | |
| "loss": 0.1229, | |
| "mean_token_accuracy": 0.9716498827934266, | |
| "num_tokens": 5908182746.0, | |
| "step": 41850 | |
| }, | |
| { | |
| "entropy": 1.3452470707893371, | |
| "epoch": 1.3678952694982207, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 1.4949072459354022e-06, | |
| "loss": 0.1212, | |
| "mean_token_accuracy": 0.972344673871994, | |
| "num_tokens": 5915128546.0, | |
| "step": 41900 | |
| }, | |
| { | |
| "entropy": 1.3409530735015869, | |
| "epoch": 1.3695276027553787, | |
| "grad_norm": 1.4765625, | |
| "learning_rate": 1.4879083387675666e-06, | |
| "loss": 0.1229, | |
| "mean_token_accuracy": 0.9715113770961762, | |
| "num_tokens": 5922010723.0, | |
| "step": 41950 | |
| }, | |
| { | |
| "entropy": 1.3670923542976379, | |
| "epoch": 1.3711599360125364, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 1.4809204470305788e-06, | |
| "loss": 0.1351, | |
| "mean_token_accuracy": 0.9693290328979492, | |
| "num_tokens": 5929411699.0, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 1.3711599360125364, | |
| "eval_entropy": 1.352181215286255, | |
| "eval_loss": 0.14293250441551208, | |
| "eval_mean_token_accuracy": 0.9675895547866822, | |
| "eval_num_tokens": 5929411699.0, | |
| "eval_runtime": 753.3298, | |
| "eval_samples_per_second": 12.818, | |
| "eval_steps_per_second": 0.101, | |
| "step": 42000 | |
| }, | |
| { | |
| "entropy": 1.3608778929710388, | |
| "epoch": 1.372792269269694, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 1.4739436216305063e-06, | |
| "loss": 0.1335, | |
| "mean_token_accuracy": 0.9693701839447022, | |
| "num_tokens": 5936747601.0, | |
| "step": 42050 | |
| }, | |
| { | |
| "entropy": 1.361914451122284, | |
| "epoch": 1.3744246025268518, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 1.4669779133927956e-06, | |
| "loss": 0.1255, | |
| "mean_token_accuracy": 0.9715308749675751, | |
| "num_tokens": 5943992563.0, | |
| "step": 42100 | |
| }, | |
| { | |
| "entropy": 1.3531924724578857, | |
| "epoch": 1.3760569357840096, | |
| "grad_norm": 1.34375, | |
| "learning_rate": 1.460023373061911e-06, | |
| "loss": 0.1291, | |
| "mean_token_accuracy": 0.9694987845420837, | |
| "num_tokens": 5951045516.0, | |
| "step": 42150 | |
| }, | |
| { | |
| "entropy": 1.3581135630607606, | |
| "epoch": 1.3776892690411675, | |
| "grad_norm": 0.005950927734375, | |
| "learning_rate": 1.4530800513009545e-06, | |
| "loss": 0.1265, | |
| "mean_token_accuracy": 0.9705501091480255, | |
| "num_tokens": 5958106338.0, | |
| "step": 42200 | |
| }, | |
| { | |
| "entropy": 1.361065561771393, | |
| "epoch": 1.3793216022983252, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 1.4461479986913075e-06, | |
| "loss": 0.1232, | |
| "mean_token_accuracy": 0.9708436739444732, | |
| "num_tokens": 5964787874.0, | |
| "step": 42250 | |
| }, | |
| { | |
| "entropy": 1.3638162088394166, | |
| "epoch": 1.380953935555483, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.43922726573225e-06, | |
| "loss": 0.1297, | |
| "mean_token_accuracy": 0.9698211419582367, | |
| "num_tokens": 5972312102.0, | |
| "step": 42300 | |
| }, | |
| { | |
| "entropy": 1.3620294046401977, | |
| "epoch": 1.382586268812641, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 1.4323179028406086e-06, | |
| "loss": 0.126, | |
| "mean_token_accuracy": 0.9703358936309815, | |
| "num_tokens": 5979432191.0, | |
| "step": 42350 | |
| }, | |
| { | |
| "entropy": 1.3516957235336304, | |
| "epoch": 1.3842186020697986, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 1.4254199603503709e-06, | |
| "loss": 0.1185, | |
| "mean_token_accuracy": 0.9726535677909851, | |
| "num_tokens": 5986862863.0, | |
| "step": 42400 | |
| }, | |
| { | |
| "entropy": 1.3603333234786987, | |
| "epoch": 1.3858509353269564, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 1.4185334885123332e-06, | |
| "loss": 0.1225, | |
| "mean_token_accuracy": 0.9719485092163086, | |
| "num_tokens": 5993969922.0, | |
| "step": 42450 | |
| }, | |
| { | |
| "entropy": 1.3303070521354676, | |
| "epoch": 1.387483268584114, | |
| "grad_norm": 1.3515625, | |
| "learning_rate": 1.4116585374937304e-06, | |
| "loss": 0.1134, | |
| "mean_token_accuracy": 0.9734328532218933, | |
| "num_tokens": 6000796242.0, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 1.387483268584114, | |
| "eval_entropy": 1.3550557088851929, | |
| "eval_loss": 0.14276918768882751, | |
| "eval_mean_token_accuracy": 0.9676290774345397, | |
| "eval_num_tokens": 6000796242.0, | |
| "eval_runtime": 751.2401, | |
| "eval_samples_per_second": 12.853, | |
| "eval_steps_per_second": 0.101, | |
| "step": 42500 | |
| }, | |
| { | |
| "entropy": 1.3588794898986816, | |
| "epoch": 1.3891156018412718, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 1.4047951573778641e-06, | |
| "loss": 0.1206, | |
| "mean_token_accuracy": 0.9720923590660095, | |
| "num_tokens": 6007327659.0, | |
| "step": 42550 | |
| }, | |
| { | |
| "entropy": 1.3503399062156678, | |
| "epoch": 1.3907479350984298, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 1.3979433981637493e-06, | |
| "loss": 0.1193, | |
| "mean_token_accuracy": 0.9727861452102661, | |
| "num_tokens": 6014078918.0, | |
| "step": 42600 | |
| }, | |
| { | |
| "entropy": 1.3634498286247254, | |
| "epoch": 1.3923802683555875, | |
| "grad_norm": 1.1171875, | |
| "learning_rate": 1.3911033097657374e-06, | |
| "loss": 0.1204, | |
| "mean_token_accuracy": 0.9721335184574127, | |
| "num_tokens": 6020712824.0, | |
| "step": 42650 | |
| }, | |
| { | |
| "entropy": 1.3603689241409302, | |
| "epoch": 1.3940126016127452, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.3842749420131663e-06, | |
| "loss": 0.1297, | |
| "mean_token_accuracy": 0.969935257434845, | |
| "num_tokens": 6027950128.0, | |
| "step": 42700 | |
| }, | |
| { | |
| "entropy": 1.3577491450309753, | |
| "epoch": 1.3956449348699032, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 1.3774583446499835e-06, | |
| "loss": 0.1383, | |
| "mean_token_accuracy": 0.9680879211425781, | |
| "num_tokens": 6035576325.0, | |
| "step": 42750 | |
| }, | |
| { | |
| "entropy": 1.349846076965332, | |
| "epoch": 1.3972772681270609, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 1.3706535673343945e-06, | |
| "loss": 0.1289, | |
| "mean_token_accuracy": 0.9708491718769073, | |
| "num_tokens": 6042939083.0, | |
| "step": 42800 | |
| }, | |
| { | |
| "entropy": 1.3469208598136901, | |
| "epoch": 1.3989096013842186, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 1.3638606596384973e-06, | |
| "loss": 0.1186, | |
| "mean_token_accuracy": 0.9719974470138549, | |
| "num_tokens": 6050053050.0, | |
| "step": 42850 | |
| }, | |
| { | |
| "entropy": 1.3655208253860474, | |
| "epoch": 1.4005419346413763, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 1.3570796710479174e-06, | |
| "loss": 0.13, | |
| "mean_token_accuracy": 0.9699460983276367, | |
| "num_tokens": 6057064095.0, | |
| "step": 42900 | |
| }, | |
| { | |
| "entropy": 1.351545627117157, | |
| "epoch": 1.402174267898534, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 1.3503106509614553e-06, | |
| "loss": 0.1235, | |
| "mean_token_accuracy": 0.9710482954978943, | |
| "num_tokens": 6064091867.0, | |
| "step": 42950 | |
| }, | |
| { | |
| "entropy": 1.3588644528388978, | |
| "epoch": 1.403806601155692, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 1.3435536486907172e-06, | |
| "loss": 0.1234, | |
| "mean_token_accuracy": 0.9709674298763276, | |
| "num_tokens": 6071438514.0, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 1.403806601155692, | |
| "eval_entropy": 1.3524145444234212, | |
| "eval_loss": 0.14265932142734528, | |
| "eval_mean_token_accuracy": 0.9675256490707398, | |
| "eval_num_tokens": 6071438514.0, | |
| "eval_runtime": 753.0235, | |
| "eval_samples_per_second": 12.823, | |
| "eval_steps_per_second": 0.101, | |
| "step": 43000 | |
| }, | |
| { | |
| "entropy": 1.3578597354888915, | |
| "epoch": 1.4054389344128497, | |
| "grad_norm": 0.01470947265625, | |
| "learning_rate": 1.3368087134597663e-06, | |
| "loss": 0.1238, | |
| "mean_token_accuracy": 0.9716431427001954, | |
| "num_tokens": 6078596404.0, | |
| "step": 43050 | |
| }, | |
| { | |
| "entropy": 1.3505292820930481, | |
| "epoch": 1.4070712676700075, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 1.3300758944047536e-06, | |
| "loss": 0.1321, | |
| "mean_token_accuracy": 0.9695228207111358, | |
| "num_tokens": 6086011992.0, | |
| "step": 43100 | |
| }, | |
| { | |
| "entropy": 1.3455975699424743, | |
| "epoch": 1.4087036009271654, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.3233552405735694e-06, | |
| "loss": 0.1156, | |
| "mean_token_accuracy": 0.9727073276042938, | |
| "num_tokens": 6092681553.0, | |
| "step": 43150 | |
| }, | |
| { | |
| "entropy": 1.3595034289360046, | |
| "epoch": 1.4103359341843231, | |
| "grad_norm": 1.75, | |
| "learning_rate": 1.3166468009254766e-06, | |
| "loss": 0.1196, | |
| "mean_token_accuracy": 0.9722693479061126, | |
| "num_tokens": 6099527516.0, | |
| "step": 43200 | |
| }, | |
| { | |
| "entropy": 1.3427305126190185, | |
| "epoch": 1.4119682674414809, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 1.309950624330764e-06, | |
| "loss": 0.1194, | |
| "mean_token_accuracy": 0.9728332602977753, | |
| "num_tokens": 6106350241.0, | |
| "step": 43250 | |
| }, | |
| { | |
| "entropy": 1.3501028728485107, | |
| "epoch": 1.4136006006986386, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 1.3032667595703842e-06, | |
| "loss": 0.1259, | |
| "mean_token_accuracy": 0.9705328547954559, | |
| "num_tokens": 6113268163.0, | |
| "step": 43300 | |
| }, | |
| { | |
| "entropy": 1.3498585319519043, | |
| "epoch": 1.4152329339557963, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 1.2965952553355958e-06, | |
| "loss": 0.1277, | |
| "mean_token_accuracy": 0.9707833099365234, | |
| "num_tokens": 6120281186.0, | |
| "step": 43350 | |
| }, | |
| { | |
| "entropy": 1.35530499458313, | |
| "epoch": 1.4168652672129542, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 1.2899361602276175e-06, | |
| "loss": 0.1265, | |
| "mean_token_accuracy": 0.9698385155200958, | |
| "num_tokens": 6127237591.0, | |
| "step": 43400 | |
| }, | |
| { | |
| "entropy": 1.3571084594726563, | |
| "epoch": 1.418497600470112, | |
| "grad_norm": 1.390625, | |
| "learning_rate": 1.2832895227572622e-06, | |
| "loss": 0.1271, | |
| "mean_token_accuracy": 0.9701455044746399, | |
| "num_tokens": 6134551254.0, | |
| "step": 43450 | |
| }, | |
| { | |
| "entropy": 1.3430089569091797, | |
| "epoch": 1.4201299337272697, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 1.2766553913445993e-06, | |
| "loss": 0.1201, | |
| "mean_token_accuracy": 0.9717100954055786, | |
| "num_tokens": 6141418599.0, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 1.4201299337272697, | |
| "eval_entropy": 1.3463400856653849, | |
| "eval_loss": 0.1425597369670868, | |
| "eval_mean_token_accuracy": 0.9676672736803691, | |
| "eval_num_tokens": 6141418599.0, | |
| "eval_runtime": 753.679, | |
| "eval_samples_per_second": 12.812, | |
| "eval_steps_per_second": 0.101, | |
| "step": 43500 | |
| }, | |
| { | |
| "entropy": 1.3354617381095886, | |
| "epoch": 1.4217622669844276, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 1.2700338143185843e-06, | |
| "loss": 0.1162, | |
| "mean_token_accuracy": 0.972969708442688, | |
| "num_tokens": 6148322455.0, | |
| "step": 43550 | |
| }, | |
| { | |
| "entropy": 1.3403245830535888, | |
| "epoch": 1.4233946002415854, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 1.2634248399167203e-06, | |
| "loss": 0.1193, | |
| "mean_token_accuracy": 0.9722915697097778, | |
| "num_tokens": 6155533008.0, | |
| "step": 43600 | |
| }, | |
| { | |
| "entropy": 1.353358724117279, | |
| "epoch": 1.425026933498743, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 1.2568285162846987e-06, | |
| "loss": 0.1328, | |
| "mean_token_accuracy": 0.9696524286270142, | |
| "num_tokens": 6162774102.0, | |
| "step": 43650 | |
| }, | |
| { | |
| "entropy": 1.3451939988136292, | |
| "epoch": 1.4266592667559008, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 1.2502448914760533e-06, | |
| "loss": 0.1138, | |
| "mean_token_accuracy": 0.9730928063392639, | |
| "num_tokens": 6169457314.0, | |
| "step": 43700 | |
| }, | |
| { | |
| "entropy": 1.3512369799613952, | |
| "epoch": 1.4282916000130585, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 1.2436740134518094e-06, | |
| "loss": 0.1283, | |
| "mean_token_accuracy": 0.9699138104915619, | |
| "num_tokens": 6176504604.0, | |
| "step": 43750 | |
| }, | |
| { | |
| "entropy": 1.3418658518791198, | |
| "epoch": 1.4299239332702165, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 1.2371159300801284e-06, | |
| "loss": 0.1169, | |
| "mean_token_accuracy": 0.9729372990131379, | |
| "num_tokens": 6183277441.0, | |
| "step": 43800 | |
| }, | |
| { | |
| "entropy": 1.3383120560646058, | |
| "epoch": 1.4315562665273742, | |
| "grad_norm": 1.4296875, | |
| "learning_rate": 1.2305706891359698e-06, | |
| "loss": 0.117, | |
| "mean_token_accuracy": 0.9732610857486725, | |
| "num_tokens": 6190472536.0, | |
| "step": 43850 | |
| }, | |
| { | |
| "entropy": 1.3513377356529235, | |
| "epoch": 1.433188599784532, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 1.2240383383007325e-06, | |
| "loss": 0.135, | |
| "mean_token_accuracy": 0.9690117561817169, | |
| "num_tokens": 6197990663.0, | |
| "step": 43900 | |
| }, | |
| { | |
| "entropy": 1.3517193269729615, | |
| "epoch": 1.43482093304169, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 1.2175189251619168e-06, | |
| "loss": 0.1177, | |
| "mean_token_accuracy": 0.9728011786937714, | |
| "num_tokens": 6204993365.0, | |
| "step": 43950 | |
| }, | |
| { | |
| "entropy": 1.3436356329917907, | |
| "epoch": 1.4364532662988476, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 1.2110124972127686e-06, | |
| "loss": 0.1262, | |
| "mean_token_accuracy": 0.9713046276569366, | |
| "num_tokens": 6212573620.0, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 1.4364532662988476, | |
| "eval_entropy": 1.3463455152511596, | |
| "eval_loss": 0.14246320724487305, | |
| "eval_mean_token_accuracy": 0.9677392840385437, | |
| "eval_num_tokens": 6212573620.0, | |
| "eval_runtime": 754.1482, | |
| "eval_samples_per_second": 12.804, | |
| "eval_steps_per_second": 0.101, | |
| "step": 44000 | |
| }, | |
| { | |
| "entropy": 1.3434760403633117, | |
| "epoch": 1.4380855995560053, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 1.2045191018519415e-06, | |
| "loss": 0.1169, | |
| "mean_token_accuracy": 0.9728634548187256, | |
| "num_tokens": 6219647458.0, | |
| "step": 44050 | |
| }, | |
| { | |
| "entropy": 1.3590314435958861, | |
| "epoch": 1.439717932813163, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 1.1980387863831478e-06, | |
| "loss": 0.126, | |
| "mean_token_accuracy": 0.9703594601154327, | |
| "num_tokens": 6226844151.0, | |
| "step": 44100 | |
| }, | |
| { | |
| "entropy": 1.3532491779327394, | |
| "epoch": 1.4413502660703208, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 1.1915715980148117e-06, | |
| "loss": 0.1309, | |
| "mean_token_accuracy": 0.9699800384044647, | |
| "num_tokens": 6234263907.0, | |
| "step": 44150 | |
| }, | |
| { | |
| "entropy": 1.349997682571411, | |
| "epoch": 1.4429825993274787, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 1.1851175838597306e-06, | |
| "loss": 0.121, | |
| "mean_token_accuracy": 0.9720881938934326, | |
| "num_tokens": 6241048484.0, | |
| "step": 44200 | |
| }, | |
| { | |
| "entropy": 1.354639277458191, | |
| "epoch": 1.4446149325846365, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 1.1786767909347268e-06, | |
| "loss": 0.1375, | |
| "mean_token_accuracy": 0.9680390894412995, | |
| "num_tokens": 6248857975.0, | |
| "step": 44250 | |
| }, | |
| { | |
| "entropy": 1.3580044388771058, | |
| "epoch": 1.4462472658417942, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.1722492661603098e-06, | |
| "loss": 0.126, | |
| "mean_token_accuracy": 0.9711121428012848, | |
| "num_tokens": 6255829230.0, | |
| "step": 44300 | |
| }, | |
| { | |
| "entropy": 1.3534194731712341, | |
| "epoch": 1.4478795990989521, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.165835056360329e-06, | |
| "loss": 0.121, | |
| "mean_token_accuracy": 0.9716777575016021, | |
| "num_tokens": 6262589815.0, | |
| "step": 44350 | |
| }, | |
| { | |
| "entropy": 1.3448237705230712, | |
| "epoch": 1.4495119323561099, | |
| "grad_norm": 1.625, | |
| "learning_rate": 1.1594342082616386e-06, | |
| "loss": 0.1269, | |
| "mean_token_accuracy": 0.9700544607639313, | |
| "num_tokens": 6269556274.0, | |
| "step": 44400 | |
| }, | |
| { | |
| "entropy": 1.3512716341018676, | |
| "epoch": 1.4511442656132676, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 1.1530467684937514e-06, | |
| "loss": 0.1226, | |
| "mean_token_accuracy": 0.9714391076564789, | |
| "num_tokens": 6276345824.0, | |
| "step": 44450 | |
| }, | |
| { | |
| "entropy": 1.3386855101585389, | |
| "epoch": 1.4527765988704253, | |
| "grad_norm": 2.5, | |
| "learning_rate": 1.146672783588504e-06, | |
| "loss": 0.1132, | |
| "mean_token_accuracy": 0.973590886592865, | |
| "num_tokens": 6282741770.0, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 1.4527765988704253, | |
| "eval_entropy": 1.3487922525405884, | |
| "eval_loss": 0.14237141609191895, | |
| "eval_mean_token_accuracy": 0.967642084757487, | |
| "eval_num_tokens": 6282741770.0, | |
| "eval_runtime": 750.7822, | |
| "eval_samples_per_second": 12.861, | |
| "eval_steps_per_second": 0.101, | |
| "step": 44500 | |
| }, | |
| { | |
| "entropy": 1.3471359300613404, | |
| "epoch": 1.454408932127583, | |
| "grad_norm": 0.82421875, | |
| "learning_rate": 1.1403122999797162e-06, | |
| "loss": 0.122, | |
| "mean_token_accuracy": 0.9717613708972931, | |
| "num_tokens": 6289684604.0, | |
| "step": 44550 | |
| }, | |
| { | |
| "entropy": 1.357615110874176, | |
| "epoch": 1.456041265384741, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.133965364002848e-06, | |
| "loss": 0.1318, | |
| "mean_token_accuracy": 0.9699806833267212, | |
| "num_tokens": 6296718052.0, | |
| "step": 44600 | |
| }, | |
| { | |
| "entropy": 1.3517170500755311, | |
| "epoch": 1.4576735986418987, | |
| "grad_norm": 0.004852294921875, | |
| "learning_rate": 1.1276320218946737e-06, | |
| "loss": 0.1167, | |
| "mean_token_accuracy": 0.9736652266979218, | |
| "num_tokens": 6303023793.0, | |
| "step": 44650 | |
| }, | |
| { | |
| "entropy": 1.3588165473937988, | |
| "epoch": 1.4593059318990564, | |
| "grad_norm": 2.375, | |
| "learning_rate": 1.1213123197929296e-06, | |
| "loss": 0.1289, | |
| "mean_token_accuracy": 0.9699364423751831, | |
| "num_tokens": 6310502615.0, | |
| "step": 44700 | |
| }, | |
| { | |
| "entropy": 1.3603587317466737, | |
| "epoch": 1.4609382651562144, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 1.1150063037359927e-06, | |
| "loss": 0.131, | |
| "mean_token_accuracy": 0.9688486230373382, | |
| "num_tokens": 6317956478.0, | |
| "step": 44750 | |
| }, | |
| { | |
| "entropy": 1.3491565418243407, | |
| "epoch": 1.4625705984133721, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 1.108714019662533e-06, | |
| "loss": 0.1178, | |
| "mean_token_accuracy": 0.9722418069839478, | |
| "num_tokens": 6324906866.0, | |
| "step": 44800 | |
| }, | |
| { | |
| "entropy": 1.347842710018158, | |
| "epoch": 1.4642029316705298, | |
| "grad_norm": 2.0, | |
| "learning_rate": 1.1024355134111894e-06, | |
| "loss": 0.1182, | |
| "mean_token_accuracy": 0.9722170174121857, | |
| "num_tokens": 6331464077.0, | |
| "step": 44850 | |
| }, | |
| { | |
| "entropy": 1.3519015336036682, | |
| "epoch": 1.4658352649276876, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 1.096170830720226e-06, | |
| "loss": 0.1297, | |
| "mean_token_accuracy": 0.9708062732219696, | |
| "num_tokens": 6338494070.0, | |
| "step": 44900 | |
| }, | |
| { | |
| "entropy": 1.3482877349853515, | |
| "epoch": 1.4674675981848453, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 1.0899200172272073e-06, | |
| "loss": 0.1274, | |
| "mean_token_accuracy": 0.9707257854938507, | |
| "num_tokens": 6345499457.0, | |
| "step": 44950 | |
| }, | |
| { | |
| "entropy": 1.3303207564353943, | |
| "epoch": 1.4690999314420032, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 1.0836831184686621e-06, | |
| "loss": 0.1154, | |
| "mean_token_accuracy": 0.972674525976181, | |
| "num_tokens": 6352217211.0, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 1.4690999314420032, | |
| "eval_entropy": 1.3471796067555746, | |
| "eval_loss": 0.14247018098831177, | |
| "eval_mean_token_accuracy": 0.9676760347684225, | |
| "eval_num_tokens": 6352217211.0, | |
| "eval_runtime": 746.9306, | |
| "eval_samples_per_second": 12.928, | |
| "eval_steps_per_second": 0.102, | |
| "step": 45000 | |
| }, | |
| { | |
| "entropy": 1.3490448307991028, | |
| "epoch": 1.470732264699161, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 1.0774601798797487e-06, | |
| "loss": 0.1202, | |
| "mean_token_accuracy": 0.971461181640625, | |
| "num_tokens": 6359326989.0, | |
| "step": 45050 | |
| }, | |
| { | |
| "entropy": 1.3362992668151856, | |
| "epoch": 1.4723645979563187, | |
| "grad_norm": 1.4453125, | |
| "learning_rate": 1.071251246793931e-06, | |
| "loss": 0.1305, | |
| "mean_token_accuracy": 0.9699250388145447, | |
| "num_tokens": 6366511833.0, | |
| "step": 45100 | |
| }, | |
| { | |
| "entropy": 1.3491197919845581, | |
| "epoch": 1.4739969312134766, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.0650563644426402e-06, | |
| "loss": 0.1287, | |
| "mean_token_accuracy": 0.9701677405834198, | |
| "num_tokens": 6373790440.0, | |
| "step": 45150 | |
| }, | |
| { | |
| "entropy": 1.3424671697616577, | |
| "epoch": 1.4756292644706344, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 1.0588755779549534e-06, | |
| "loss": 0.1305, | |
| "mean_token_accuracy": 0.9701132154464722, | |
| "num_tokens": 6381354563.0, | |
| "step": 45200 | |
| }, | |
| { | |
| "entropy": 1.3442249703407287, | |
| "epoch": 1.477261597727792, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 1.0527089323572568e-06, | |
| "loss": 0.1235, | |
| "mean_token_accuracy": 0.9716306221485138, | |
| "num_tokens": 6388718219.0, | |
| "step": 45250 | |
| }, | |
| { | |
| "entropy": 1.3491050267219544, | |
| "epoch": 1.4788939309849498, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 1.0465564725729245e-06, | |
| "loss": 0.1337, | |
| "mean_token_accuracy": 0.9686619007587433, | |
| "num_tokens": 6396204282.0, | |
| "step": 45300 | |
| }, | |
| { | |
| "entropy": 1.358343975543976, | |
| "epoch": 1.4805262642421075, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 1.040418243421989e-06, | |
| "loss": 0.1324, | |
| "mean_token_accuracy": 0.9696023035049438, | |
| "num_tokens": 6403488343.0, | |
| "step": 45350 | |
| }, | |
| { | |
| "entropy": 1.3462382221221925, | |
| "epoch": 1.4821585974992655, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 1.0342942896208105e-06, | |
| "loss": 0.1263, | |
| "mean_token_accuracy": 0.9713137638568878, | |
| "num_tokens": 6410641119.0, | |
| "step": 45400 | |
| }, | |
| { | |
| "entropy": 1.3580231857299805, | |
| "epoch": 1.4837909307564232, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 1.028184655781759e-06, | |
| "loss": 0.1288, | |
| "mean_token_accuracy": 0.9706091582775116, | |
| "num_tokens": 6418060885.0, | |
| "step": 45450 | |
| }, | |
| { | |
| "entropy": 1.3437149500846863, | |
| "epoch": 1.485423264013581, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 1.0220893864128809e-06, | |
| "loss": 0.1204, | |
| "mean_token_accuracy": 0.9711262369155884, | |
| "num_tokens": 6424982813.0, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 1.485423264013581, | |
| "eval_entropy": 1.3490506156285604, | |
| "eval_loss": 0.14239265024662018, | |
| "eval_mean_token_accuracy": 0.9676458183924357, | |
| "eval_num_tokens": 6424982813.0, | |
| "eval_runtime": 753.7774, | |
| "eval_samples_per_second": 12.81, | |
| "eval_steps_per_second": 0.101, | |
| "step": 45500 | |
| }, | |
| { | |
| "entropy": 1.3549335837364196, | |
| "epoch": 1.4870555972707389, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 1.0160085259175834e-06, | |
| "loss": 0.12, | |
| "mean_token_accuracy": 0.9724775660037994, | |
| "num_tokens": 6431726519.0, | |
| "step": 45550 | |
| }, | |
| { | |
| "entropy": 1.3539844870567321, | |
| "epoch": 1.4886879305278966, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 1.0099421185943016e-06, | |
| "loss": 0.1171, | |
| "mean_token_accuracy": 0.9727089703083038, | |
| "num_tokens": 6439000590.0, | |
| "step": 45600 | |
| }, | |
| { | |
| "entropy": 1.3505275821685792, | |
| "epoch": 1.4903202637850543, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 1.0038902086361862e-06, | |
| "loss": 0.1234, | |
| "mean_token_accuracy": 0.9712493371963501, | |
| "num_tokens": 6446110346.0, | |
| "step": 45650 | |
| }, | |
| { | |
| "entropy": 1.3557030415534974, | |
| "epoch": 1.491952597042212, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 9.97852840130771e-07, | |
| "loss": 0.1243, | |
| "mean_token_accuracy": 0.9712071406841278, | |
| "num_tokens": 6453417508.0, | |
| "step": 45700 | |
| }, | |
| { | |
| "entropy": 1.3410662007331848, | |
| "epoch": 1.4935849302993698, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 9.918300570596596e-07, | |
| "loss": 0.1175, | |
| "mean_token_accuracy": 0.9729030966758728, | |
| "num_tokens": 6460130049.0, | |
| "step": 45750 | |
| }, | |
| { | |
| "entropy": 1.3425062656402589, | |
| "epoch": 1.4952172635565277, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 9.858219032982019e-07, | |
| "loss": 0.1225, | |
| "mean_token_accuracy": 0.9715787386894226, | |
| "num_tokens": 6467242766.0, | |
| "step": 45800 | |
| }, | |
| { | |
| "entropy": 1.3404623532295228, | |
| "epoch": 1.4968495968136855, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 9.798284226151751e-07, | |
| "loss": 0.1299, | |
| "mean_token_accuracy": 0.9696118628978729, | |
| "num_tokens": 6474698613.0, | |
| "step": 45850 | |
| }, | |
| { | |
| "entropy": 1.3416164851188659, | |
| "epoch": 1.4984819300708432, | |
| "grad_norm": 1.21875, | |
| "learning_rate": 9.738496586724644e-07, | |
| "loss": 0.1247, | |
| "mean_token_accuracy": 0.9708961296081543, | |
| "num_tokens": 6481733346.0, | |
| "step": 45900 | |
| }, | |
| { | |
| "entropy": 1.3436065912246704, | |
| "epoch": 1.5001142633280011, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 9.678856550247433e-07, | |
| "loss": 0.1197, | |
| "mean_token_accuracy": 0.9722868132591248, | |
| "num_tokens": 6488600216.0, | |
| "step": 45950 | |
| }, | |
| { | |
| "entropy": 1.3471774291992187, | |
| "epoch": 1.5017465965851589, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 9.619364551191615e-07, | |
| "loss": 0.1234, | |
| "mean_token_accuracy": 0.9715406239032746, | |
| "num_tokens": 6495842641.0, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 1.5017465965851589, | |
| "eval_entropy": 1.3393350235621135, | |
| "eval_loss": 0.14240698516368866, | |
| "eval_mean_token_accuracy": 0.967587119738261, | |
| "eval_num_tokens": 6495842641.0, | |
| "eval_runtime": 753.9166, | |
| "eval_samples_per_second": 12.808, | |
| "eval_steps_per_second": 0.101, | |
| "step": 46000 | |
| }, | |
| { | |
| "entropy": 1.3315513157844543, | |
| "epoch": 1.5033789298423166, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 9.560021022950201e-07, | |
| "loss": 0.1152, | |
| "mean_token_accuracy": 0.9735651075839996, | |
| "num_tokens": 6502730531.0, | |
| "step": 46050 | |
| }, | |
| { | |
| "entropy": 1.3440596199035644, | |
| "epoch": 1.5050112630994743, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 9.500826397834667e-07, | |
| "loss": 0.1363, | |
| "mean_token_accuracy": 0.9686136949062347, | |
| "num_tokens": 6510207647.0, | |
| "step": 46100 | |
| }, | |
| { | |
| "entropy": 1.3446049523353576, | |
| "epoch": 1.506643596356632, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 9.44178110707169e-07, | |
| "loss": 0.1165, | |
| "mean_token_accuracy": 0.972632863521576, | |
| "num_tokens": 6517212379.0, | |
| "step": 46150 | |
| }, | |
| { | |
| "entropy": 1.3464979553222656, | |
| "epoch": 1.50827592961379, | |
| "grad_norm": 1.125, | |
| "learning_rate": 9.382885580800094e-07, | |
| "loss": 0.1341, | |
| "mean_token_accuracy": 0.969519715309143, | |
| "num_tokens": 6524843034.0, | |
| "step": 46200 | |
| }, | |
| { | |
| "entropy": 1.337069320678711, | |
| "epoch": 1.5099082628709477, | |
| "grad_norm": 1.3046875, | |
| "learning_rate": 9.324140248067691e-07, | |
| "loss": 0.123, | |
| "mean_token_accuracy": 0.9713637149333954, | |
| "num_tokens": 6531941894.0, | |
| "step": 46250 | |
| }, | |
| { | |
| "entropy": 1.3362834978103637, | |
| "epoch": 1.5115405961281057, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 9.265545536828111e-07, | |
| "loss": 0.1154, | |
| "mean_token_accuracy": 0.9725685477256775, | |
| "num_tokens": 6538791279.0, | |
| "step": 46300 | |
| }, | |
| { | |
| "entropy": 1.338006112575531, | |
| "epoch": 1.5131729293852634, | |
| "grad_norm": 1.5, | |
| "learning_rate": 9.207101873937768e-07, | |
| "loss": 0.1259, | |
| "mean_token_accuracy": 0.971262993812561, | |
| "num_tokens": 6545902334.0, | |
| "step": 46350 | |
| }, | |
| { | |
| "entropy": 1.3359116435050964, | |
| "epoch": 1.514805262642421, | |
| "grad_norm": 3.1875, | |
| "learning_rate": 9.14880968515266e-07, | |
| "loss": 0.1162, | |
| "mean_token_accuracy": 0.9728143513202667, | |
| "num_tokens": 6553190103.0, | |
| "step": 46400 | |
| }, | |
| { | |
| "entropy": 1.3362672972679137, | |
| "epoch": 1.5164375958995788, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 9.090669395125351e-07, | |
| "loss": 0.1155, | |
| "mean_token_accuracy": 0.9727682447433472, | |
| "num_tokens": 6559814465.0, | |
| "step": 46450 | |
| }, | |
| { | |
| "entropy": 1.3332003378868102, | |
| "epoch": 1.5180699291567366, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 9.032681427401806e-07, | |
| "loss": 0.1094, | |
| "mean_token_accuracy": 0.9738853967189789, | |
| "num_tokens": 6566382565.0, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 1.5180699291567366, | |
| "eval_entropy": 1.3380318832397462, | |
| "eval_loss": 0.14237698912620544, | |
| "eval_mean_token_accuracy": 0.9675415523846944, | |
| "eval_num_tokens": 6566382565.0, | |
| "eval_runtime": 751.4412, | |
| "eval_samples_per_second": 12.85, | |
| "eval_steps_per_second": 0.101, | |
| "step": 46500 | |
| }, | |
| { | |
| "entropy": 1.3400082111358642, | |
| "epoch": 1.5197022624138943, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 8.974846204418361e-07, | |
| "loss": 0.1245, | |
| "mean_token_accuracy": 0.9713966703414917, | |
| "num_tokens": 6573501735.0, | |
| "step": 46550 | |
| }, | |
| { | |
| "entropy": 1.3266588830947876, | |
| "epoch": 1.5213345956710522, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 8.917164147498621e-07, | |
| "loss": 0.1156, | |
| "mean_token_accuracy": 0.9723888230323792, | |
| "num_tokens": 6580331916.0, | |
| "step": 46600 | |
| }, | |
| { | |
| "entropy": 1.340979859828949, | |
| "epoch": 1.52296692892821, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 8.859635676850372e-07, | |
| "loss": 0.1174, | |
| "mean_token_accuracy": 0.9720415270328522, | |
| "num_tokens": 6586882066.0, | |
| "step": 46650 | |
| }, | |
| { | |
| "entropy": 1.338642556667328, | |
| "epoch": 1.524599262185368, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 8.802261211562563e-07, | |
| "loss": 0.1206, | |
| "mean_token_accuracy": 0.9713714873790741, | |
| "num_tokens": 6593693750.0, | |
| "step": 46700 | |
| }, | |
| { | |
| "entropy": 1.3478483366966247, | |
| "epoch": 1.5262315954425256, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 8.745041169602207e-07, | |
| "loss": 0.1278, | |
| "mean_token_accuracy": 0.9710940301418305, | |
| "num_tokens": 6601030060.0, | |
| "step": 46750 | |
| }, | |
| { | |
| "entropy": 1.3367830848693847, | |
| "epoch": 1.5278639286996833, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 8.687975967811393e-07, | |
| "loss": 0.1235, | |
| "mean_token_accuracy": 0.9711613404750824, | |
| "num_tokens": 6607766556.0, | |
| "step": 46800 | |
| }, | |
| { | |
| "entropy": 1.3393477821350097, | |
| "epoch": 1.529496261956841, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 8.631066021904173e-07, | |
| "loss": 0.1281, | |
| "mean_token_accuracy": 0.9700063776969909, | |
| "num_tokens": 6615264797.0, | |
| "step": 46850 | |
| }, | |
| { | |
| "entropy": 1.3377933168411256, | |
| "epoch": 1.5311285952139988, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 8.574311746463602e-07, | |
| "loss": 0.1219, | |
| "mean_token_accuracy": 0.9712197721004486, | |
| "num_tokens": 6622625782.0, | |
| "step": 46900 | |
| }, | |
| { | |
| "entropy": 1.3495184230804442, | |
| "epoch": 1.5327609284711565, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 8.517713554938698e-07, | |
| "loss": 0.1291, | |
| "mean_token_accuracy": 0.9702865195274353, | |
| "num_tokens": 6629917201.0, | |
| "step": 46950 | |
| }, | |
| { | |
| "entropy": 1.337637755870819, | |
| "epoch": 1.5343932617283145, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 8.461271859641413e-07, | |
| "loss": 0.124, | |
| "mean_token_accuracy": 0.9718515348434448, | |
| "num_tokens": 6636921749.0, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 1.5343932617283145, | |
| "eval_entropy": 1.3388334194819131, | |
| "eval_loss": 0.14232522249221802, | |
| "eval_mean_token_accuracy": 0.9674970960617065, | |
| "eval_num_tokens": 6636921749.0, | |
| "eval_runtime": 752.3993, | |
| "eval_samples_per_second": 12.834, | |
| "eval_steps_per_second": 0.101, | |
| "step": 47000 | |
| }, | |
| { | |
| "entropy": 1.3416547775268555, | |
| "epoch": 1.5360255949854722, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 8.404987071743628e-07, | |
| "loss": 0.1152, | |
| "mean_token_accuracy": 0.9724602663516998, | |
| "num_tokens": 6643500789.0, | |
| "step": 47050 | |
| }, | |
| { | |
| "entropy": 1.3312445497512817, | |
| "epoch": 1.5376579282426301, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 8.348859601274191e-07, | |
| "loss": 0.1141, | |
| "mean_token_accuracy": 0.9736961400508881, | |
| "num_tokens": 6650321316.0, | |
| "step": 47100 | |
| }, | |
| { | |
| "entropy": 1.3447378778457642, | |
| "epoch": 1.5392902614997879, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 8.292889857115906e-07, | |
| "loss": 0.1251, | |
| "mean_token_accuracy": 0.9713517308235169, | |
| "num_tokens": 6657596511.0, | |
| "step": 47150 | |
| }, | |
| { | |
| "entropy": 1.3332865810394288, | |
| "epoch": 1.5409225947569456, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 8.237078247002536e-07, | |
| "loss": 0.1144, | |
| "mean_token_accuracy": 0.9731272792816162, | |
| "num_tokens": 6664455846.0, | |
| "step": 47200 | |
| }, | |
| { | |
| "entropy": 1.3360036635398864, | |
| "epoch": 1.5425549280141033, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 8.181425177515887e-07, | |
| "loss": 0.1181, | |
| "mean_token_accuracy": 0.9728715085983276, | |
| "num_tokens": 6671444402.0, | |
| "step": 47250 | |
| }, | |
| { | |
| "entropy": 1.3451998877525329, | |
| "epoch": 1.544187261271261, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 8.125931054082775e-07, | |
| "loss": 0.1189, | |
| "mean_token_accuracy": 0.9724408376216889, | |
| "num_tokens": 6678449907.0, | |
| "step": 47300 | |
| }, | |
| { | |
| "entropy": 1.346355800628662, | |
| "epoch": 1.5458195945284188, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 8.070596280972152e-07, | |
| "loss": 0.1311, | |
| "mean_token_accuracy": 0.9690172004699708, | |
| "num_tokens": 6685797698.0, | |
| "step": 47350 | |
| }, | |
| { | |
| "entropy": 1.341564166545868, | |
| "epoch": 1.5474519277855767, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 8.01542126129208e-07, | |
| "loss": 0.1231, | |
| "mean_token_accuracy": 0.9712344872951507, | |
| "num_tokens": 6692582052.0, | |
| "step": 47400 | |
| }, | |
| { | |
| "entropy": 1.3289701747894287, | |
| "epoch": 1.5490842610427344, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 7.960406396986855e-07, | |
| "loss": 0.1137, | |
| "mean_token_accuracy": 0.9735037076473236, | |
| "num_tokens": 6699347585.0, | |
| "step": 47450 | |
| }, | |
| { | |
| "entropy": 1.3448949909210206, | |
| "epoch": 1.5507165942998924, | |
| "grad_norm": 1.875, | |
| "learning_rate": 7.905552088834074e-07, | |
| "loss": 0.1267, | |
| "mean_token_accuracy": 0.9710251951217651, | |
| "num_tokens": 6706378386.0, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 1.5507165942998924, | |
| "eval_entropy": 1.338226900100708, | |
| "eval_loss": 0.14231520891189575, | |
| "eval_mean_token_accuracy": 0.9675397229194641, | |
| "eval_num_tokens": 6706378386.0, | |
| "eval_runtime": 753.0219, | |
| "eval_samples_per_second": 12.823, | |
| "eval_steps_per_second": 0.101, | |
| "step": 47500 | |
| }, | |
| { | |
| "entropy": 1.340337586402893, | |
| "epoch": 1.5523489275570501, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 7.850858736441654e-07, | |
| "loss": 0.1177, | |
| "mean_token_accuracy": 0.9723483467102051, | |
| "num_tokens": 6713336578.0, | |
| "step": 47550 | |
| }, | |
| { | |
| "entropy": 1.3498352313041686, | |
| "epoch": 1.5539812608142078, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 7.796326738245014e-07, | |
| "loss": 0.1213, | |
| "mean_token_accuracy": 0.9714156925678253, | |
| "num_tokens": 6720699596.0, | |
| "step": 47600 | |
| }, | |
| { | |
| "entropy": 1.3324111270904542, | |
| "epoch": 1.5556135940713656, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 7.741956491504081e-07, | |
| "loss": 0.1174, | |
| "mean_token_accuracy": 0.9724169254302979, | |
| "num_tokens": 6727286916.0, | |
| "step": 47650 | |
| }, | |
| { | |
| "entropy": 1.3425724387168885, | |
| "epoch": 1.5572459273285233, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 7.687748392300481e-07, | |
| "loss": 0.1211, | |
| "mean_token_accuracy": 0.9714575302600861, | |
| "num_tokens": 6734228986.0, | |
| "step": 47700 | |
| }, | |
| { | |
| "entropy": 1.3485777735710145, | |
| "epoch": 1.558878260585681, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 7.633702835534574e-07, | |
| "loss": 0.1245, | |
| "mean_token_accuracy": 0.9718271470069886, | |
| "num_tokens": 6741417194.0, | |
| "step": 47750 | |
| }, | |
| { | |
| "entropy": 1.3329823040962219, | |
| "epoch": 1.560510593842839, | |
| "grad_norm": 1.453125, | |
| "learning_rate": 7.579820214922639e-07, | |
| "loss": 0.1068, | |
| "mean_token_accuracy": 0.9753419077396392, | |
| "num_tokens": 6747898229.0, | |
| "step": 47800 | |
| }, | |
| { | |
| "entropy": 1.3319637727737428, | |
| "epoch": 1.5621429270999967, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 7.526100922993989e-07, | |
| "loss": 0.1122, | |
| "mean_token_accuracy": 0.9736012244224548, | |
| "num_tokens": 6754984506.0, | |
| "step": 47850 | |
| }, | |
| { | |
| "entropy": 1.3472395992279054, | |
| "epoch": 1.5637752603571546, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 7.472545351088072e-07, | |
| "loss": 0.1171, | |
| "mean_token_accuracy": 0.9723641383647919, | |
| "num_tokens": 6761747061.0, | |
| "step": 47900 | |
| }, | |
| { | |
| "entropy": 1.322529821395874, | |
| "epoch": 1.5654075936143124, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 7.419153889351687e-07, | |
| "loss": 0.1112, | |
| "mean_token_accuracy": 0.9733606302738189, | |
| "num_tokens": 6768792123.0, | |
| "step": 47950 | |
| }, | |
| { | |
| "entropy": 1.338774642944336, | |
| "epoch": 1.56703992687147, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 7.365926926736079e-07, | |
| "loss": 0.1298, | |
| "mean_token_accuracy": 0.969396116733551, | |
| "num_tokens": 6776660559.0, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 1.56703992687147, | |
| "eval_entropy": 1.3360849984486898, | |
| "eval_loss": 0.14228671789169312, | |
| "eval_mean_token_accuracy": 0.9675799965858459, | |
| "eval_num_tokens": 6776660559.0, | |
| "eval_runtime": 744.7808, | |
| "eval_samples_per_second": 12.965, | |
| "eval_steps_per_second": 0.102, | |
| "step": 48000 | |
| }, | |
| { | |
| "entropy": 1.3329459977149964, | |
| "epoch": 1.5686722601286278, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 7.312864850994151e-07, | |
| "loss": 0.1107, | |
| "mean_token_accuracy": 0.9741031527519226, | |
| "num_tokens": 6783181818.0, | |
| "step": 48050 | |
| }, | |
| { | |
| "entropy": 1.3349730682373047, | |
| "epoch": 1.5703045933857855, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 7.259968048677626e-07, | |
| "loss": 0.1136, | |
| "mean_token_accuracy": 0.9728020560741425, | |
| "num_tokens": 6790226377.0, | |
| "step": 48100 | |
| }, | |
| { | |
| "entropy": 1.3363687252998353, | |
| "epoch": 1.5719369266429433, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 7.207236905134222e-07, | |
| "loss": 0.1159, | |
| "mean_token_accuracy": 0.9732009255886078, | |
| "num_tokens": 6797331847.0, | |
| "step": 48150 | |
| }, | |
| { | |
| "entropy": 1.3224567222595214, | |
| "epoch": 1.5735692599001012, | |
| "grad_norm": 2.375, | |
| "learning_rate": 7.154671804504838e-07, | |
| "loss": 0.1187, | |
| "mean_token_accuracy": 0.9726608419418334, | |
| "num_tokens": 6804197080.0, | |
| "step": 48200 | |
| }, | |
| { | |
| "entropy": 1.3282025313377381, | |
| "epoch": 1.575201593157259, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 7.102273129720785e-07, | |
| "loss": 0.1171, | |
| "mean_token_accuracy": 0.9726303327083587, | |
| "num_tokens": 6811554275.0, | |
| "step": 48250 | |
| }, | |
| { | |
| "entropy": 1.3339614725112916, | |
| "epoch": 1.5768339264144169, | |
| "grad_norm": 1.4765625, | |
| "learning_rate": 7.050041262500963e-07, | |
| "loss": 0.1211, | |
| "mean_token_accuracy": 0.9716296088695526, | |
| "num_tokens": 6818575585.0, | |
| "step": 48300 | |
| }, | |
| { | |
| "entropy": 1.3479553842544556, | |
| "epoch": 1.5784662596715746, | |
| "grad_norm": 1.25, | |
| "learning_rate": 6.99797658334911e-07, | |
| "loss": 0.1297, | |
| "mean_token_accuracy": 0.9702737581729889, | |
| "num_tokens": 6825774443.0, | |
| "step": 48350 | |
| }, | |
| { | |
| "entropy": 1.3463473081588746, | |
| "epoch": 1.5800985929287323, | |
| "grad_norm": 1.4296875, | |
| "learning_rate": 6.946079471551018e-07, | |
| "loss": 0.1289, | |
| "mean_token_accuracy": 0.9703529167175293, | |
| "num_tokens": 6833071654.0, | |
| "step": 48400 | |
| }, | |
| { | |
| "entropy": 1.3393766927719115, | |
| "epoch": 1.58173092618589, | |
| "grad_norm": 2.0, | |
| "learning_rate": 6.894350305171747e-07, | |
| "loss": 0.1196, | |
| "mean_token_accuracy": 0.9719527661800385, | |
| "num_tokens": 6840009616.0, | |
| "step": 48450 | |
| }, | |
| { | |
| "entropy": 1.333970193862915, | |
| "epoch": 1.5833632594430478, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 6.842789461052923e-07, | |
| "loss": 0.1157, | |
| "mean_token_accuracy": 0.9726070737838746, | |
| "num_tokens": 6847179809.0, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 1.5833632594430478, | |
| "eval_entropy": 1.3332714064915976, | |
| "eval_loss": 0.14229924976825714, | |
| "eval_mean_token_accuracy": 0.9675857615470886, | |
| "eval_num_tokens": 6847179809.0, | |
| "eval_runtime": 747.985, | |
| "eval_samples_per_second": 12.909, | |
| "eval_steps_per_second": 0.102, | |
| "step": 48500 | |
| }, | |
| { | |
| "entropy": 1.3204735660552978, | |
| "epoch": 1.5849955927002055, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 6.791397314809928e-07, | |
| "loss": 0.107, | |
| "mean_token_accuracy": 0.9743911874294281, | |
| "num_tokens": 6853704094.0, | |
| "step": 48550 | |
| }, | |
| { | |
| "entropy": 1.3412930655479431, | |
| "epoch": 1.5866279259573635, | |
| "grad_norm": 2.0, | |
| "learning_rate": 6.740174240829229e-07, | |
| "loss": 0.1119, | |
| "mean_token_accuracy": 0.973388170003891, | |
| "num_tokens": 6860416510.0, | |
| "step": 48600 | |
| }, | |
| { | |
| "entropy": 1.3327165865898132, | |
| "epoch": 1.5882602592145212, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 6.689120612265592e-07, | |
| "loss": 0.1166, | |
| "mean_token_accuracy": 0.9722766876220703, | |
| "num_tokens": 6867304006.0, | |
| "step": 48650 | |
| }, | |
| { | |
| "entropy": 1.3274698781967162, | |
| "epoch": 1.5898925924716791, | |
| "grad_norm": 1.3984375, | |
| "learning_rate": 6.638236801039406e-07, | |
| "loss": 0.1179, | |
| "mean_token_accuracy": 0.9724640393257141, | |
| "num_tokens": 6874107973.0, | |
| "step": 48700 | |
| }, | |
| { | |
| "entropy": 1.3326347541809083, | |
| "epoch": 1.5915249257288369, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 6.587523177833969e-07, | |
| "loss": 0.119, | |
| "mean_token_accuracy": 0.9718975865840912, | |
| "num_tokens": 6881463304.0, | |
| "step": 48750 | |
| }, | |
| { | |
| "entropy": 1.3314825320243835, | |
| "epoch": 1.5931572589859946, | |
| "grad_norm": 1.3828125, | |
| "learning_rate": 6.536980112092748e-07, | |
| "loss": 0.1296, | |
| "mean_token_accuracy": 0.970567889213562, | |
| "num_tokens": 6888936324.0, | |
| "step": 48800 | |
| }, | |
| { | |
| "entropy": 1.335792977809906, | |
| "epoch": 1.5947895922431523, | |
| "grad_norm": 1.3828125, | |
| "learning_rate": 6.486607972016746e-07, | |
| "loss": 0.1091, | |
| "mean_token_accuracy": 0.9739424622058869, | |
| "num_tokens": 6895517659.0, | |
| "step": 48850 | |
| }, | |
| { | |
| "entropy": 1.3427368450164794, | |
| "epoch": 1.59642192550031, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 6.436407124561761e-07, | |
| "loss": 0.12, | |
| "mean_token_accuracy": 0.9717347931861877, | |
| "num_tokens": 6902937758.0, | |
| "step": 48900 | |
| }, | |
| { | |
| "entropy": 1.3364178919792176, | |
| "epoch": 1.5980542587574678, | |
| "grad_norm": 1.5, | |
| "learning_rate": 6.386377935435774e-07, | |
| "loss": 0.1177, | |
| "mean_token_accuracy": 0.9717580342292785, | |
| "num_tokens": 6909650064.0, | |
| "step": 48950 | |
| }, | |
| { | |
| "entropy": 1.3345691514015199, | |
| "epoch": 1.5996865920146257, | |
| "grad_norm": 1.4296875, | |
| "learning_rate": 6.336520769096215e-07, | |
| "loss": 0.1242, | |
| "mean_token_accuracy": 0.9705976390838623, | |
| "num_tokens": 6916954844.0, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 1.5996865920146257, | |
| "eval_entropy": 1.3313024059931438, | |
| "eval_loss": 0.14232422411441803, | |
| "eval_mean_token_accuracy": 0.9676321744918823, | |
| "eval_num_tokens": 6916954844.0, | |
| "eval_runtime": 750.8283, | |
| "eval_samples_per_second": 12.86, | |
| "eval_steps_per_second": 0.101, | |
| "step": 49000 | |
| }, | |
| { | |
| "entropy": 1.3268843412399292, | |
| "epoch": 1.6013189252717834, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 6.286835988747385e-07, | |
| "loss": 0.12, | |
| "mean_token_accuracy": 0.9719444465637207, | |
| "num_tokens": 6924452826.0, | |
| "step": 49050 | |
| }, | |
| { | |
| "entropy": 1.3368324255943298, | |
| "epoch": 1.6029512585289414, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 6.237323956337755e-07, | |
| "loss": 0.1192, | |
| "mean_token_accuracy": 0.9717724549770356, | |
| "num_tokens": 6931690729.0, | |
| "step": 49100 | |
| }, | |
| { | |
| "entropy": 1.3325298118591309, | |
| "epoch": 1.604583591786099, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 6.18798503255733e-07, | |
| "loss": 0.1192, | |
| "mean_token_accuracy": 0.9713486981391907, | |
| "num_tokens": 6938882010.0, | |
| "step": 49150 | |
| }, | |
| { | |
| "entropy": 1.339358766078949, | |
| "epoch": 1.6062159250432568, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 6.138819576835056e-07, | |
| "loss": 0.1148, | |
| "mean_token_accuracy": 0.9726817321777343, | |
| "num_tokens": 6945842260.0, | |
| "step": 49200 | |
| }, | |
| { | |
| "entropy": 1.343044672012329, | |
| "epoch": 1.6078482583004146, | |
| "grad_norm": 1.375, | |
| "learning_rate": 6.089827947336176e-07, | |
| "loss": 0.1264, | |
| "mean_token_accuracy": 0.9707775366306305, | |
| "num_tokens": 6953111267.0, | |
| "step": 49250 | |
| }, | |
| { | |
| "entropy": 1.3318463802337646, | |
| "epoch": 1.6094805915575723, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 6.041010500959636e-07, | |
| "loss": 0.1129, | |
| "mean_token_accuracy": 0.9740157461166382, | |
| "num_tokens": 6960306994.0, | |
| "step": 49300 | |
| }, | |
| { | |
| "entropy": 1.329837028980255, | |
| "epoch": 1.6111129248147302, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 5.992367593335453e-07, | |
| "loss": 0.1108, | |
| "mean_token_accuracy": 0.9734898245334626, | |
| "num_tokens": 6966883891.0, | |
| "step": 49350 | |
| }, | |
| { | |
| "entropy": 1.3326479887962341, | |
| "epoch": 1.612745258071888, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 5.943899578822175e-07, | |
| "loss": 0.1136, | |
| "mean_token_accuracy": 0.9735347747802734, | |
| "num_tokens": 6973945682.0, | |
| "step": 49400 | |
| }, | |
| { | |
| "entropy": 1.326375277042389, | |
| "epoch": 1.614377591329046, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 5.895606810504245e-07, | |
| "loss": 0.1126, | |
| "mean_token_accuracy": 0.9733131611347199, | |
| "num_tokens": 6980960627.0, | |
| "step": 49450 | |
| }, | |
| { | |
| "entropy": 1.3266846776008605, | |
| "epoch": 1.6160099245862036, | |
| "grad_norm": 1.625, | |
| "learning_rate": 5.847489640189483e-07, | |
| "loss": 0.1138, | |
| "mean_token_accuracy": 0.9726812386512756, | |
| "num_tokens": 6987961577.0, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 1.6160099245862036, | |
| "eval_entropy": 1.327593413988749, | |
| "eval_loss": 0.1423512101173401, | |
| "eval_mean_token_accuracy": 0.9675911315282186, | |
| "eval_num_tokens": 6987961577.0, | |
| "eval_runtime": 752.8917, | |
| "eval_samples_per_second": 12.825, | |
| "eval_steps_per_second": 0.101, | |
| "step": 49500 | |
| }, | |
| { | |
| "entropy": 1.3377934908866882, | |
| "epoch": 1.6176422578433614, | |
| "grad_norm": 1.21875, | |
| "learning_rate": 5.799548418406465e-07, | |
| "loss": 0.1259, | |
| "mean_token_accuracy": 0.9708180844783783, | |
| "num_tokens": 6995561320.0, | |
| "step": 49550 | |
| }, | |
| { | |
| "entropy": 1.329156894683838, | |
| "epoch": 1.619274591100519, | |
| "grad_norm": 0.10498046875, | |
| "learning_rate": 5.751783494402026e-07, | |
| "loss": 0.1231, | |
| "mean_token_accuracy": 0.9713264811038971, | |
| "num_tokens": 7002493076.0, | |
| "step": 49600 | |
| }, | |
| { | |
| "entropy": 1.3364929604530333, | |
| "epoch": 1.6209069243576768, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 5.704195216138692e-07, | |
| "loss": 0.1268, | |
| "mean_token_accuracy": 0.9701859080791473, | |
| "num_tokens": 7010022324.0, | |
| "step": 49650 | |
| }, | |
| { | |
| "entropy": 1.3322447371482848, | |
| "epoch": 1.6225392576148345, | |
| "grad_norm": 0.0128173828125, | |
| "learning_rate": 5.656783930292111e-07, | |
| "loss": 0.1099, | |
| "mean_token_accuracy": 0.9747293889522552, | |
| "num_tokens": 7016790835.0, | |
| "step": 49700 | |
| }, | |
| { | |
| "entropy": 1.3278017139434815, | |
| "epoch": 1.6241715908719925, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 5.609549982248599e-07, | |
| "loss": 0.1191, | |
| "mean_token_accuracy": 0.9724988090991974, | |
| "num_tokens": 7023902551.0, | |
| "step": 49750 | |
| }, | |
| { | |
| "entropy": 1.3292198777198792, | |
| "epoch": 1.6258039241291502, | |
| "grad_norm": 1.2734375, | |
| "learning_rate": 5.562493716102552e-07, | |
| "loss": 0.1221, | |
| "mean_token_accuracy": 0.9709643149375915, | |
| "num_tokens": 7031377627.0, | |
| "step": 49800 | |
| }, | |
| { | |
| "entropy": 1.324635624885559, | |
| "epoch": 1.6274362573863081, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 5.515615474653998e-07, | |
| "loss": 0.124, | |
| "mean_token_accuracy": 0.9710781908035279, | |
| "num_tokens": 7039286978.0, | |
| "step": 49850 | |
| }, | |
| { | |
| "entropy": 1.3296051907539368, | |
| "epoch": 1.6290685906434659, | |
| "grad_norm": 1.2734375, | |
| "learning_rate": 5.46891559940605e-07, | |
| "loss": 0.108, | |
| "mean_token_accuracy": 0.9743620455265045, | |
| "num_tokens": 7046170706.0, | |
| "step": 49900 | |
| }, | |
| { | |
| "entropy": 1.3330603170394897, | |
| "epoch": 1.6307009239006236, | |
| "grad_norm": 1.3828125, | |
| "learning_rate": 5.422394430562457e-07, | |
| "loss": 0.1062, | |
| "mean_token_accuracy": 0.9752356350421906, | |
| "num_tokens": 7052934305.0, | |
| "step": 49950 | |
| }, | |
| { | |
| "entropy": 1.3311968517303467, | |
| "epoch": 1.6323332571577813, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 5.376052307025119e-07, | |
| "loss": 0.1239, | |
| "mean_token_accuracy": 0.9706245791912079, | |
| "num_tokens": 7060043084.0, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 1.6323332571577813, | |
| "eval_entropy": 1.3246519072850544, | |
| "eval_loss": 0.14238578081130981, | |
| "eval_mean_token_accuracy": 0.9675867708524069, | |
| "eval_num_tokens": 7060043084.0, | |
| "eval_runtime": 752.4961, | |
| "eval_samples_per_second": 12.832, | |
| "eval_steps_per_second": 0.101, | |
| "step": 50000 | |
| }, | |
| { | |
| "entropy": 1.3305332589149474, | |
| "epoch": 1.633965590414939, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 5.329889566391578e-07, | |
| "loss": 0.1144, | |
| "mean_token_accuracy": 0.9729759168624877, | |
| "num_tokens": 7066947331.0, | |
| "step": 50050 | |
| }, | |
| { | |
| "entropy": 1.3186654925346375, | |
| "epoch": 1.6355979236720968, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 5.283906544952627e-07, | |
| "loss": 0.1132, | |
| "mean_token_accuracy": 0.9731574881076813, | |
| "num_tokens": 7074008041.0, | |
| "step": 50100 | |
| }, | |
| { | |
| "entropy": 1.318843502998352, | |
| "epoch": 1.6372302569292547, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 5.238103577689788e-07, | |
| "loss": 0.1161, | |
| "mean_token_accuracy": 0.9729295611381531, | |
| "num_tokens": 7081071001.0, | |
| "step": 50150 | |
| }, | |
| { | |
| "entropy": 1.3307886505126953, | |
| "epoch": 1.6388625901864124, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 5.192480998272943e-07, | |
| "loss": 0.1142, | |
| "mean_token_accuracy": 0.972575945854187, | |
| "num_tokens": 7088102191.0, | |
| "step": 50200 | |
| }, | |
| { | |
| "entropy": 1.335254201889038, | |
| "epoch": 1.6404949234435704, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 5.147039139057831e-07, | |
| "loss": 0.1271, | |
| "mean_token_accuracy": 0.9703405356407165, | |
| "num_tokens": 7095646828.0, | |
| "step": 50250 | |
| }, | |
| { | |
| "entropy": 1.3202842998504638, | |
| "epoch": 1.6421272567007281, | |
| "grad_norm": 2.125, | |
| "learning_rate": 5.101778331083691e-07, | |
| "loss": 0.1085, | |
| "mean_token_accuracy": 0.9740463018417358, | |
| "num_tokens": 7102740051.0, | |
| "step": 50300 | |
| }, | |
| { | |
| "entropy": 1.3307878541946412, | |
| "epoch": 1.6437595899578858, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 5.05669890407081e-07, | |
| "loss": 0.1114, | |
| "mean_token_accuracy": 0.9739047718048096, | |
| "num_tokens": 7109640526.0, | |
| "step": 50350 | |
| }, | |
| { | |
| "entropy": 1.3166346144676209, | |
| "epoch": 1.6453919232150436, | |
| "grad_norm": 0.08984375, | |
| "learning_rate": 5.011801186418147e-07, | |
| "loss": 0.1082, | |
| "mean_token_accuracy": 0.974234766960144, | |
| "num_tokens": 7116612724.0, | |
| "step": 50400 | |
| }, | |
| { | |
| "entropy": 1.324273819923401, | |
| "epoch": 1.6470242564722013, | |
| "grad_norm": 1.75, | |
| "learning_rate": 4.967085505200896e-07, | |
| "loss": 0.1145, | |
| "mean_token_accuracy": 0.9735607969760894, | |
| "num_tokens": 7123679576.0, | |
| "step": 50450 | |
| }, | |
| { | |
| "entropy": 1.3159007930755615, | |
| "epoch": 1.648656589729359, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 4.922552186168168e-07, | |
| "loss": 0.1175, | |
| "mean_token_accuracy": 0.9724935472011567, | |
| "num_tokens": 7130980530.0, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 1.648656589729359, | |
| "eval_entropy": 1.324186561902364, | |
| "eval_loss": 0.14239400625228882, | |
| "eval_mean_token_accuracy": 0.9676336812973022, | |
| "eval_num_tokens": 7130980530.0, | |
| "eval_runtime": 748.1515, | |
| "eval_samples_per_second": 12.906, | |
| "eval_steps_per_second": 0.102, | |
| "step": 50500 | |
| }, | |
| { | |
| "entropy": 1.320852587223053, | |
| "epoch": 1.650288922986517, | |
| "grad_norm": 1.2421875, | |
| "learning_rate": 4.878201553740573e-07, | |
| "loss": 0.1148, | |
| "mean_token_accuracy": 0.9720003747940064, | |
| "num_tokens": 7137814687.0, | |
| "step": 50550 | |
| }, | |
| { | |
| "entropy": 1.3245435237884522, | |
| "epoch": 1.6519212562436747, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 4.834033931007857e-07, | |
| "loss": 0.1079, | |
| "mean_token_accuracy": 0.9746451807022095, | |
| "num_tokens": 7144119513.0, | |
| "step": 50600 | |
| }, | |
| { | |
| "entropy": 1.317584047317505, | |
| "epoch": 1.6535535895008326, | |
| "grad_norm": 1.5390625, | |
| "learning_rate": 4.790049639726581e-07, | |
| "loss": 0.1097, | |
| "mean_token_accuracy": 0.9739763534069061, | |
| "num_tokens": 7150741274.0, | |
| "step": 50650 | |
| }, | |
| { | |
| "entropy": 1.3234117150306701, | |
| "epoch": 1.6551859227579904, | |
| "grad_norm": 0.9921875, | |
| "learning_rate": 4.746249000317725e-07, | |
| "loss": 0.1069, | |
| "mean_token_accuracy": 0.9751713788509369, | |
| "num_tokens": 7157786331.0, | |
| "step": 50700 | |
| }, | |
| { | |
| "entropy": 1.3205085873603821, | |
| "epoch": 1.656818256015148, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 4.702632331864422e-07, | |
| "loss": 0.1046, | |
| "mean_token_accuracy": 0.9752000343799591, | |
| "num_tokens": 7164501001.0, | |
| "step": 50750 | |
| }, | |
| { | |
| "entropy": 1.3175451397895812, | |
| "epoch": 1.6584505892723058, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 4.6591999521095563e-07, | |
| "loss": 0.0972, | |
| "mean_token_accuracy": 0.9763833940029144, | |
| "num_tokens": 7170583305.0, | |
| "step": 50800 | |
| }, | |
| { | |
| "entropy": 1.326640043258667, | |
| "epoch": 1.6600829225294635, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 4.6159521774535153e-07, | |
| "loss": 0.1176, | |
| "mean_token_accuracy": 0.9723085272312164, | |
| "num_tokens": 7177641823.0, | |
| "step": 50850 | |
| }, | |
| { | |
| "entropy": 1.3212744474411011, | |
| "epoch": 1.6617152557866213, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 4.572889322951863e-07, | |
| "loss": 0.1152, | |
| "mean_token_accuracy": 0.972984424829483, | |
| "num_tokens": 7184801732.0, | |
| "step": 50900 | |
| }, | |
| { | |
| "entropy": 1.317273302078247, | |
| "epoch": 1.6633475890437792, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 4.530011702313006e-07, | |
| "loss": 0.1081, | |
| "mean_token_accuracy": 0.9745338428020477, | |
| "num_tokens": 7191485350.0, | |
| "step": 50950 | |
| }, | |
| { | |
| "entropy": 1.3148117685317993, | |
| "epoch": 1.664979922300937, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 4.487319627895976e-07, | |
| "loss": 0.1132, | |
| "mean_token_accuracy": 0.9734457182884216, | |
| "num_tokens": 7198363082.0, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 1.664979922300937, | |
| "eval_entropy": 1.3203301127751668, | |
| "eval_loss": 0.14241376519203186, | |
| "eval_mean_token_accuracy": 0.9676187674204508, | |
| "eval_num_tokens": 7198363082.0, | |
| "eval_runtime": 750.908, | |
| "eval_samples_per_second": 12.859, | |
| "eval_steps_per_second": 0.101, | |
| "step": 51000 | |
| }, | |
| { | |
| "entropy": 1.3135675048828126, | |
| "epoch": 1.666612255558095, | |
| "grad_norm": 0.0026092529296875, | |
| "learning_rate": 4.4448134107080895e-07, | |
| "loss": 0.1087, | |
| "mean_token_accuracy": 0.9744446206092835, | |
| "num_tokens": 7205394224.0, | |
| "step": 51050 | |
| }, | |
| { | |
| "entropy": 1.330612359046936, | |
| "epoch": 1.6682445888152526, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 4.4024933604027495e-07, | |
| "loss": 0.118, | |
| "mean_token_accuracy": 0.9718951296806335, | |
| "num_tokens": 7212621966.0, | |
| "step": 51100 | |
| }, | |
| { | |
| "entropy": 1.3243393778800965, | |
| "epoch": 1.6698769220724103, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 4.360359785277107e-07, | |
| "loss": 0.1136, | |
| "mean_token_accuracy": 0.9730891573429108, | |
| "num_tokens": 7219256110.0, | |
| "step": 51150 | |
| }, | |
| { | |
| "entropy": 1.321114592552185, | |
| "epoch": 1.671509255329568, | |
| "grad_norm": 1.6796875, | |
| "learning_rate": 4.3184129922699e-07, | |
| "loss": 0.1132, | |
| "mean_token_accuracy": 0.9732060301303863, | |
| "num_tokens": 7226320848.0, | |
| "step": 51200 | |
| }, | |
| { | |
| "entropy": 1.3259812951087953, | |
| "epoch": 1.6731415885867258, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 4.276653286959168e-07, | |
| "loss": 0.1046, | |
| "mean_token_accuracy": 0.9755737960338593, | |
| "num_tokens": 7233157988.0, | |
| "step": 51250 | |
| }, | |
| { | |
| "entropy": 1.3230745482444763, | |
| "epoch": 1.6747739218438835, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 4.2350809735600106e-07, | |
| "loss": 0.112, | |
| "mean_token_accuracy": 0.9729019176959991, | |
| "num_tokens": 7240476512.0, | |
| "step": 51300 | |
| }, | |
| { | |
| "entropy": 1.3131797289848328, | |
| "epoch": 1.6764062551010415, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 4.1936963549224396e-07, | |
| "loss": 0.1124, | |
| "mean_token_accuracy": 0.9734921300411224, | |
| "num_tokens": 7247352561.0, | |
| "step": 51350 | |
| }, | |
| { | |
| "entropy": 1.306911015510559, | |
| "epoch": 1.6780385883581992, | |
| "grad_norm": 1.25, | |
| "learning_rate": 4.1524997325290903e-07, | |
| "loss": 0.1107, | |
| "mean_token_accuracy": 0.9738042771816253, | |
| "num_tokens": 7254496106.0, | |
| "step": 51400 | |
| }, | |
| { | |
| "entropy": 1.3271653127670289, | |
| "epoch": 1.6796709216153571, | |
| "grad_norm": 0.00311279296875, | |
| "learning_rate": 4.1114914064930875e-07, | |
| "loss": 0.1095, | |
| "mean_token_accuracy": 0.973537621498108, | |
| "num_tokens": 7261315934.0, | |
| "step": 51450 | |
| }, | |
| { | |
| "entropy": 1.3219536185264587, | |
| "epoch": 1.6813032548725149, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 4.0706716755558326e-07, | |
| "loss": 0.1049, | |
| "mean_token_accuracy": 0.9747460389137268, | |
| "num_tokens": 7268210775.0, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 1.6813032548725149, | |
| "eval_entropy": 1.3198106654485067, | |
| "eval_loss": 0.14244017004966736, | |
| "eval_mean_token_accuracy": 0.9676569310824076, | |
| "eval_num_tokens": 7268210775.0, | |
| "eval_runtime": 753.6731, | |
| "eval_samples_per_second": 12.812, | |
| "eval_steps_per_second": 0.101, | |
| "step": 51500 | |
| }, | |
| { | |
| "entropy": 1.3197919082641603, | |
| "epoch": 1.6829355881296726, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 4.0300408370848365e-07, | |
| "loss": 0.1043, | |
| "mean_token_accuracy": 0.9753884637355804, | |
| "num_tokens": 7274845487.0, | |
| "step": 51550 | |
| }, | |
| { | |
| "entropy": 1.3303123378753663, | |
| "epoch": 1.6845679213868303, | |
| "grad_norm": 1.3359375, | |
| "learning_rate": 3.9895991870715264e-07, | |
| "loss": 0.1181, | |
| "mean_token_accuracy": 0.9722198081016541, | |
| "num_tokens": 7282313836.0, | |
| "step": 51600 | |
| }, | |
| { | |
| "entropy": 1.3250636410713197, | |
| "epoch": 1.686200254643988, | |
| "grad_norm": 1.3203125, | |
| "learning_rate": 3.9493470201291404e-07, | |
| "loss": 0.109, | |
| "mean_token_accuracy": 0.9739108157157897, | |
| "num_tokens": 7289309163.0, | |
| "step": 51650 | |
| }, | |
| { | |
| "entropy": 1.3373154950141908, | |
| "epoch": 1.6878325879011458, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 3.909284629490526e-07, | |
| "loss": 0.1186, | |
| "mean_token_accuracy": 0.9716354882717133, | |
| "num_tokens": 7296551436.0, | |
| "step": 51700 | |
| }, | |
| { | |
| "entropy": 1.3240708827972412, | |
| "epoch": 1.6894649211583037, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 3.8694123070060473e-07, | |
| "loss": 0.1103, | |
| "mean_token_accuracy": 0.9735696887969971, | |
| "num_tokens": 7303811515.0, | |
| "step": 51750 | |
| }, | |
| { | |
| "entropy": 1.3273040246963501, | |
| "epoch": 1.6910972544154614, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 3.8297303431414455e-07, | |
| "loss": 0.1176, | |
| "mean_token_accuracy": 0.9723230707645416, | |
| "num_tokens": 7310596062.0, | |
| "step": 51800 | |
| }, | |
| { | |
| "entropy": 1.3321073579788207, | |
| "epoch": 1.6927295876726194, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 3.7902390269756883e-07, | |
| "loss": 0.1132, | |
| "mean_token_accuracy": 0.9733636856079102, | |
| "num_tokens": 7318010738.0, | |
| "step": 51850 | |
| }, | |
| { | |
| "entropy": 1.3346837186813354, | |
| "epoch": 1.694361920929777, | |
| "grad_norm": 1.4296875, | |
| "learning_rate": 3.75093864619894e-07, | |
| "loss": 0.118, | |
| "mean_token_accuracy": 0.9715818917751312, | |
| "num_tokens": 7325178822.0, | |
| "step": 51900 | |
| }, | |
| { | |
| "entropy": 1.327819790840149, | |
| "epoch": 1.6959942541869348, | |
| "grad_norm": 1.4609375, | |
| "learning_rate": 3.7118294871103764e-07, | |
| "loss": 0.1129, | |
| "mean_token_accuracy": 0.9730360591411591, | |
| "num_tokens": 7332489545.0, | |
| "step": 51950 | |
| }, | |
| { | |
| "entropy": 1.3169277691841126, | |
| "epoch": 1.6976265874440926, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 3.672911834616175e-07, | |
| "loss": 0.1027, | |
| "mean_token_accuracy": 0.9758607912063598, | |
| "num_tokens": 7339208562.0, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 1.6976265874440926, | |
| "eval_entropy": 1.3195398267110188, | |
| "eval_loss": 0.1424235850572586, | |
| "eval_mean_token_accuracy": 0.9676573673884074, | |
| "eval_num_tokens": 7339208562.0, | |
| "eval_runtime": 751.2637, | |
| "eval_samples_per_second": 12.853, | |
| "eval_steps_per_second": 0.101, | |
| "step": 52000 | |
| }, | |
| { | |
| "entropy": 1.3143286561965943, | |
| "epoch": 1.6992589207012503, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 3.6341859722273907e-07, | |
| "loss": 0.1075, | |
| "mean_token_accuracy": 0.9739359652996064, | |
| "num_tokens": 7346374510.0, | |
| "step": 52050 | |
| }, | |
| { | |
| "entropy": 1.323881621360779, | |
| "epoch": 1.700891253958408, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 3.5956521820579126e-07, | |
| "loss": 0.0998, | |
| "mean_token_accuracy": 0.9760903561115265, | |
| "num_tokens": 7353032386.0, | |
| "step": 52100 | |
| }, | |
| { | |
| "entropy": 1.3202632975578308, | |
| "epoch": 1.702523587215566, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 3.5573107448224085e-07, | |
| "loss": 0.118, | |
| "mean_token_accuracy": 0.9715847325325012, | |
| "num_tokens": 7360252648.0, | |
| "step": 52150 | |
| }, | |
| { | |
| "entropy": 1.330101602077484, | |
| "epoch": 1.7041559204727237, | |
| "grad_norm": 0.96484375, | |
| "learning_rate": 3.519161939834264e-07, | |
| "loss": 0.1136, | |
| "mean_token_accuracy": 0.9737802124023438, | |
| "num_tokens": 7367323821.0, | |
| "step": 52200 | |
| }, | |
| { | |
| "entropy": 1.3269778847694398, | |
| "epoch": 1.7057882537298816, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 3.4812060450035723e-07, | |
| "loss": 0.1091, | |
| "mean_token_accuracy": 0.9739231073856354, | |
| "num_tokens": 7374681816.0, | |
| "step": 52250 | |
| }, | |
| { | |
| "entropy": 1.3149812078475953, | |
| "epoch": 1.7074205869870394, | |
| "grad_norm": 1.3359375, | |
| "learning_rate": 3.44344333683508e-07, | |
| "loss": 0.1049, | |
| "mean_token_accuracy": 0.9752970814704895, | |
| "num_tokens": 7381861290.0, | |
| "step": 52300 | |
| }, | |
| { | |
| "entropy": 1.31980233669281, | |
| "epoch": 1.709052920244197, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 3.4058740904262077e-07, | |
| "loss": 0.1137, | |
| "mean_token_accuracy": 0.9733495855331421, | |
| "num_tokens": 7389307478.0, | |
| "step": 52350 | |
| }, | |
| { | |
| "entropy": 1.3273230743408204, | |
| "epoch": 1.7106852535013548, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 3.3684985794650025e-07, | |
| "loss": 0.1119, | |
| "mean_token_accuracy": 0.9732711517810821, | |
| "num_tokens": 7396166622.0, | |
| "step": 52400 | |
| }, | |
| { | |
| "entropy": 1.3064412307739257, | |
| "epoch": 1.7123175867585125, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 3.3313170762281964e-07, | |
| "loss": 0.1004, | |
| "mean_token_accuracy": 0.9755911242961883, | |
| "num_tokens": 7403084742.0, | |
| "step": 52450 | |
| }, | |
| { | |
| "entropy": 1.3238463592529297, | |
| "epoch": 1.7139499200156703, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 3.294329851579181e-07, | |
| "loss": 0.101, | |
| "mean_token_accuracy": 0.9754888868331909, | |
| "num_tokens": 7409894593.0, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 1.7139499200156703, | |
| "eval_entropy": 1.3191882546742757, | |
| "eval_loss": 0.14246180653572083, | |
| "eval_mean_token_accuracy": 0.9676294851303101, | |
| "eval_num_tokens": 7409894593.0, | |
| "eval_runtime": 749.671, | |
| "eval_samples_per_second": 12.88, | |
| "eval_steps_per_second": 0.101, | |
| "step": 52500 | |
| }, | |
| { | |
| "entropy": 1.3101821112632752, | |
| "epoch": 1.7155822532728282, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 3.25753717496604e-07, | |
| "loss": 0.0956, | |
| "mean_token_accuracy": 0.9765710198879242, | |
| "num_tokens": 7416484075.0, | |
| "step": 52550 | |
| }, | |
| { | |
| "entropy": 1.3191359090805053, | |
| "epoch": 1.717214586529986, | |
| "grad_norm": 1.359375, | |
| "learning_rate": 3.220939314419614e-07, | |
| "loss": 0.106, | |
| "mean_token_accuracy": 0.9754664206504822, | |
| "num_tokens": 7423584170.0, | |
| "step": 52600 | |
| }, | |
| { | |
| "entropy": 1.3270062279701234, | |
| "epoch": 1.7188469197871439, | |
| "grad_norm": 1.359375, | |
| "learning_rate": 3.1845365365515136e-07, | |
| "loss": 0.1058, | |
| "mean_token_accuracy": 0.9742505669593811, | |
| "num_tokens": 7430705466.0, | |
| "step": 52650 | |
| }, | |
| { | |
| "entropy": 1.308781328201294, | |
| "epoch": 1.7204792530443016, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 3.14832910655221e-07, | |
| "loss": 0.101, | |
| "mean_token_accuracy": 0.9755427074432373, | |
| "num_tokens": 7437413037.0, | |
| "step": 52700 | |
| }, | |
| { | |
| "entropy": 1.3295643472671508, | |
| "epoch": 1.7221115863014593, | |
| "grad_norm": 1.3203125, | |
| "learning_rate": 3.1123172881890593e-07, | |
| "loss": 0.1149, | |
| "mean_token_accuracy": 0.9725350320339203, | |
| "num_tokens": 7445324755.0, | |
| "step": 52750 | |
| }, | |
| { | |
| "entropy": 1.306700224876404, | |
| "epoch": 1.723743919558617, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 3.076501343804432e-07, | |
| "loss": 0.1028, | |
| "mean_token_accuracy": 0.9747404766082763, | |
| "num_tokens": 7452440110.0, | |
| "step": 52800 | |
| }, | |
| { | |
| "entropy": 1.320225818157196, | |
| "epoch": 1.7253762528157748, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 3.0408815343137576e-07, | |
| "loss": 0.1097, | |
| "mean_token_accuracy": 0.9736387383937836, | |
| "num_tokens": 7459466767.0, | |
| "step": 52850 | |
| }, | |
| { | |
| "entropy": 1.319351954460144, | |
| "epoch": 1.7270085860729325, | |
| "grad_norm": 2.25, | |
| "learning_rate": 3.005458119203661e-07, | |
| "loss": 0.0996, | |
| "mean_token_accuracy": 0.9759821879863739, | |
| "num_tokens": 7466190850.0, | |
| "step": 52900 | |
| }, | |
| { | |
| "entropy": 1.3243736958503722, | |
| "epoch": 1.7286409193300905, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 2.970231356530037e-07, | |
| "loss": 0.1178, | |
| "mean_token_accuracy": 0.9724557065963745, | |
| "num_tokens": 7473713575.0, | |
| "step": 52950 | |
| }, | |
| { | |
| "entropy": 1.3095801281929016, | |
| "epoch": 1.7302732525872482, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 2.935201502916196e-07, | |
| "loss": 0.104, | |
| "mean_token_accuracy": 0.975416682958603, | |
| "num_tokens": 7480846261.0, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 1.7302732525872482, | |
| "eval_entropy": 1.3154603624343872, | |
| "eval_loss": 0.1425192952156067, | |
| "eval_mean_token_accuracy": 0.9676680334409078, | |
| "eval_num_tokens": 7480846261.0, | |
| "eval_runtime": 749.6932, | |
| "eval_samples_per_second": 12.88, | |
| "eval_steps_per_second": 0.101, | |
| "step": 53000 | |
| }, | |
| { | |
| "entropy": 1.327847077846527, | |
| "epoch": 1.7319055858444061, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 2.9003688135509996e-07, | |
| "loss": 0.1027, | |
| "mean_token_accuracy": 0.9751829147338867, | |
| "num_tokens": 7487902389.0, | |
| "step": 53050 | |
| }, | |
| { | |
| "entropy": 1.3144252371788026, | |
| "epoch": 1.7335379191015639, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 2.86573354218696e-07, | |
| "loss": 0.0943, | |
| "mean_token_accuracy": 0.9771701264381408, | |
| "num_tokens": 7494511451.0, | |
| "step": 53100 | |
| }, | |
| { | |
| "entropy": 1.317789807319641, | |
| "epoch": 1.7351702523587216, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 2.8312959411384496e-07, | |
| "loss": 0.1057, | |
| "mean_token_accuracy": 0.9748926043510437, | |
| "num_tokens": 7501653002.0, | |
| "step": 53150 | |
| }, | |
| { | |
| "entropy": 1.3131379342079164, | |
| "epoch": 1.7368025856158793, | |
| "grad_norm": 1.5234375, | |
| "learning_rate": 2.7970562612798003e-07, | |
| "loss": 0.0964, | |
| "mean_token_accuracy": 0.9761436748504638, | |
| "num_tokens": 7508205530.0, | |
| "step": 53200 | |
| }, | |
| { | |
| "entropy": 1.3117465686798095, | |
| "epoch": 1.738434918873037, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 2.7630147520435454e-07, | |
| "loss": 0.0973, | |
| "mean_token_accuracy": 0.9763562536239624, | |
| "num_tokens": 7514970855.0, | |
| "step": 53250 | |
| }, | |
| { | |
| "entropy": 1.3275910449028014, | |
| "epoch": 1.7400672521301948, | |
| "grad_norm": 1.34375, | |
| "learning_rate": 2.729171661418536e-07, | |
| "loss": 0.1058, | |
| "mean_token_accuracy": 0.9749761927127838, | |
| "num_tokens": 7522035646.0, | |
| "step": 53300 | |
| }, | |
| { | |
| "entropy": 1.3121574544906616, | |
| "epoch": 1.7416995853873527, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 2.695527235948176e-07, | |
| "loss": 0.1045, | |
| "mean_token_accuracy": 0.9752842879295349, | |
| "num_tokens": 7529319921.0, | |
| "step": 53350 | |
| }, | |
| { | |
| "entropy": 1.3087862515449524, | |
| "epoch": 1.7433319186445104, | |
| "grad_norm": 1.21875, | |
| "learning_rate": 2.662081720728621e-07, | |
| "loss": 0.0898, | |
| "mean_token_accuracy": 0.9782419979572297, | |
| "num_tokens": 7535926578.0, | |
| "step": 53400 | |
| }, | |
| { | |
| "entropy": 1.3212140440940856, | |
| "epoch": 1.7449642519016684, | |
| "grad_norm": 1.4453125, | |
| "learning_rate": 2.6288353594069716e-07, | |
| "loss": 0.0961, | |
| "mean_token_accuracy": 0.9764723992347717, | |
| "num_tokens": 7542728374.0, | |
| "step": 53450 | |
| }, | |
| { | |
| "entropy": 1.3193033647537231, | |
| "epoch": 1.746596585158826, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 2.595788394179528e-07, | |
| "loss": 0.1046, | |
| "mean_token_accuracy": 0.9745003497600555, | |
| "num_tokens": 7549497199.0, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 1.746596585158826, | |
| "eval_entropy": 1.316735652287801, | |
| "eval_loss": 0.14255692064762115, | |
| "eval_mean_token_accuracy": 0.9676073582967123, | |
| "eval_num_tokens": 7549497199.0, | |
| "eval_runtime": 749.254, | |
| "eval_samples_per_second": 12.887, | |
| "eval_steps_per_second": 0.101, | |
| "step": 53500 | |
| }, | |
| { | |
| "entropy": 1.3119576716423034, | |
| "epoch": 1.7482289184159838, | |
| "grad_norm": 1.3046875, | |
| "learning_rate": 2.562941065789989e-07, | |
| "loss": 0.1075, | |
| "mean_token_accuracy": 0.9740070915222168, | |
| "num_tokens": 7556760300.0, | |
| "step": 53550 | |
| }, | |
| { | |
| "entropy": 1.323666477203369, | |
| "epoch": 1.7498612516731415, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 2.530293613527752e-07, | |
| "loss": 0.1049, | |
| "mean_token_accuracy": 0.9741265332698822, | |
| "num_tokens": 7563873501.0, | |
| "step": 53600 | |
| }, | |
| { | |
| "entropy": 1.313588421344757, | |
| "epoch": 1.7514935849302993, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 2.497846275226101e-07, | |
| "loss": 0.0999, | |
| "mean_token_accuracy": 0.976412239074707, | |
| "num_tokens": 7570815590.0, | |
| "step": 53650 | |
| }, | |
| { | |
| "entropy": 1.310178370475769, | |
| "epoch": 1.753125918187457, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 2.4655992872605383e-07, | |
| "loss": 0.0919, | |
| "mean_token_accuracy": 0.9780379617214203, | |
| "num_tokens": 7577519323.0, | |
| "step": 53700 | |
| }, | |
| { | |
| "entropy": 1.3194669818878173, | |
| "epoch": 1.754758251444615, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 2.43355288454702e-07, | |
| "loss": 0.1098, | |
| "mean_token_accuracy": 0.973793808221817, | |
| "num_tokens": 7584868782.0, | |
| "step": 53750 | |
| }, | |
| { | |
| "entropy": 1.3247113370895385, | |
| "epoch": 1.7563905847017727, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 2.401707300540279e-07, | |
| "loss": 0.0981, | |
| "mean_token_accuracy": 0.9766256093978882, | |
| "num_tokens": 7591651694.0, | |
| "step": 53800 | |
| }, | |
| { | |
| "entropy": 1.320894329547882, | |
| "epoch": 1.7580229179589306, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 2.3700627672320707e-07, | |
| "loss": 0.0977, | |
| "mean_token_accuracy": 0.9763943207263946, | |
| "num_tokens": 7598567378.0, | |
| "step": 53850 | |
| }, | |
| { | |
| "entropy": 1.3126529788970946, | |
| "epoch": 1.7596552512160883, | |
| "grad_norm": 1.375, | |
| "learning_rate": 2.338619515149546e-07, | |
| "loss": 0.0914, | |
| "mean_token_accuracy": 0.9772222137451172, | |
| "num_tokens": 7605566065.0, | |
| "step": 53900 | |
| }, | |
| { | |
| "entropy": 1.3226768159866333, | |
| "epoch": 1.761287584473246, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 2.307377773353535e-07, | |
| "loss": 0.1152, | |
| "mean_token_accuracy": 0.9727129638195038, | |
| "num_tokens": 7613562049.0, | |
| "step": 53950 | |
| }, | |
| { | |
| "entropy": 1.3229237818717956, | |
| "epoch": 1.7629199177304038, | |
| "grad_norm": 1.3203125, | |
| "learning_rate": 2.2763377694368827e-07, | |
| "loss": 0.1008, | |
| "mean_token_accuracy": 0.975539722442627, | |
| "num_tokens": 7620524140.0, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 1.7629199177304038, | |
| "eval_entropy": 1.3142534939448038, | |
| "eval_loss": 0.14254117012023926, | |
| "eval_mean_token_accuracy": 0.9676929664611816, | |
| "eval_num_tokens": 7620524140.0, | |
| "eval_runtime": 752.4204, | |
| "eval_samples_per_second": 12.833, | |
| "eval_steps_per_second": 0.101, | |
| "step": 54000 | |
| }, | |
| { | |
| "entropy": 1.3087509632110597, | |
| "epoch": 1.7645522509875615, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 2.2454997295227985e-07, | |
| "loss": 0.101, | |
| "mean_token_accuracy": 0.9754433751106262, | |
| "num_tokens": 7627617937.0, | |
| "step": 54050 | |
| }, | |
| { | |
| "entropy": 1.3296643924713134, | |
| "epoch": 1.7661845842447192, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 2.2148638782631969e-07, | |
| "loss": 0.1022, | |
| "mean_token_accuracy": 0.9750396251678467, | |
| "num_tokens": 7634624283.0, | |
| "step": 54100 | |
| }, | |
| { | |
| "entropy": 1.3130810499191283, | |
| "epoch": 1.7678169175018772, | |
| "grad_norm": 1.2734375, | |
| "learning_rate": 2.1844304388370862e-07, | |
| "loss": 0.0997, | |
| "mean_token_accuracy": 0.9761820614337922, | |
| "num_tokens": 7642188094.0, | |
| "step": 54150 | |
| }, | |
| { | |
| "entropy": 1.3113509464263915, | |
| "epoch": 1.769449250759035, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 2.154199632948901e-07, | |
| "loss": 0.0925, | |
| "mean_token_accuracy": 0.9778699886798858, | |
| "num_tokens": 7648970433.0, | |
| "step": 54200 | |
| }, | |
| { | |
| "entropy": 1.321754252910614, | |
| "epoch": 1.7710815840161929, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 2.124171680826934e-07, | |
| "loss": 0.1044, | |
| "mean_token_accuracy": 0.974656708240509, | |
| "num_tokens": 7656027769.0, | |
| "step": 54250 | |
| }, | |
| { | |
| "entropy": 1.3192690873146058, | |
| "epoch": 1.7727139172733506, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 2.094346801221706e-07, | |
| "loss": 0.0962, | |
| "mean_token_accuracy": 0.9762521004676818, | |
| "num_tokens": 7662871105.0, | |
| "step": 54300 | |
| }, | |
| { | |
| "entropy": 1.3231295156478882, | |
| "epoch": 1.7743462505305083, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 2.0647252114043548e-07, | |
| "loss": 0.1029, | |
| "mean_token_accuracy": 0.975200617313385, | |
| "num_tokens": 7670265436.0, | |
| "step": 54350 | |
| }, | |
| { | |
| "entropy": 1.3153988409042359, | |
| "epoch": 1.775978583787666, | |
| "grad_norm": 1.21875, | |
| "learning_rate": 2.0353071271651024e-07, | |
| "loss": 0.0908, | |
| "mean_token_accuracy": 0.9769929325580597, | |
| "num_tokens": 7677021371.0, | |
| "step": 54400 | |
| }, | |
| { | |
| "entropy": 1.3150772476196289, | |
| "epoch": 1.7776109170448238, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 2.006092762811631e-07, | |
| "loss": 0.0999, | |
| "mean_token_accuracy": 0.9757753646373749, | |
| "num_tokens": 7684035466.0, | |
| "step": 54450 | |
| }, | |
| { | |
| "entropy": 1.3111911249160766, | |
| "epoch": 1.7792432503019815, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 1.9770823311675622e-07, | |
| "loss": 0.0956, | |
| "mean_token_accuracy": 0.9768190658092499, | |
| "num_tokens": 7690811924.0, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 1.7792432503019815, | |
| "eval_entropy": 1.3134044408798218, | |
| "eval_loss": 0.14260686933994293, | |
| "eval_mean_token_accuracy": 0.9676902786890665, | |
| "eval_num_tokens": 7690811924.0, | |
| "eval_runtime": 751.7328, | |
| "eval_samples_per_second": 12.845, | |
| "eval_steps_per_second": 0.101, | |
| "step": 54500 | |
| }, | |
| { | |
| "entropy": 1.3163812851905823, | |
| "epoch": 1.7808755835591394, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 1.948276043570867e-07, | |
| "loss": 0.0971, | |
| "mean_token_accuracy": 0.9768479645252228, | |
| "num_tokens": 7697685594.0, | |
| "step": 54550 | |
| }, | |
| { | |
| "entropy": 1.3316512179374695, | |
| "epoch": 1.7825079168162972, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 1.9196741098723714e-07, | |
| "loss": 0.1016, | |
| "mean_token_accuracy": 0.9754971957206726, | |
| "num_tokens": 7704672835.0, | |
| "step": 54600 | |
| }, | |
| { | |
| "entropy": 1.3212800335884094, | |
| "epoch": 1.7841402500734551, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 1.8912767384341967e-07, | |
| "loss": 0.0971, | |
| "mean_token_accuracy": 0.9769463586807251, | |
| "num_tokens": 7712059362.0, | |
| "step": 54650 | |
| }, | |
| { | |
| "entropy": 1.3257331156730652, | |
| "epoch": 1.7857725833306128, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 1.863084136128239e-07, | |
| "loss": 0.0931, | |
| "mean_token_accuracy": 0.9771769893169403, | |
| "num_tokens": 7719025676.0, | |
| "step": 54700 | |
| }, | |
| { | |
| "entropy": 1.3107067704200746, | |
| "epoch": 1.7874049165877706, | |
| "grad_norm": 1.375, | |
| "learning_rate": 1.8350965083346883e-07, | |
| "loss": 0.0811, | |
| "mean_token_accuracy": 0.9800481641292572, | |
| "num_tokens": 7725468150.0, | |
| "step": 54750 | |
| }, | |
| { | |
| "entropy": 1.3124605083465577, | |
| "epoch": 1.7890372498449283, | |
| "grad_norm": 2.0, | |
| "learning_rate": 1.807314058940498e-07, | |
| "loss": 0.0905, | |
| "mean_token_accuracy": 0.9778806746006012, | |
| "num_tokens": 7732699148.0, | |
| "step": 54800 | |
| }, | |
| { | |
| "entropy": 1.3118835282325745, | |
| "epoch": 1.790669583102086, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 1.7797369903379447e-07, | |
| "loss": 0.0885, | |
| "mean_token_accuracy": 0.9785989081859588, | |
| "num_tokens": 7739580787.0, | |
| "step": 54850 | |
| }, | |
| { | |
| "entropy": 1.3218691396713256, | |
| "epoch": 1.7923019163592437, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 1.7523655034230913e-07, | |
| "loss": 0.1019, | |
| "mean_token_accuracy": 0.9758154857158661, | |
| "num_tokens": 7746871274.0, | |
| "step": 54900 | |
| }, | |
| { | |
| "entropy": 1.3150616145133973, | |
| "epoch": 1.7939342496164017, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 1.7251997975944023e-07, | |
| "loss": 0.0885, | |
| "mean_token_accuracy": 0.9779963111877441, | |
| "num_tokens": 7754114830.0, | |
| "step": 54950 | |
| }, | |
| { | |
| "entropy": 1.3193181252479553, | |
| "epoch": 1.7955665828735594, | |
| "grad_norm": 1.6796875, | |
| "learning_rate": 1.698240070751208e-07, | |
| "loss": 0.0954, | |
| "mean_token_accuracy": 0.9768703639507293, | |
| "num_tokens": 7761534891.0, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 1.7955665828735594, | |
| "eval_entropy": 1.3136341873804729, | |
| "eval_loss": 0.14263209700584412, | |
| "eval_mean_token_accuracy": 0.9676380705833435, | |
| "eval_num_tokens": 7761534891.0, | |
| "eval_runtime": 751.8782, | |
| "eval_samples_per_second": 12.843, | |
| "eval_steps_per_second": 0.101, | |
| "step": 55000 | |
| }, | |
| { | |
| "entropy": 1.3227564001083374, | |
| "epoch": 1.7971989161307174, | |
| "grad_norm": 1.359375, | |
| "learning_rate": 1.6714865192923357e-07, | |
| "loss": 0.0931, | |
| "mean_token_accuracy": 0.9776843535900116, | |
| "num_tokens": 7768739013.0, | |
| "step": 55050 | |
| }, | |
| { | |
| "entropy": 1.3159151887893676, | |
| "epoch": 1.798831249387875, | |
| "grad_norm": 1.75, | |
| "learning_rate": 1.644939338114617e-07, | |
| "loss": 0.0891, | |
| "mean_token_accuracy": 0.9785628998279572, | |
| "num_tokens": 7775714510.0, | |
| "step": 55100 | |
| }, | |
| { | |
| "entropy": 1.3172850131988525, | |
| "epoch": 1.8004635826450328, | |
| "grad_norm": 1.5, | |
| "learning_rate": 1.618598720611517e-07, | |
| "loss": 0.0929, | |
| "mean_token_accuracy": 0.9780389821529388, | |
| "num_tokens": 7782939417.0, | |
| "step": 55150 | |
| }, | |
| { | |
| "entropy": 1.3124612760543823, | |
| "epoch": 1.8020959159021905, | |
| "grad_norm": 1.453125, | |
| "learning_rate": 1.5924648586717106e-07, | |
| "loss": 0.0903, | |
| "mean_token_accuracy": 0.9784395337104798, | |
| "num_tokens": 7790076964.0, | |
| "step": 55200 | |
| }, | |
| { | |
| "entropy": 1.3138836860656737, | |
| "epoch": 1.8037282491593483, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 1.566537942677657e-07, | |
| "loss": 0.0906, | |
| "mean_token_accuracy": 0.9779064965248108, | |
| "num_tokens": 7796643475.0, | |
| "step": 55250 | |
| }, | |
| { | |
| "entropy": 1.3136559057235717, | |
| "epoch": 1.805360582416506, | |
| "grad_norm": 0.00165557861328125, | |
| "learning_rate": 1.5408181615042594e-07, | |
| "loss": 0.0843, | |
| "mean_token_accuracy": 0.9791945159435272, | |
| "num_tokens": 7803352885.0, | |
| "step": 55300 | |
| }, | |
| { | |
| "entropy": 1.3052284288406373, | |
| "epoch": 1.806992915673664, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 1.5153057025174432e-07, | |
| "loss": 0.09, | |
| "mean_token_accuracy": 0.9779746580123901, | |
| "num_tokens": 7810594549.0, | |
| "step": 55350 | |
| }, | |
| { | |
| "entropy": 1.3102794551849366, | |
| "epoch": 1.8086252489308219, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 1.4900007515728365e-07, | |
| "loss": 0.0844, | |
| "mean_token_accuracy": 0.9790527272224426, | |
| "num_tokens": 7817507513.0, | |
| "step": 55400 | |
| }, | |
| { | |
| "entropy": 1.324124116897583, | |
| "epoch": 1.8102575821879796, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 1.4649034930143722e-07, | |
| "loss": 0.094, | |
| "mean_token_accuracy": 0.9774300479888915, | |
| "num_tokens": 7824764989.0, | |
| "step": 55450 | |
| }, | |
| { | |
| "entropy": 1.3147008061408996, | |
| "epoch": 1.8118899154451373, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 1.440014109672978e-07, | |
| "loss": 0.0914, | |
| "mean_token_accuracy": 0.9776774108409881, | |
| "num_tokens": 7831265405.0, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 1.8118899154451373, | |
| "eval_entropy": 1.3136727301279705, | |
| "eval_loss": 0.1426331102848053, | |
| "eval_mean_token_accuracy": 0.9676420497894287, | |
| "eval_num_tokens": 7831265405.0, | |
| "eval_runtime": 749.6902, | |
| "eval_samples_per_second": 12.88, | |
| "eval_steps_per_second": 0.101, | |
| "step": 55500 | |
| }, | |
| { | |
| "entropy": 1.3262977242469787, | |
| "epoch": 1.813522248702295, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 1.415332782865235e-07, | |
| "loss": 0.0996, | |
| "mean_token_accuracy": 0.976102020740509, | |
| "num_tokens": 7838532553.0, | |
| "step": 55550 | |
| }, | |
| { | |
| "entropy": 1.314659128189087, | |
| "epoch": 1.8151545819594528, | |
| "grad_norm": 0.00151824951171875, | |
| "learning_rate": 1.3908596923920348e-07, | |
| "loss": 0.0876, | |
| "mean_token_accuracy": 0.9786652231216431, | |
| "num_tokens": 7845416264.0, | |
| "step": 55600 | |
| }, | |
| { | |
| "entropy": 1.2995626902580262, | |
| "epoch": 1.8167869152166105, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 1.3665950165373177e-07, | |
| "loss": 0.078, | |
| "mean_token_accuracy": 0.9805066454410553, | |
| "num_tokens": 7852143397.0, | |
| "step": 55650 | |
| }, | |
| { | |
| "entropy": 1.3230929231643678, | |
| "epoch": 1.8184192484737685, | |
| "grad_norm": 1.21875, | |
| "learning_rate": 1.3425389320667126e-07, | |
| "loss": 0.0948, | |
| "mean_token_accuracy": 0.9774747908115387, | |
| "num_tokens": 7859183732.0, | |
| "step": 55700 | |
| }, | |
| { | |
| "entropy": 1.3118236589431762, | |
| "epoch": 1.8200515817309262, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 1.3186916142263138e-07, | |
| "loss": 0.0852, | |
| "mean_token_accuracy": 0.9793214762210846, | |
| "num_tokens": 7865800782.0, | |
| "step": 55750 | |
| }, | |
| { | |
| "entropy": 1.3139012956619263, | |
| "epoch": 1.8216839149880841, | |
| "grad_norm": 0.921875, | |
| "learning_rate": 1.295053236741346e-07, | |
| "loss": 0.0858, | |
| "mean_token_accuracy": 0.9790966403484345, | |
| "num_tokens": 7873054112.0, | |
| "step": 55800 | |
| }, | |
| { | |
| "entropy": 1.3215832591056824, | |
| "epoch": 1.8233162482452419, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 1.2716239718149404e-07, | |
| "loss": 0.0915, | |
| "mean_token_accuracy": 0.9782492446899415, | |
| "num_tokens": 7880572440.0, | |
| "step": 55850 | |
| }, | |
| { | |
| "entropy": 1.3250614070892335, | |
| "epoch": 1.8249485815023996, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 1.248403990126864e-07, | |
| "loss": 0.0836, | |
| "mean_token_accuracy": 0.9802043890953064, | |
| "num_tokens": 7887372576.0, | |
| "step": 55900 | |
| }, | |
| { | |
| "entropy": 1.3125244760513306, | |
| "epoch": 1.8265809147595573, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.2253934608322704e-07, | |
| "loss": 0.0877, | |
| "mean_token_accuracy": 0.9786297881603241, | |
| "num_tokens": 7894499299.0, | |
| "step": 55950 | |
| }, | |
| { | |
| "entropy": 1.310175290107727, | |
| "epoch": 1.828213248016715, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 1.2025925515604797e-07, | |
| "loss": 0.0863, | |
| "mean_token_accuracy": 0.9788778626918793, | |
| "num_tokens": 7901474987.0, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 1.828213248016715, | |
| "eval_entropy": 1.3131318473815918, | |
| "eval_loss": 0.14263643324375153, | |
| "eval_mean_token_accuracy": 0.9676548767089844, | |
| "eval_num_tokens": 7901474987.0, | |
| "eval_runtime": 752.1988, | |
| "eval_samples_per_second": 12.837, | |
| "eval_steps_per_second": 0.101, | |
| "step": 56000 | |
| }, | |
| { | |
| "entropy": 1.307514934539795, | |
| "epoch": 1.8298455812738728, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 1.1800014284137439e-07, | |
| "loss": 0.0798, | |
| "mean_token_accuracy": 0.9802592170238494, | |
| "num_tokens": 7908470639.0, | |
| "step": 56050 | |
| }, | |
| { | |
| "entropy": 1.3173511362075805, | |
| "epoch": 1.8314779145310307, | |
| "grad_norm": 0.287109375, | |
| "learning_rate": 1.157620255966061e-07, | |
| "loss": 0.0857, | |
| "mean_token_accuracy": 0.9793569481372834, | |
| "num_tokens": 7915474847.0, | |
| "step": 56100 | |
| }, | |
| { | |
| "entropy": 1.326115915775299, | |
| "epoch": 1.8331102477881884, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 1.1354491972619418e-07, | |
| "loss": 0.0882, | |
| "mean_token_accuracy": 0.9790770506858826, | |
| "num_tokens": 7922729639.0, | |
| "step": 56150 | |
| }, | |
| { | |
| "entropy": 1.3074156522750855, | |
| "epoch": 1.8347425810453464, | |
| "grad_norm": 1.125, | |
| "learning_rate": 1.1134884138152556e-07, | |
| "loss": 0.0747, | |
| "mean_token_accuracy": 0.9815619790554047, | |
| "num_tokens": 7929688118.0, | |
| "step": 56200 | |
| }, | |
| { | |
| "entropy": 1.3235062193870544, | |
| "epoch": 1.836374914302504, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 1.0917380656080234e-07, | |
| "loss": 0.083, | |
| "mean_token_accuracy": 0.9802336692810059, | |
| "num_tokens": 7936632242.0, | |
| "step": 56250 | |
| }, | |
| { | |
| "entropy": 1.320460605621338, | |
| "epoch": 1.8380072475596618, | |
| "grad_norm": 1.453125, | |
| "learning_rate": 1.0701983110892821e-07, | |
| "loss": 0.0871, | |
| "mean_token_accuracy": 0.9788316774368286, | |
| "num_tokens": 7944029771.0, | |
| "step": 56300 | |
| }, | |
| { | |
| "entropy": 1.3198864316940309, | |
| "epoch": 1.8396395808168196, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 1.0488693071738998e-07, | |
| "loss": 0.0914, | |
| "mean_token_accuracy": 0.9782080149650574, | |
| "num_tokens": 7951324936.0, | |
| "step": 56350 | |
| }, | |
| { | |
| "entropy": 1.3099779963493348, | |
| "epoch": 1.8412719140739773, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 1.0277512092414621e-07, | |
| "loss": 0.0808, | |
| "mean_token_accuracy": 0.9805028080940247, | |
| "num_tokens": 7958613468.0, | |
| "step": 56400 | |
| }, | |
| { | |
| "entropy": 1.3207287693023682, | |
| "epoch": 1.842904247331135, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.0068441711351239e-07, | |
| "loss": 0.0848, | |
| "mean_token_accuracy": 0.9797763514518738, | |
| "num_tokens": 7965649951.0, | |
| "step": 56450 | |
| }, | |
| { | |
| "entropy": 1.3234837770462036, | |
| "epoch": 1.844536580588293, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 9.861483451604803e-08, | |
| "loss": 0.0848, | |
| "mean_token_accuracy": 0.9798565399646759, | |
| "num_tokens": 7972824474.0, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 1.844536580588293, | |
| "eval_entropy": 1.3131659841537475, | |
| "eval_loss": 0.142597496509552, | |
| "eval_mean_token_accuracy": 0.9676955684026083, | |
| "eval_num_tokens": 7972824474.0, | |
| "eval_runtime": 754.1624, | |
| "eval_samples_per_second": 12.804, | |
| "eval_steps_per_second": 0.101, | |
| "step": 56500 | |
| }, | |
| { | |
| "entropy": 1.3255647730827331, | |
| "epoch": 1.8461689138454507, | |
| "grad_norm": 1.4609375, | |
| "learning_rate": 9.656638820844832e-08, | |
| "loss": 0.0797, | |
| "mean_token_accuracy": 0.9803315377235413, | |
| "num_tokens": 7979650165.0, | |
| "step": 56550 | |
| }, | |
| { | |
| "entropy": 1.3177052664756774, | |
| "epoch": 1.8478012471026086, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 9.453909311343168e-08, | |
| "loss": 0.082, | |
| "mean_token_accuracy": 0.9800474560260772, | |
| "num_tokens": 7986279045.0, | |
| "step": 56600 | |
| }, | |
| { | |
| "entropy": 1.3207802820205687, | |
| "epoch": 1.8494335803597663, | |
| "grad_norm": 1.4765625, | |
| "learning_rate": 9.253296399963306e-08, | |
| "loss": 0.0844, | |
| "mean_token_accuracy": 0.9793656885623931, | |
| "num_tokens": 7993389141.0, | |
| "step": 56650 | |
| }, | |
| { | |
| "entropy": 1.313080358505249, | |
| "epoch": 1.851065913616924, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 9.054801548149383e-08, | |
| "loss": 0.0835, | |
| "mean_token_accuracy": 0.9803290486335754, | |
| "num_tokens": 8000349967.0, | |
| "step": 56700 | |
| }, | |
| { | |
| "entropy": 1.316315357685089, | |
| "epoch": 1.8526982468740818, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 8.85842620191587e-08, | |
| "loss": 0.0823, | |
| "mean_token_accuracy": 0.9800036442279816, | |
| "num_tokens": 8007394269.0, | |
| "step": 56750 | |
| }, | |
| { | |
| "entropy": 1.323743233680725, | |
| "epoch": 1.8543305801312395, | |
| "grad_norm": 1.5390625, | |
| "learning_rate": 8.664171791836828e-08, | |
| "loss": 0.0783, | |
| "mean_token_accuracy": 0.9807388544082641, | |
| "num_tokens": 8014348208.0, | |
| "step": 56800 | |
| }, | |
| { | |
| "entropy": 1.3166941332817077, | |
| "epoch": 1.8559629133883973, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 8.472039733035375e-08, | |
| "loss": 0.0887, | |
| "mean_token_accuracy": 0.9791383862495422, | |
| "num_tokens": 8021975439.0, | |
| "step": 56850 | |
| }, | |
| { | |
| "entropy": 1.3115915489196777, | |
| "epoch": 1.8575952466455552, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 8.282031425173697e-08, | |
| "loss": 0.0742, | |
| "mean_token_accuracy": 0.9814891684055328, | |
| "num_tokens": 8028607289.0, | |
| "step": 56900 | |
| }, | |
| { | |
| "entropy": 1.3096832203865052, | |
| "epoch": 1.859227579902713, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 8.094148252442557e-08, | |
| "loss": 0.0806, | |
| "mean_token_accuracy": 0.9808643221855163, | |
| "num_tokens": 8035350719.0, | |
| "step": 56950 | |
| }, | |
| { | |
| "entropy": 1.312803740501404, | |
| "epoch": 1.8608599131598709, | |
| "grad_norm": 0.953125, | |
| "learning_rate": 7.908391583551399e-08, | |
| "loss": 0.08, | |
| "mean_token_accuracy": 0.9804242384433747, | |
| "num_tokens": 8042661144.0, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 1.8608599131598709, | |
| "eval_entropy": 1.3134743928909303, | |
| "eval_loss": 0.14260423183441162, | |
| "eval_mean_token_accuracy": 0.9677057147026062, | |
| "eval_num_tokens": 8042661144.0, | |
| "eval_runtime": 752.8525, | |
| "eval_samples_per_second": 12.826, | |
| "eval_steps_per_second": 0.101, | |
| "step": 57000 | |
| }, | |
| { | |
| "entropy": 1.3151041221618653, | |
| "epoch": 1.8624922464170286, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 7.724762771718264e-08, | |
| "loss": 0.0703, | |
| "mean_token_accuracy": 0.9828312218189239, | |
| "num_tokens": 8049061325.0, | |
| "step": 57050 | |
| }, | |
| { | |
| "entropy": 1.3201626539230347, | |
| "epoch": 1.8641245796741863, | |
| "grad_norm": 1.3203125, | |
| "learning_rate": 7.543263154660018e-08, | |
| "loss": 0.0794, | |
| "mean_token_accuracy": 0.9805750000476837, | |
| "num_tokens": 8055861953.0, | |
| "step": 57100 | |
| }, | |
| { | |
| "entropy": 1.3100008058547974, | |
| "epoch": 1.865756912931344, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 7.363894054582543e-08, | |
| "loss": 0.0796, | |
| "mean_token_accuracy": 0.9810529005527496, | |
| "num_tokens": 8062705627.0, | |
| "step": 57150 | |
| }, | |
| { | |
| "entropy": 1.3101088619232177, | |
| "epoch": 1.8673892461885018, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 7.186656778171064e-08, | |
| "loss": 0.0848, | |
| "mean_token_accuracy": 0.9795117557048798, | |
| "num_tokens": 8069712418.0, | |
| "step": 57200 | |
| }, | |
| { | |
| "entropy": 1.3087351298332215, | |
| "epoch": 1.8690215794456595, | |
| "grad_norm": 1.3828125, | |
| "learning_rate": 7.011552616580763e-08, | |
| "loss": 0.0784, | |
| "mean_token_accuracy": 0.9811785018444061, | |
| "num_tokens": 8076520985.0, | |
| "step": 57250 | |
| }, | |
| { | |
| "entropy": 1.314774408340454, | |
| "epoch": 1.8706539127028174, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 6.838582845427322e-08, | |
| "loss": 0.0829, | |
| "mean_token_accuracy": 0.9801759088039398, | |
| "num_tokens": 8083425224.0, | |
| "step": 57300 | |
| }, | |
| { | |
| "entropy": 1.3010089206695556, | |
| "epoch": 1.8722862459599752, | |
| "grad_norm": 1.140625, | |
| "learning_rate": 6.667748724777589e-08, | |
| "loss": 0.0837, | |
| "mean_token_accuracy": 0.9797486662864685, | |
| "num_tokens": 8090860003.0, | |
| "step": 57350 | |
| }, | |
| { | |
| "entropy": 1.3112692332267761, | |
| "epoch": 1.8739185792171331, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 6.499051499140363e-08, | |
| "loss": 0.0872, | |
| "mean_token_accuracy": 0.9786810195446014, | |
| "num_tokens": 8097627069.0, | |
| "step": 57400 | |
| }, | |
| { | |
| "entropy": 1.3242556214332581, | |
| "epoch": 1.8755509124742908, | |
| "grad_norm": 1.234375, | |
| "learning_rate": 6.332492397457457e-08, | |
| "loss": 0.1031, | |
| "mean_token_accuracy": 0.9754553985595703, | |
| "num_tokens": 8104939412.0, | |
| "step": 57450 | |
| }, | |
| { | |
| "entropy": 1.3048901081085205, | |
| "epoch": 1.8771832457314486, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 6.168072633094578e-08, | |
| "loss": 0.0942, | |
| "mean_token_accuracy": 0.9768707299232483, | |
| "num_tokens": 8111717916.0, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 1.8771832457314486, | |
| "eval_entropy": 1.3135956350962321, | |
| "eval_loss": 0.14263209700584412, | |
| "eval_mean_token_accuracy": 0.9676501870155334, | |
| "eval_num_tokens": 8111717916.0, | |
| "eval_runtime": 751.2808, | |
| "eval_samples_per_second": 12.853, | |
| "eval_steps_per_second": 0.101, | |
| "step": 57500 | |
| }, | |
| { | |
| "entropy": 1.323448977470398, | |
| "epoch": 1.8788155789886063, | |
| "grad_norm": 1.2421875, | |
| "learning_rate": 6.00579340383277e-08, | |
| "loss": 0.1047, | |
| "mean_token_accuracy": 0.9757449758052826, | |
| "num_tokens": 8118451505.0, | |
| "step": 57550 | |
| }, | |
| { | |
| "entropy": 1.3023139286041259, | |
| "epoch": 1.880447912245764, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 5.845655891859247e-08, | |
| "loss": 0.097, | |
| "mean_token_accuracy": 0.9766959726810456, | |
| "num_tokens": 8125132870.0, | |
| "step": 57600 | |
| }, | |
| { | |
| "entropy": 1.3242397212982178, | |
| "epoch": 1.8820802455029217, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 5.68766126375927e-08, | |
| "loss": 0.1246, | |
| "mean_token_accuracy": 0.9708089303970336, | |
| "num_tokens": 8131883790.0, | |
| "step": 57650 | |
| }, | |
| { | |
| "entropy": 1.3141312503814697, | |
| "epoch": 1.8837125787600797, | |
| "grad_norm": 1.53125, | |
| "learning_rate": 5.5318106705072535e-08, | |
| "loss": 0.1395, | |
| "mean_token_accuracy": 0.9681181204319, | |
| "num_tokens": 8138880080.0, | |
| "step": 57700 | |
| }, | |
| { | |
| "entropy": 1.3103543734550476, | |
| "epoch": 1.8853449120172374, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 5.378105247458609e-08, | |
| "loss": 0.1249, | |
| "mean_token_accuracy": 0.9709859442710876, | |
| "num_tokens": 8145787855.0, | |
| "step": 57750 | |
| }, | |
| { | |
| "entropy": 1.3131978607177734, | |
| "epoch": 1.8869772452743954, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 5.226546114341413e-08, | |
| "loss": 0.1242, | |
| "mean_token_accuracy": 0.971501134634018, | |
| "num_tokens": 8152787087.0, | |
| "step": 57800 | |
| }, | |
| { | |
| "entropy": 1.3190133213996886, | |
| "epoch": 1.888609578531553, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 5.077134375248183e-08, | |
| "loss": 0.1295, | |
| "mean_token_accuracy": 0.9703145730495453, | |
| "num_tokens": 8159391177.0, | |
| "step": 57850 | |
| }, | |
| { | |
| "entropy": 1.306349711418152, | |
| "epoch": 1.8902419117887108, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 4.9298711186279824e-08, | |
| "loss": 0.1169, | |
| "mean_token_accuracy": 0.9731926989555358, | |
| "num_tokens": 8166052317.0, | |
| "step": 57900 | |
| }, | |
| { | |
| "entropy": 1.312576003074646, | |
| "epoch": 1.8918742450458685, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 4.784757417278296e-08, | |
| "loss": 0.124, | |
| "mean_token_accuracy": 0.9711933457851409, | |
| "num_tokens": 8173047293.0, | |
| "step": 57950 | |
| }, | |
| { | |
| "entropy": 1.316941294670105, | |
| "epoch": 1.8935065783030263, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 4.641794328337434e-08, | |
| "loss": 0.1289, | |
| "mean_token_accuracy": 0.9705963468551636, | |
| "num_tokens": 8180073089.0, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 1.8935065783030263, | |
| "eval_entropy": 1.3130744123458862, | |
| "eval_loss": 0.142622709274292, | |
| "eval_mean_token_accuracy": 0.9676458064715068, | |
| "eval_num_tokens": 8180073089.0, | |
| "eval_runtime": 754.6584, | |
| "eval_samples_per_second": 12.795, | |
| "eval_steps_per_second": 0.101, | |
| "step": 58000 | |
| }, | |
| { | |
| "entropy": 1.3161793375015258, | |
| "epoch": 1.895138911560184, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 4.5009828932766395e-08, | |
| "loss": 0.1247, | |
| "mean_token_accuracy": 0.9718296456336976, | |
| "num_tokens": 8187129264.0, | |
| "step": 58050 | |
| }, | |
| { | |
| "entropy": 1.3136934351921081, | |
| "epoch": 1.896771244817342, | |
| "grad_norm": 1.53125, | |
| "learning_rate": 4.362324137892626e-08, | |
| "loss": 0.1221, | |
| "mean_token_accuracy": 0.971994297504425, | |
| "num_tokens": 8194219348.0, | |
| "step": 58100 | |
| }, | |
| { | |
| "entropy": 1.3273291397094726, | |
| "epoch": 1.8984035780744997, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 4.225819072300019e-08, | |
| "loss": 0.1321, | |
| "mean_token_accuracy": 0.9701541066169739, | |
| "num_tokens": 8201245704.0, | |
| "step": 58150 | |
| }, | |
| { | |
| "entropy": 1.3125488138198853, | |
| "epoch": 1.9000359113316576, | |
| "grad_norm": 1.3984375, | |
| "learning_rate": 4.091468690924061e-08, | |
| "loss": 0.1336, | |
| "mean_token_accuracy": 0.9690061569213867, | |
| "num_tokens": 8208779023.0, | |
| "step": 58200 | |
| }, | |
| { | |
| "entropy": 1.3206189322471618, | |
| "epoch": 1.9016682445888153, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 3.9592739724933494e-08, | |
| "loss": 0.1326, | |
| "mean_token_accuracy": 0.9700911176204682, | |
| "num_tokens": 8215688508.0, | |
| "step": 58250 | |
| }, | |
| { | |
| "entropy": 1.3118561697006226, | |
| "epoch": 1.903300577845973, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 3.8292358800326774e-08, | |
| "loss": 0.1341, | |
| "mean_token_accuracy": 0.9699221074581146, | |
| "num_tokens": 8222957220.0, | |
| "step": 58300 | |
| }, | |
| { | |
| "entropy": 1.3096631002426147, | |
| "epoch": 1.9049329111031308, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 3.70135536085604e-08, | |
| "loss": 0.1289, | |
| "mean_token_accuracy": 0.970008145570755, | |
| "num_tokens": 8230419197.0, | |
| "step": 58350 | |
| }, | |
| { | |
| "entropy": 1.3086332321166991, | |
| "epoch": 1.9065652443602885, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 3.57563334655977e-08, | |
| "loss": 0.1196, | |
| "mean_token_accuracy": 0.9730131483078003, | |
| "num_tokens": 8237250702.0, | |
| "step": 58400 | |
| }, | |
| { | |
| "entropy": 1.31609708070755, | |
| "epoch": 1.9081975776174462, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 3.4520707530157125e-08, | |
| "loss": 0.1276, | |
| "mean_token_accuracy": 0.9696814298629761, | |
| "num_tokens": 8244072641.0, | |
| "step": 58450 | |
| }, | |
| { | |
| "entropy": 1.3143381929397584, | |
| "epoch": 1.9098299108746042, | |
| "grad_norm": 1.875, | |
| "learning_rate": 3.330668480364496e-08, | |
| "loss": 0.1288, | |
| "mean_token_accuracy": 0.9699802708625793, | |
| "num_tokens": 8251417973.0, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 1.9098299108746042, | |
| "eval_entropy": 1.313671735127767, | |
| "eval_loss": 0.14263801276683807, | |
| "eval_mean_token_accuracy": 0.9676080171267192, | |
| "eval_num_tokens": 8251417973.0, | |
| "eval_runtime": 754.9973, | |
| "eval_samples_per_second": 12.789, | |
| "eval_steps_per_second": 0.101, | |
| "step": 58500 | |
| }, | |
| { | |
| "entropy": 1.3198811602592468, | |
| "epoch": 1.911462244131762, | |
| "grad_norm": 1.140625, | |
| "learning_rate": 3.2114274130091383e-08, | |
| "loss": 0.124, | |
| "mean_token_accuracy": 0.9716233015060425, | |
| "num_tokens": 8258494800.0, | |
| "step": 58550 | |
| }, | |
| { | |
| "entropy": 1.3157025051116944, | |
| "epoch": 1.9130945773889199, | |
| "grad_norm": 1.4765625, | |
| "learning_rate": 3.0943484196083836e-08, | |
| "loss": 0.1216, | |
| "mean_token_accuracy": 0.9713974285125733, | |
| "num_tokens": 8265439804.0, | |
| "step": 58600 | |
| }, | |
| { | |
| "entropy": 1.313891351222992, | |
| "epoch": 1.9147269106460776, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 2.979432353070577e-08, | |
| "loss": 0.1213, | |
| "mean_token_accuracy": 0.9722525453567505, | |
| "num_tokens": 8272196336.0, | |
| "step": 58650 | |
| }, | |
| { | |
| "entropy": 1.3181121969223022, | |
| "epoch": 1.9163592439032353, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 2.8666800505473655e-08, | |
| "loss": 0.1292, | |
| "mean_token_accuracy": 0.9704732525348664, | |
| "num_tokens": 8279415276.0, | |
| "step": 58700 | |
| }, | |
| { | |
| "entropy": 1.312175862789154, | |
| "epoch": 1.917991577160393, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 2.75609233342754e-08, | |
| "loss": 0.124, | |
| "mean_token_accuracy": 0.9711616253852844, | |
| "num_tokens": 8286266573.0, | |
| "step": 58750 | |
| }, | |
| { | |
| "entropy": 1.3171151232719422, | |
| "epoch": 1.9196239104175508, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 2.6476700073311376e-08, | |
| "loss": 0.1278, | |
| "mean_token_accuracy": 0.9706109237670898, | |
| "num_tokens": 8293368650.0, | |
| "step": 58800 | |
| }, | |
| { | |
| "entropy": 1.3135269594192505, | |
| "epoch": 1.9212562436747085, | |
| "grad_norm": 1.46875, | |
| "learning_rate": 2.5414138621035477e-08, | |
| "loss": 0.1273, | |
| "mean_token_accuracy": 0.9706173431873322, | |
| "num_tokens": 8300206080.0, | |
| "step": 58850 | |
| }, | |
| { | |
| "entropy": 1.3087952733039856, | |
| "epoch": 1.9228885769318664, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 2.437324671809782e-08, | |
| "loss": 0.1256, | |
| "mean_token_accuracy": 0.9707586574554443, | |
| "num_tokens": 8307284608.0, | |
| "step": 58900 | |
| }, | |
| { | |
| "entropy": 1.310805425643921, | |
| "epoch": 1.9245209101890242, | |
| "grad_norm": 1.5, | |
| "learning_rate": 2.3354031947288136e-08, | |
| "loss": 0.1192, | |
| "mean_token_accuracy": 0.9720681381225585, | |
| "num_tokens": 8314051370.0, | |
| "step": 58950 | |
| }, | |
| { | |
| "entropy": 1.315222783088684, | |
| "epoch": 1.926153243446182, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 2.2356501733479806e-08, | |
| "loss": 0.1181, | |
| "mean_token_accuracy": 0.9725264024734497, | |
| "num_tokens": 8320964631.0, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 1.926153243446182, | |
| "eval_entropy": 1.313348093032837, | |
| "eval_loss": 0.14262661337852478, | |
| "eval_mean_token_accuracy": 0.9675622606277465, | |
| "eval_num_tokens": 8320964631.0, | |
| "eval_runtime": 750.7844, | |
| "eval_samples_per_second": 12.861, | |
| "eval_steps_per_second": 0.101, | |
| "step": 59000 | |
| }, | |
| { | |
| "entropy": 1.3111545324325562, | |
| "epoch": 1.9277855767033398, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 2.1380663343577246e-08, | |
| "loss": 0.1199, | |
| "mean_token_accuracy": 0.9722956836223602, | |
| "num_tokens": 8327540250.0, | |
| "step": 59050 | |
| }, | |
| { | |
| "entropy": 1.3120374631881715, | |
| "epoch": 1.9294179099604976, | |
| "grad_norm": 1.390625, | |
| "learning_rate": 2.04265238864616e-08, | |
| "loss": 0.1307, | |
| "mean_token_accuracy": 0.9701423704624176, | |
| "num_tokens": 8334721588.0, | |
| "step": 59100 | |
| }, | |
| { | |
| "entropy": 1.3150414967536925, | |
| "epoch": 1.9310502432176553, | |
| "grad_norm": 1.2421875, | |
| "learning_rate": 1.949409031294014e-08, | |
| "loss": 0.1274, | |
| "mean_token_accuracy": 0.9704896664619446, | |
| "num_tokens": 8341662054.0, | |
| "step": 59150 | |
| }, | |
| { | |
| "entropy": 1.3131565976142883, | |
| "epoch": 1.932682576474813, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 1.8583369415694608e-08, | |
| "loss": 0.1277, | |
| "mean_token_accuracy": 0.9700733041763305, | |
| "num_tokens": 8349277940.0, | |
| "step": 59200 | |
| }, | |
| { | |
| "entropy": 1.3129038047790527, | |
| "epoch": 1.9343149097319707, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.769436782923195e-08, | |
| "loss": 0.137, | |
| "mean_token_accuracy": 0.9688089847564697, | |
| "num_tokens": 8356789493.0, | |
| "step": 59250 | |
| }, | |
| { | |
| "entropy": 1.3096733212471008, | |
| "epoch": 1.9359472429891287, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 1.6827092029836678e-08, | |
| "loss": 0.1235, | |
| "mean_token_accuracy": 0.9711651515960693, | |
| "num_tokens": 8363715543.0, | |
| "step": 59300 | |
| }, | |
| { | |
| "entropy": 1.3143090605735779, | |
| "epoch": 1.9375795762462864, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 1.59815483355229e-08, | |
| "loss": 0.1225, | |
| "mean_token_accuracy": 0.9719608640670776, | |
| "num_tokens": 8370411365.0, | |
| "step": 59350 | |
| }, | |
| { | |
| "entropy": 1.3088870167732238, | |
| "epoch": 1.9392119095034444, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 1.5157742905989037e-08, | |
| "loss": 0.1246, | |
| "mean_token_accuracy": 0.9717032659053803, | |
| "num_tokens": 8377216822.0, | |
| "step": 59400 | |
| }, | |
| { | |
| "entropy": 1.318067877292633, | |
| "epoch": 1.940844242760602, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 1.4355681742571847e-08, | |
| "loss": 0.1247, | |
| "mean_token_accuracy": 0.9717764687538147, | |
| "num_tokens": 8384607784.0, | |
| "step": 59450 | |
| }, | |
| { | |
| "entropy": 1.3116914916038513, | |
| "epoch": 1.9424765760177598, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 1.357537068820347e-08, | |
| "loss": 0.1276, | |
| "mean_token_accuracy": 0.970702486038208, | |
| "num_tokens": 8391872892.0, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 1.9424765760177598, | |
| "eval_entropy": 1.313501017888387, | |
| "eval_loss": 0.1426248997449875, | |
| "eval_mean_token_accuracy": 0.9675657065709432, | |
| "eval_num_tokens": 8391872892.0, | |
| "eval_runtime": 746.0898, | |
| "eval_samples_per_second": 12.942, | |
| "eval_steps_per_second": 0.102, | |
| "step": 59500 | |
| }, | |
| { | |
| "entropy": 1.310592589378357, | |
| "epoch": 1.9441089092749175, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.2816815427369455e-08, | |
| "loss": 0.1318, | |
| "mean_token_accuracy": 0.969595000743866, | |
| "num_tokens": 8399268788.0, | |
| "step": 59550 | |
| }, | |
| { | |
| "entropy": 1.3104493141174316, | |
| "epoch": 1.9457412425320753, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 1.208002148606613e-08, | |
| "loss": 0.114, | |
| "mean_token_accuracy": 0.9738408529758453, | |
| "num_tokens": 8405775548.0, | |
| "step": 59600 | |
| }, | |
| { | |
| "entropy": 1.3134746599197387, | |
| "epoch": 1.947373575789233, | |
| "grad_norm": 1.2734375, | |
| "learning_rate": 1.1364994231760295e-08, | |
| "loss": 0.129, | |
| "mean_token_accuracy": 0.9705848026275635, | |
| "num_tokens": 8413019964.0, | |
| "step": 59650 | |
| }, | |
| { | |
| "entropy": 1.3184413361549376, | |
| "epoch": 1.949005909046391, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 1.0671738873351932e-08, | |
| "loss": 0.1169, | |
| "mean_token_accuracy": 0.9727689456939698, | |
| "num_tokens": 8420004432.0, | |
| "step": 59700 | |
| }, | |
| { | |
| "entropy": 1.317722589969635, | |
| "epoch": 1.9506382423035487, | |
| "grad_norm": 1.390625, | |
| "learning_rate": 1.0000260461134225e-08, | |
| "loss": 0.1233, | |
| "mean_token_accuracy": 0.9717906093597413, | |
| "num_tokens": 8426576444.0, | |
| "step": 59750 | |
| }, | |
| { | |
| "entropy": 1.315042221546173, | |
| "epoch": 1.9522705755607066, | |
| "grad_norm": 2.625, | |
| "learning_rate": 9.35056388675759e-09, | |
| "loss": 0.1258, | |
| "mean_token_accuracy": 0.9710904705524445, | |
| "num_tokens": 8433326272.0, | |
| "step": 59800 | |
| }, | |
| { | |
| "entropy": 1.3289856863021852, | |
| "epoch": 1.9539029088178643, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 8.722653883194375e-09, | |
| "loss": 0.1389, | |
| "mean_token_accuracy": 0.9682004976272583, | |
| "num_tokens": 8440652691.0, | |
| "step": 59850 | |
| }, | |
| { | |
| "entropy": 1.3113100409507752, | |
| "epoch": 1.955535242075022, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 8.116535024703554e-09, | |
| "loss": 0.1263, | |
| "mean_token_accuracy": 0.9709288036823273, | |
| "num_tokens": 8447487171.0, | |
| "step": 59900 | |
| }, | |
| { | |
| "entropy": 1.3235667037963867, | |
| "epoch": 1.9571675753321798, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 7.53221172679841e-09, | |
| "loss": 0.1217, | |
| "mean_token_accuracy": 0.972466766834259, | |
| "num_tokens": 8454238666.0, | |
| "step": 59950 | |
| }, | |
| { | |
| "entropy": 1.3157719588279724, | |
| "epoch": 1.9587999085893375, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 6.969688246213246e-09, | |
| "loss": 0.1267, | |
| "mean_token_accuracy": 0.971341325044632, | |
| "num_tokens": 8461254971.0, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 1.9587999085893375, | |
| "eval_entropy": 1.3126351674397787, | |
| "eval_loss": 0.14264898002147675, | |
| "eval_mean_token_accuracy": 0.9676442178090413, | |
| "eval_num_tokens": 8461254971.0, | |
| "eval_runtime": 749.6809, | |
| "eval_samples_per_second": 12.88, | |
| "eval_steps_per_second": 0.101, | |
| "step": 60000 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 61262, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 2000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.0297347722946334e+20, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |