Instructions to use roonbug/q1umaz8e with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use roonbug/q1umaz8e with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="roonbug/q1umaz8e")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("roonbug/q1umaz8e")
model = AutoModelForCausalLM.from_pretrained("roonbug/q1umaz8e")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use roonbug/q1umaz8e with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "roonbug/q1umaz8e"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "roonbug/q1umaz8e",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker
docker model run hf.co/roonbug/q1umaz8e
- SGLang
How to use roonbug/q1umaz8e with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "roonbug/q1umaz8e" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "roonbug/q1umaz8e",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "roonbug/q1umaz8e" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "roonbug/q1umaz8e",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
- Docker Model Runner
How to use roonbug/q1umaz8e with Docker Model Runner:
docker model run hf.co/roonbug/q1umaz8e
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.2, | |
| "eval_steps": 100, | |
| "global_step": 2000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.813749196380377, | |
| "epoch": 0.016, | |
| "grad_norm": 21.625, | |
| "learning_rate": 1.8e-07, | |
| "loss": 2.3527, | |
| "mean_token_accuracy": 0.5754138542339206, | |
| "num_tokens": 195510.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 1.7928078502416611, | |
| "epoch": 0.032, | |
| "grad_norm": 16.75, | |
| "learning_rate": 3.8e-07, | |
| "loss": 2.3363, | |
| "mean_token_accuracy": 0.5766932301223278, | |
| "num_tokens": 391008.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 1.8388511516153812, | |
| "epoch": 0.048, | |
| "grad_norm": 17.25, | |
| "learning_rate": 5.800000000000001e-07, | |
| "loss": 2.3414, | |
| "mean_token_accuracy": 0.5755490336567164, | |
| "num_tokens": 590136.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 1.853544193506241, | |
| "epoch": 0.064, | |
| "grad_norm": 12.5, | |
| "learning_rate": 7.8e-07, | |
| "loss": 2.2941, | |
| "mean_token_accuracy": 0.577743387594819, | |
| "num_tokens": 791432.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 1.9382900901138782, | |
| "epoch": 0.08, | |
| "grad_norm": 12.3125, | |
| "learning_rate": 9.800000000000001e-07, | |
| "loss": 2.2698, | |
| "mean_token_accuracy": 0.5786384884268045, | |
| "num_tokens": 990342.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 2.012592934817076, | |
| "epoch": 0.096, | |
| "grad_norm": 7.59375, | |
| "learning_rate": 1.1800000000000001e-06, | |
| "loss": 2.2841, | |
| "mean_token_accuracy": 0.5789956836029887, | |
| "num_tokens": 1182339.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 2.020173179358244, | |
| "epoch": 0.112, | |
| "grad_norm": 5.59375, | |
| "learning_rate": 1.3800000000000001e-06, | |
| "loss": 2.1881, | |
| "mean_token_accuracy": 0.5869531789794564, | |
| "num_tokens": 1386193.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 2.042734081298113, | |
| "epoch": 0.128, | |
| "grad_norm": 5.75, | |
| "learning_rate": 1.5800000000000001e-06, | |
| "loss": 2.1864, | |
| "mean_token_accuracy": 0.5859880622476339, | |
| "num_tokens": 1583261.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 1.9902476705610752, | |
| "epoch": 0.144, | |
| "grad_norm": 5.6875, | |
| "learning_rate": 1.7800000000000001e-06, | |
| "loss": 2.1581, | |
| "mean_token_accuracy": 0.5905994104221464, | |
| "num_tokens": 1774826.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 2.038115797191858, | |
| "epoch": 0.16, | |
| "grad_norm": 5.9375, | |
| "learning_rate": 1.98e-06, | |
| "loss": 2.1863, | |
| "mean_token_accuracy": 0.5856570154428482, | |
| "num_tokens": 1971310.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_biology_entropy": 1.9723965148925782, | |
| "eval_biology_loss": 2.1413300037384033, | |
| "eval_biology_mean_token_accuracy": 0.6000847628116608, | |
| "eval_biology_num_tokens": 1971310.0, | |
| "eval_biology_runtime": 38.6804, | |
| "eval_biology_samples_per_second": 12.926, | |
| "eval_biology_steps_per_second": 3.232, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_physics_entropy": 0.6801536810398102, | |
| "eval_physics_loss": 0.7345870733261108, | |
| "eval_physics_mean_token_accuracy": 0.8230011963844299, | |
| "eval_physics_num_tokens": 1971310.0, | |
| "eval_physics_runtime": 56.9468, | |
| "eval_physics_samples_per_second": 8.78, | |
| "eval_physics_steps_per_second": 2.195, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 1.9897258900105954, | |
| "epoch": 0.176, | |
| "grad_norm": 4.90625, | |
| "learning_rate": 2.1800000000000003e-06, | |
| "loss": 2.1203, | |
| "mean_token_accuracy": 0.5903370209038258, | |
| "num_tokens": 2169453.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 1.9791964955627919, | |
| "epoch": 0.192, | |
| "grad_norm": 5.8125, | |
| "learning_rate": 2.38e-06, | |
| "loss": 2.1168, | |
| "mean_token_accuracy": 0.5923390585929156, | |
| "num_tokens": 2366844.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 1.9716377504169942, | |
| "epoch": 0.208, | |
| "grad_norm": 5.1875, | |
| "learning_rate": 2.5800000000000003e-06, | |
| "loss": 2.1059, | |
| "mean_token_accuracy": 0.5896596999838948, | |
| "num_tokens": 2559708.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 1.9294324100017548, | |
| "epoch": 0.224, | |
| "grad_norm": 5.53125, | |
| "learning_rate": 2.7800000000000005e-06, | |
| "loss": 2.0638, | |
| "mean_token_accuracy": 0.6001523662358522, | |
| "num_tokens": 2756533.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 1.9123736470937729, | |
| "epoch": 0.24, | |
| "grad_norm": 5.0625, | |
| "learning_rate": 2.9800000000000003e-06, | |
| "loss": 2.0241, | |
| "mean_token_accuracy": 0.6036131344735622, | |
| "num_tokens": 2948600.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 1.9347217477858067, | |
| "epoch": 0.256, | |
| "grad_norm": 5.15625, | |
| "learning_rate": 3.1800000000000005e-06, | |
| "loss": 2.0531, | |
| "mean_token_accuracy": 0.5974894173443317, | |
| "num_tokens": 3141473.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 1.9242084234952928, | |
| "epoch": 0.272, | |
| "grad_norm": 5.09375, | |
| "learning_rate": 3.3800000000000007e-06, | |
| "loss": 2.0261, | |
| "mean_token_accuracy": 0.5972955338656902, | |
| "num_tokens": 3337285.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 1.8868879064917565, | |
| "epoch": 0.288, | |
| "grad_norm": 5.375, | |
| "learning_rate": 3.58e-06, | |
| "loss": 1.9858, | |
| "mean_token_accuracy": 0.6028371442109346, | |
| "num_tokens": 3541221.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 1.8624922472983598, | |
| "epoch": 0.304, | |
| "grad_norm": 4.71875, | |
| "learning_rate": 3.7800000000000002e-06, | |
| "loss": 1.943, | |
| "mean_token_accuracy": 0.6070948902517557, | |
| "num_tokens": 3734952.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 1.881417414546013, | |
| "epoch": 0.32, | |
| "grad_norm": 5.21875, | |
| "learning_rate": 3.980000000000001e-06, | |
| "loss": 1.9609, | |
| "mean_token_accuracy": 0.603725228458643, | |
| "num_tokens": 3922209.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_biology_entropy": 1.8500013475418091, | |
| "eval_biology_loss": 1.9382704496383667, | |
| "eval_biology_mean_token_accuracy": 0.6156538977622986, | |
| "eval_biology_num_tokens": 3922209.0, | |
| "eval_biology_runtime": 38.6714, | |
| "eval_biology_samples_per_second": 12.929, | |
| "eval_biology_steps_per_second": 3.232, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_physics_entropy": 0.6814348955154419, | |
| "eval_physics_loss": 0.7077023386955261, | |
| "eval_physics_mean_token_accuracy": 0.8221852016448975, | |
| "eval_physics_num_tokens": 3922209.0, | |
| "eval_physics_runtime": 56.9406, | |
| "eval_physics_samples_per_second": 8.781, | |
| "eval_physics_steps_per_second": 2.195, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 1.8236718311905862, | |
| "epoch": 0.336, | |
| "grad_norm": 5.125, | |
| "learning_rate": 4.18e-06, | |
| "loss": 1.9029, | |
| "mean_token_accuracy": 0.6111209694296121, | |
| "num_tokens": 4115941.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 1.851329033076763, | |
| "epoch": 0.352, | |
| "grad_norm": 5.0, | |
| "learning_rate": 4.38e-06, | |
| "loss": 1.8904, | |
| "mean_token_accuracy": 0.6098318418487907, | |
| "num_tokens": 4308755.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 1.8281462915241717, | |
| "epoch": 0.368, | |
| "grad_norm": 4.65625, | |
| "learning_rate": 4.58e-06, | |
| "loss": 1.8692, | |
| "mean_token_accuracy": 0.6131063066422939, | |
| "num_tokens": 4505910.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 1.831401702016592, | |
| "epoch": 0.384, | |
| "grad_norm": 4.96875, | |
| "learning_rate": 4.78e-06, | |
| "loss": 1.8357, | |
| "mean_token_accuracy": 0.6127742934972048, | |
| "num_tokens": 4696101.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 1.816836379468441, | |
| "epoch": 0.4, | |
| "grad_norm": 4.75, | |
| "learning_rate": 4.980000000000001e-06, | |
| "loss": 1.8158, | |
| "mean_token_accuracy": 0.614238902926445, | |
| "num_tokens": 4889445.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 1.8351925887167453, | |
| "epoch": 0.416, | |
| "grad_norm": 4.5625, | |
| "learning_rate": 5.18e-06, | |
| "loss": 1.8225, | |
| "mean_token_accuracy": 0.6112797066569329, | |
| "num_tokens": 5087624.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 1.8272491071373225, | |
| "epoch": 0.432, | |
| "grad_norm": 5.5, | |
| "learning_rate": 5.380000000000001e-06, | |
| "loss": 1.8098, | |
| "mean_token_accuracy": 0.6129346057772637, | |
| "num_tokens": 5273794.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 1.8296721816062926, | |
| "epoch": 0.448, | |
| "grad_norm": 5.46875, | |
| "learning_rate": 5.580000000000001e-06, | |
| "loss": 1.7925, | |
| "mean_token_accuracy": 0.6138202898204327, | |
| "num_tokens": 5463288.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 1.7533989433199166, | |
| "epoch": 0.464, | |
| "grad_norm": 5.09375, | |
| "learning_rate": 5.78e-06, | |
| "loss": 1.7277, | |
| "mean_token_accuracy": 0.625254824757576, | |
| "num_tokens": 5656827.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 1.7856548748910428, | |
| "epoch": 0.48, | |
| "grad_norm": 4.75, | |
| "learning_rate": 5.98e-06, | |
| "loss": 1.7492, | |
| "mean_token_accuracy": 0.6199217233806849, | |
| "num_tokens": 5853311.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_biology_entropy": 1.766937505722046, | |
| "eval_biology_loss": 1.737308144569397, | |
| "eval_biology_mean_token_accuracy": 0.6296310710906983, | |
| "eval_biology_num_tokens": 5853311.0, | |
| "eval_biology_runtime": 38.676, | |
| "eval_biology_samples_per_second": 12.928, | |
| "eval_biology_steps_per_second": 3.232, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_physics_entropy": 0.699759515285492, | |
| "eval_physics_loss": 0.6600434184074402, | |
| "eval_physics_mean_token_accuracy": 0.8216852688789368, | |
| "eval_physics_num_tokens": 5853311.0, | |
| "eval_physics_runtime": 56.9184, | |
| "eval_physics_samples_per_second": 8.785, | |
| "eval_physics_steps_per_second": 2.196, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 1.6893749851733446, | |
| "epoch": 0.496, | |
| "grad_norm": 5.125, | |
| "learning_rate": 6.18e-06, | |
| "loss": 1.6529, | |
| "mean_token_accuracy": 0.6324049789458513, | |
| "num_tokens": 6049788.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 1.777088150382042, | |
| "epoch": 0.512, | |
| "grad_norm": 5.0625, | |
| "learning_rate": 6.380000000000001e-06, | |
| "loss": 1.7373, | |
| "mean_token_accuracy": 0.6215202666819095, | |
| "num_tokens": 6243825.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 1.7654731079936028, | |
| "epoch": 0.528, | |
| "grad_norm": 4.75, | |
| "learning_rate": 6.5800000000000005e-06, | |
| "loss": 1.7104, | |
| "mean_token_accuracy": 0.6297121163457632, | |
| "num_tokens": 6434188.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 1.7458826407790184, | |
| "epoch": 0.544, | |
| "grad_norm": 4.59375, | |
| "learning_rate": 6.780000000000001e-06, | |
| "loss": 1.7134, | |
| "mean_token_accuracy": 0.6295231115072966, | |
| "num_tokens": 6629832.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 1.7250068325549364, | |
| "epoch": 0.56, | |
| "grad_norm": 4.375, | |
| "learning_rate": 6.98e-06, | |
| "loss": 1.6747, | |
| "mean_token_accuracy": 0.6324008051306009, | |
| "num_tokens": 6824668.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 1.6957500472664833, | |
| "epoch": 0.576, | |
| "grad_norm": 4.8125, | |
| "learning_rate": 7.180000000000001e-06, | |
| "loss": 1.6528, | |
| "mean_token_accuracy": 0.6356299221515656, | |
| "num_tokens": 7025923.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 1.7250152129679919, | |
| "epoch": 0.592, | |
| "grad_norm": 4.375, | |
| "learning_rate": 7.3800000000000005e-06, | |
| "loss": 1.6596, | |
| "mean_token_accuracy": 0.63132487796247, | |
| "num_tokens": 7218224.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 1.7018816463649273, | |
| "epoch": 0.608, | |
| "grad_norm": 5.1875, | |
| "learning_rate": 7.58e-06, | |
| "loss": 1.664, | |
| "mean_token_accuracy": 0.6356819782406091, | |
| "num_tokens": 7421216.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 1.7107378408312797, | |
| "epoch": 0.624, | |
| "grad_norm": 4.78125, | |
| "learning_rate": 7.78e-06, | |
| "loss": 1.6546, | |
| "mean_token_accuracy": 0.6328394934535027, | |
| "num_tokens": 7617258.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 1.7047107808291913, | |
| "epoch": 0.64, | |
| "grad_norm": 4.65625, | |
| "learning_rate": 7.980000000000002e-06, | |
| "loss": 1.636, | |
| "mean_token_accuracy": 0.6354850973933935, | |
| "num_tokens": 7806173.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_biology_entropy": 1.6343248481750487, | |
| "eval_biology_loss": 1.6292321681976318, | |
| "eval_biology_mean_token_accuracy": 0.6456177935600281, | |
| "eval_biology_num_tokens": 7806173.0, | |
| "eval_biology_runtime": 38.6524, | |
| "eval_biology_samples_per_second": 12.936, | |
| "eval_biology_steps_per_second": 3.234, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_physics_entropy": 0.6857430481910706, | |
| "eval_physics_loss": 0.6523167490959167, | |
| "eval_physics_mean_token_accuracy": 0.8234750027656556, | |
| "eval_physics_num_tokens": 7806173.0, | |
| "eval_physics_runtime": 56.9086, | |
| "eval_physics_samples_per_second": 8.786, | |
| "eval_physics_steps_per_second": 2.197, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 1.6953211184591055, | |
| "epoch": 0.656, | |
| "grad_norm": 4.8125, | |
| "learning_rate": 8.18e-06, | |
| "loss": 1.6492, | |
| "mean_token_accuracy": 0.635795733332634, | |
| "num_tokens": 8000514.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 1.7117650181055069, | |
| "epoch": 0.672, | |
| "grad_norm": 4.90625, | |
| "learning_rate": 8.380000000000001e-06, | |
| "loss": 1.651, | |
| "mean_token_accuracy": 0.6328964576125145, | |
| "num_tokens": 8187884.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 1.6630911942571402, | |
| "epoch": 0.688, | |
| "grad_norm": 5.0, | |
| "learning_rate": 8.580000000000001e-06, | |
| "loss": 1.6129, | |
| "mean_token_accuracy": 0.6399691045284271, | |
| "num_tokens": 8390643.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 1.6603879615664483, | |
| "epoch": 0.704, | |
| "grad_norm": 4.8125, | |
| "learning_rate": 8.78e-06, | |
| "loss": 1.6006, | |
| "mean_token_accuracy": 0.6405742786824703, | |
| "num_tokens": 8583266.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 1.5886821828782558, | |
| "epoch": 0.72, | |
| "grad_norm": 4.53125, | |
| "learning_rate": 8.98e-06, | |
| "loss": 1.5254, | |
| "mean_token_accuracy": 0.6530824847519398, | |
| "num_tokens": 8786233.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 1.6262568928301335, | |
| "epoch": 0.736, | |
| "grad_norm": 5.0625, | |
| "learning_rate": 9.180000000000002e-06, | |
| "loss": 1.5597, | |
| "mean_token_accuracy": 0.6445462703704834, | |
| "num_tokens": 8982898.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 1.6295693822205066, | |
| "epoch": 0.752, | |
| "grad_norm": 4.90625, | |
| "learning_rate": 9.38e-06, | |
| "loss": 1.5648, | |
| "mean_token_accuracy": 0.6450414888560771, | |
| "num_tokens": 9174361.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 1.5759615428745746, | |
| "epoch": 0.768, | |
| "grad_norm": 4.4375, | |
| "learning_rate": 9.58e-06, | |
| "loss": 1.5144, | |
| "mean_token_accuracy": 0.653152122348547, | |
| "num_tokens": 9373430.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 1.6083305418491363, | |
| "epoch": 0.784, | |
| "grad_norm": 4.40625, | |
| "learning_rate": 9.780000000000001e-06, | |
| "loss": 1.5488, | |
| "mean_token_accuracy": 0.6459381774067878, | |
| "num_tokens": 9570409.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 1.6188334189355373, | |
| "epoch": 0.8, | |
| "grad_norm": 4.78125, | |
| "learning_rate": 9.980000000000001e-06, | |
| "loss": 1.5541, | |
| "mean_token_accuracy": 0.6456099949777127, | |
| "num_tokens": 9766360.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_biology_entropy": 1.570951623916626, | |
| "eval_biology_loss": 1.5443849563598633, | |
| "eval_biology_mean_token_accuracy": 0.6565022644996643, | |
| "eval_biology_num_tokens": 9766360.0, | |
| "eval_biology_runtime": 38.6538, | |
| "eval_biology_samples_per_second": 12.935, | |
| "eval_biology_steps_per_second": 3.234, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_physics_entropy": 0.6805566384792328, | |
| "eval_physics_loss": 0.6488668322563171, | |
| "eval_physics_mean_token_accuracy": 0.8232229099273681, | |
| "eval_physics_num_tokens": 9766360.0, | |
| "eval_physics_runtime": 56.9177, | |
| "eval_physics_samples_per_second": 8.785, | |
| "eval_physics_steps_per_second": 2.196, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 1.5766926735639573, | |
| "epoch": 0.816, | |
| "grad_norm": 5.28125, | |
| "learning_rate": 1.018e-05, | |
| "loss": 1.5247, | |
| "mean_token_accuracy": 0.6521722257137299, | |
| "num_tokens": 9963834.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 1.559133169800043, | |
| "epoch": 0.832, | |
| "grad_norm": 4.5, | |
| "learning_rate": 1.038e-05, | |
| "loss": 1.5075, | |
| "mean_token_accuracy": 0.6519753206521273, | |
| "num_tokens": 10160950.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 1.5535101789981127, | |
| "epoch": 0.848, | |
| "grad_norm": 5.0, | |
| "learning_rate": 1.0580000000000002e-05, | |
| "loss": 1.496, | |
| "mean_token_accuracy": 0.6550566371530294, | |
| "num_tokens": 10363131.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 1.573872973024845, | |
| "epoch": 0.864, | |
| "grad_norm": 4.375, | |
| "learning_rate": 1.0780000000000002e-05, | |
| "loss": 1.5151, | |
| "mean_token_accuracy": 0.6522478803992271, | |
| "num_tokens": 10558003.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 1.5461064357310534, | |
| "epoch": 0.88, | |
| "grad_norm": 4.59375, | |
| "learning_rate": 1.0980000000000002e-05, | |
| "loss": 1.4907, | |
| "mean_token_accuracy": 0.6562161970883608, | |
| "num_tokens": 10754470.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 1.5923856895416975, | |
| "epoch": 0.896, | |
| "grad_norm": 4.5, | |
| "learning_rate": 1.1180000000000001e-05, | |
| "loss": 1.5274, | |
| "mean_token_accuracy": 0.6506992731243372, | |
| "num_tokens": 10948943.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 1.546384008973837, | |
| "epoch": 0.912, | |
| "grad_norm": 4.65625, | |
| "learning_rate": 1.138e-05, | |
| "loss": 1.4879, | |
| "mean_token_accuracy": 0.655264026299119, | |
| "num_tokens": 11142613.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 1.5580323040485382, | |
| "epoch": 0.928, | |
| "grad_norm": 4.21875, | |
| "learning_rate": 1.1580000000000001e-05, | |
| "loss": 1.4914, | |
| "mean_token_accuracy": 0.6533704947680234, | |
| "num_tokens": 11336779.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 1.4954375229775905, | |
| "epoch": 0.944, | |
| "grad_norm": 4.15625, | |
| "learning_rate": 1.178e-05, | |
| "loss": 1.4496, | |
| "mean_token_accuracy": 0.6618651241064072, | |
| "num_tokens": 11536381.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 1.521157095208764, | |
| "epoch": 0.96, | |
| "grad_norm": 5.34375, | |
| "learning_rate": 1.198e-05, | |
| "loss": 1.4654, | |
| "mean_token_accuracy": 0.6575134672224522, | |
| "num_tokens": 11735527.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_biology_entropy": 1.4938988003730773, | |
| "eval_biology_loss": 1.4810906648635864, | |
| "eval_biology_mean_token_accuracy": 0.6647453079223633, | |
| "eval_biology_num_tokens": 11735527.0, | |
| "eval_biology_runtime": 38.6461, | |
| "eval_biology_samples_per_second": 12.938, | |
| "eval_biology_steps_per_second": 3.234, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_physics_entropy": 0.6846246910095215, | |
| "eval_physics_loss": 0.6483092308044434, | |
| "eval_physics_mean_token_accuracy": 0.8228992919921875, | |
| "eval_physics_num_tokens": 11735527.0, | |
| "eval_physics_runtime": 57.2247, | |
| "eval_physics_samples_per_second": 8.737, | |
| "eval_physics_steps_per_second": 2.184, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 1.4897711608558892, | |
| "epoch": 0.976, | |
| "grad_norm": 4.34375, | |
| "learning_rate": 1.218e-05, | |
| "loss": 1.4364, | |
| "mean_token_accuracy": 0.6634432837367058, | |
| "num_tokens": 11930935.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 1.5174701895564795, | |
| "epoch": 0.992, | |
| "grad_norm": 4.53125, | |
| "learning_rate": 1.2380000000000002e-05, | |
| "loss": 1.4657, | |
| "mean_token_accuracy": 0.6584253456443548, | |
| "num_tokens": 12129585.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 1.4917617563158274, | |
| "epoch": 1.008, | |
| "grad_norm": 4.5, | |
| "learning_rate": 1.2580000000000002e-05, | |
| "loss": 1.4435, | |
| "mean_token_accuracy": 0.6625992625951767, | |
| "num_tokens": 12325847.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 1.429138394445181, | |
| "epoch": 1.024, | |
| "grad_norm": 4.34375, | |
| "learning_rate": 1.2780000000000001e-05, | |
| "loss": 1.3693, | |
| "mean_token_accuracy": 0.6733787953853607, | |
| "num_tokens": 12530820.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 1.4665595509111882, | |
| "epoch": 1.04, | |
| "grad_norm": 4.65625, | |
| "learning_rate": 1.2980000000000001e-05, | |
| "loss": 1.4151, | |
| "mean_token_accuracy": 0.6643557522445918, | |
| "num_tokens": 12725384.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 1.4320495713502168, | |
| "epoch": 1.056, | |
| "grad_norm": 4.65625, | |
| "learning_rate": 1.3180000000000001e-05, | |
| "loss": 1.3639, | |
| "mean_token_accuracy": 0.6728409513831138, | |
| "num_tokens": 12924592.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 1.456987388432026, | |
| "epoch": 1.072, | |
| "grad_norm": 4.65625, | |
| "learning_rate": 1.3380000000000002e-05, | |
| "loss": 1.3964, | |
| "mean_token_accuracy": 0.6672081802040338, | |
| "num_tokens": 13112848.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 1.4462645407766104, | |
| "epoch": 1.088, | |
| "grad_norm": 4.21875, | |
| "learning_rate": 1.3580000000000002e-05, | |
| "loss": 1.3914, | |
| "mean_token_accuracy": 0.6701537113636732, | |
| "num_tokens": 13305725.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 1.439976465329528, | |
| "epoch": 1.104, | |
| "grad_norm": 4.4375, | |
| "learning_rate": 1.378e-05, | |
| "loss": 1.3853, | |
| "mean_token_accuracy": 0.6701969560235739, | |
| "num_tokens": 13498685.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 1.4542303152382374, | |
| "epoch": 1.12, | |
| "grad_norm": 4.6875, | |
| "learning_rate": 1.398e-05, | |
| "loss": 1.3835, | |
| "mean_token_accuracy": 0.6701514054089784, | |
| "num_tokens": 13682146.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "eval_biology_entropy": 1.3601415209770202, | |
| "eval_biology_loss": 1.4304583072662354, | |
| "eval_biology_mean_token_accuracy": 0.6715204911231994, | |
| "eval_biology_num_tokens": 13682146.0, | |
| "eval_biology_runtime": 38.6654, | |
| "eval_biology_samples_per_second": 12.931, | |
| "eval_biology_steps_per_second": 3.233, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "eval_physics_entropy": 0.65302716588974, | |
| "eval_physics_loss": 0.6474171876907349, | |
| "eval_physics_mean_token_accuracy": 0.8226029920578003, | |
| "eval_physics_num_tokens": 13682146.0, | |
| "eval_physics_runtime": 56.8929, | |
| "eval_physics_samples_per_second": 8.788, | |
| "eval_physics_steps_per_second": 2.197, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 1.4230232119560242, | |
| "epoch": 1.1360000000000001, | |
| "grad_norm": 4.9375, | |
| "learning_rate": 1.418e-05, | |
| "loss": 1.3747, | |
| "mean_token_accuracy": 0.6713559620082379, | |
| "num_tokens": 13876656.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 1.398179117962718, | |
| "epoch": 1.152, | |
| "grad_norm": 3.828125, | |
| "learning_rate": 1.4380000000000001e-05, | |
| "loss": 1.3438, | |
| "mean_token_accuracy": 0.675188298150897, | |
| "num_tokens": 14086104.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 1.4270055878907442, | |
| "epoch": 1.168, | |
| "grad_norm": 4.375, | |
| "learning_rate": 1.4580000000000001e-05, | |
| "loss": 1.373, | |
| "mean_token_accuracy": 0.6712808165699243, | |
| "num_tokens": 14274520.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 1.4195893313735723, | |
| "epoch": 1.184, | |
| "grad_norm": 4.0, | |
| "learning_rate": 1.478e-05, | |
| "loss": 1.3657, | |
| "mean_token_accuracy": 0.6732822258025408, | |
| "num_tokens": 14473623.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 1.4137178607285024, | |
| "epoch": 1.2, | |
| "grad_norm": 4.4375, | |
| "learning_rate": 1.498e-05, | |
| "loss": 1.3527, | |
| "mean_token_accuracy": 0.6739591065794229, | |
| "num_tokens": 14661238.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 1.3734467361122369, | |
| "epoch": 1.216, | |
| "grad_norm": 4.25, | |
| "learning_rate": 1.5180000000000002e-05, | |
| "loss": 1.3173, | |
| "mean_token_accuracy": 0.6782099220901727, | |
| "num_tokens": 14865915.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 1.3874890733510257, | |
| "epoch": 1.232, | |
| "grad_norm": 3.953125, | |
| "learning_rate": 1.5380000000000002e-05, | |
| "loss": 1.3318, | |
| "mean_token_accuracy": 0.6802723795175553, | |
| "num_tokens": 15055682.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 1.3982337843626738, | |
| "epoch": 1.248, | |
| "grad_norm": 4.28125, | |
| "learning_rate": 1.5580000000000003e-05, | |
| "loss": 1.3481, | |
| "mean_token_accuracy": 0.6750583201646805, | |
| "num_tokens": 15249576.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 1.4128487702459096, | |
| "epoch": 1.264, | |
| "grad_norm": 3.78125, | |
| "learning_rate": 1.578e-05, | |
| "loss": 1.3488, | |
| "mean_token_accuracy": 0.6755638718605042, | |
| "num_tokens": 15446149.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 1.4168333638459445, | |
| "epoch": 1.28, | |
| "grad_norm": 4.46875, | |
| "learning_rate": 1.5980000000000003e-05, | |
| "loss": 1.3691, | |
| "mean_token_accuracy": 0.6701620414853096, | |
| "num_tokens": 15639317.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "eval_biology_entropy": 1.3710014867782592, | |
| "eval_biology_loss": 1.3889533281326294, | |
| "eval_biology_mean_token_accuracy": 0.6776719264984131, | |
| "eval_biology_num_tokens": 15639317.0, | |
| "eval_biology_runtime": 38.6779, | |
| "eval_biology_samples_per_second": 12.927, | |
| "eval_biology_steps_per_second": 3.232, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "eval_physics_entropy": 0.6683794283866882, | |
| "eval_physics_loss": 0.6475260257720947, | |
| "eval_physics_mean_token_accuracy": 0.8223769798278808, | |
| "eval_physics_num_tokens": 15639317.0, | |
| "eval_physics_runtime": 56.9416, | |
| "eval_physics_samples_per_second": 8.781, | |
| "eval_physics_steps_per_second": 2.195, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 1.415640415251255, | |
| "epoch": 1.296, | |
| "grad_norm": 3.890625, | |
| "learning_rate": 1.618e-05, | |
| "loss": 1.3559, | |
| "mean_token_accuracy": 0.6743101593106985, | |
| "num_tokens": 15835805.0, | |
| "step": 810 | |
| }, | |
| { | |
| "entropy": 1.4065573081374168, | |
| "epoch": 1.312, | |
| "grad_norm": 3.796875, | |
| "learning_rate": 1.638e-05, | |
| "loss": 1.3441, | |
| "mean_token_accuracy": 0.6742365792393684, | |
| "num_tokens": 16028375.0, | |
| "step": 820 | |
| }, | |
| { | |
| "entropy": 1.4003059070557355, | |
| "epoch": 1.328, | |
| "grad_norm": 3.9375, | |
| "learning_rate": 1.658e-05, | |
| "loss": 1.3416, | |
| "mean_token_accuracy": 0.6751531921327114, | |
| "num_tokens": 16230555.0, | |
| "step": 830 | |
| }, | |
| { | |
| "entropy": 1.3695629570633172, | |
| "epoch": 1.3439999999999999, | |
| "grad_norm": 3.46875, | |
| "learning_rate": 1.6780000000000002e-05, | |
| "loss": 1.323, | |
| "mean_token_accuracy": 0.6816294804215431, | |
| "num_tokens": 16436297.0, | |
| "step": 840 | |
| }, | |
| { | |
| "entropy": 1.3895002137869596, | |
| "epoch": 1.3599999999999999, | |
| "grad_norm": 4.15625, | |
| "learning_rate": 1.698e-05, | |
| "loss": 1.3223, | |
| "mean_token_accuracy": 0.678603322058916, | |
| "num_tokens": 16630563.0, | |
| "step": 850 | |
| }, | |
| { | |
| "entropy": 1.4141130354255438, | |
| "epoch": 1.376, | |
| "grad_norm": 4.03125, | |
| "learning_rate": 1.718e-05, | |
| "loss": 1.3514, | |
| "mean_token_accuracy": 0.6730918735265732, | |
| "num_tokens": 16822602.0, | |
| "step": 860 | |
| }, | |
| { | |
| "entropy": 1.3758564725518228, | |
| "epoch": 1.392, | |
| "grad_norm": 3.640625, | |
| "learning_rate": 1.7380000000000003e-05, | |
| "loss": 1.3266, | |
| "mean_token_accuracy": 0.6803319718688726, | |
| "num_tokens": 17015750.0, | |
| "step": 870 | |
| }, | |
| { | |
| "entropy": 1.3721271395683288, | |
| "epoch": 1.408, | |
| "grad_norm": 3.453125, | |
| "learning_rate": 1.758e-05, | |
| "loss": 1.3173, | |
| "mean_token_accuracy": 0.6792092099785805, | |
| "num_tokens": 17207248.0, | |
| "step": 880 | |
| }, | |
| { | |
| "entropy": 1.378094071522355, | |
| "epoch": 1.424, | |
| "grad_norm": 3.78125, | |
| "learning_rate": 1.7780000000000003e-05, | |
| "loss": 1.3327, | |
| "mean_token_accuracy": 0.6793139282613992, | |
| "num_tokens": 17403869.0, | |
| "step": 890 | |
| }, | |
| { | |
| "entropy": 1.3699288651347161, | |
| "epoch": 1.44, | |
| "grad_norm": 4.5, | |
| "learning_rate": 1.798e-05, | |
| "loss": 1.3147, | |
| "mean_token_accuracy": 0.6776431813836098, | |
| "num_tokens": 17597428.0, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "eval_biology_entropy": 1.3595392904281616, | |
| "eval_biology_loss": 1.3485888242721558, | |
| "eval_biology_mean_token_accuracy": 0.6834584150314331, | |
| "eval_biology_num_tokens": 17597428.0, | |
| "eval_biology_runtime": 38.682, | |
| "eval_biology_samples_per_second": 12.926, | |
| "eval_biology_steps_per_second": 3.231, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "eval_physics_entropy": 0.6735630776882172, | |
| "eval_physics_loss": 0.6497707366943359, | |
| "eval_physics_mean_token_accuracy": 0.8218984279632569, | |
| "eval_physics_num_tokens": 17597428.0, | |
| "eval_physics_runtime": 56.9341, | |
| "eval_physics_samples_per_second": 8.782, | |
| "eval_physics_steps_per_second": 2.196, | |
| "step": 900 | |
| }, | |
| { | |
| "entropy": 1.3743872378021478, | |
| "epoch": 1.456, | |
| "grad_norm": 3.65625, | |
| "learning_rate": 1.8180000000000002e-05, | |
| "loss": 1.3244, | |
| "mean_token_accuracy": 0.6799841322004795, | |
| "num_tokens": 17798083.0, | |
| "step": 910 | |
| }, | |
| { | |
| "entropy": 1.3584794465452432, | |
| "epoch": 1.472, | |
| "grad_norm": 3.65625, | |
| "learning_rate": 1.8380000000000004e-05, | |
| "loss": 1.2994, | |
| "mean_token_accuracy": 0.6824300821870566, | |
| "num_tokens": 17993802.0, | |
| "step": 920 | |
| }, | |
| { | |
| "entropy": 1.3503762260079384, | |
| "epoch": 1.488, | |
| "grad_norm": 3.796875, | |
| "learning_rate": 1.858e-05, | |
| "loss": 1.3014, | |
| "mean_token_accuracy": 0.6844853088259697, | |
| "num_tokens": 18185570.0, | |
| "step": 930 | |
| }, | |
| { | |
| "entropy": 1.3729203592985868, | |
| "epoch": 1.504, | |
| "grad_norm": 3.46875, | |
| "learning_rate": 1.878e-05, | |
| "loss": 1.3216, | |
| "mean_token_accuracy": 0.6780106417834759, | |
| "num_tokens": 18377903.0, | |
| "step": 940 | |
| }, | |
| { | |
| "entropy": 1.3659293711185456, | |
| "epoch": 1.52, | |
| "grad_norm": 3.46875, | |
| "learning_rate": 1.898e-05, | |
| "loss": 1.318, | |
| "mean_token_accuracy": 0.6781526006758213, | |
| "num_tokens": 18579284.0, | |
| "step": 950 | |
| }, | |
| { | |
| "entropy": 1.366529828310013, | |
| "epoch": 1.536, | |
| "grad_norm": 4.5, | |
| "learning_rate": 1.918e-05, | |
| "loss": 1.3038, | |
| "mean_token_accuracy": 0.6802028331905603, | |
| "num_tokens": 18765324.0, | |
| "step": 960 | |
| }, | |
| { | |
| "entropy": 1.3478929094970227, | |
| "epoch": 1.552, | |
| "grad_norm": 3.9375, | |
| "learning_rate": 1.938e-05, | |
| "loss": 1.2881, | |
| "mean_token_accuracy": 0.68250414095819, | |
| "num_tokens": 18966382.0, | |
| "step": 970 | |
| }, | |
| { | |
| "entropy": 1.339873855561018, | |
| "epoch": 1.568, | |
| "grad_norm": 3.65625, | |
| "learning_rate": 1.9580000000000002e-05, | |
| "loss": 1.2915, | |
| "mean_token_accuracy": 0.6844375465065241, | |
| "num_tokens": 19160509.0, | |
| "step": 980 | |
| }, | |
| { | |
| "entropy": 1.3251274175941945, | |
| "epoch": 1.584, | |
| "grad_norm": 3.34375, | |
| "learning_rate": 1.978e-05, | |
| "loss": 1.2752, | |
| "mean_token_accuracy": 0.6852322082966567, | |
| "num_tokens": 19354662.0, | |
| "step": 990 | |
| }, | |
| { | |
| "entropy": 1.384224559739232, | |
| "epoch": 1.6, | |
| "grad_norm": 3.484375, | |
| "learning_rate": 1.9980000000000002e-05, | |
| "loss": 1.3287, | |
| "mean_token_accuracy": 0.6757747534662485, | |
| "num_tokens": 19543088.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_biology_entropy": 1.337540853023529, | |
| "eval_biology_loss": 1.3165961503982544, | |
| "eval_biology_mean_token_accuracy": 0.6879782662391662, | |
| "eval_biology_num_tokens": 19543088.0, | |
| "eval_biology_runtime": 38.6581, | |
| "eval_biology_samples_per_second": 12.934, | |
| "eval_biology_steps_per_second": 3.233, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_physics_entropy": 0.6811050243377685, | |
| "eval_physics_loss": 0.6504300832748413, | |
| "eval_physics_mean_token_accuracy": 0.8209615325927735, | |
| "eval_physics_num_tokens": 19543088.0, | |
| "eval_physics_runtime": 56.8501, | |
| "eval_physics_samples_per_second": 8.795, | |
| "eval_physics_steps_per_second": 2.199, | |
| "step": 1000 | |
| }, | |
| { | |
| "entropy": 1.3300264578312635, | |
| "epoch": 1.616, | |
| "grad_norm": 3.3125, | |
| "learning_rate": 1.9980000000000002e-05, | |
| "loss": 1.2769, | |
| "mean_token_accuracy": 0.685854634270072, | |
| "num_tokens": 19743506.0, | |
| "step": 1010 | |
| }, | |
| { | |
| "entropy": 1.3689603984355927, | |
| "epoch": 1.6320000000000001, | |
| "grad_norm": 3.609375, | |
| "learning_rate": 1.995777777777778e-05, | |
| "loss": 1.3153, | |
| "mean_token_accuracy": 0.6796921446919442, | |
| "num_tokens": 19937567.0, | |
| "step": 1020 | |
| }, | |
| { | |
| "entropy": 1.3543198898434639, | |
| "epoch": 1.6480000000000001, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 1.9935555555555557e-05, | |
| "loss": 1.2901, | |
| "mean_token_accuracy": 0.6821238681674003, | |
| "num_tokens": 20129743.0, | |
| "step": 1030 | |
| }, | |
| { | |
| "entropy": 1.3178342673927546, | |
| "epoch": 1.6640000000000001, | |
| "grad_norm": 3.640625, | |
| "learning_rate": 1.9913333333333335e-05, | |
| "loss": 1.2644, | |
| "mean_token_accuracy": 0.6889090698212386, | |
| "num_tokens": 20331548.0, | |
| "step": 1040 | |
| }, | |
| { | |
| "entropy": 1.3326357528567314, | |
| "epoch": 1.6800000000000002, | |
| "grad_norm": 3.71875, | |
| "learning_rate": 1.9891111111111112e-05, | |
| "loss": 1.2834, | |
| "mean_token_accuracy": 0.6844820819795132, | |
| "num_tokens": 20524247.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "entropy": 1.327241437882185, | |
| "epoch": 1.696, | |
| "grad_norm": 3.078125, | |
| "learning_rate": 1.986888888888889e-05, | |
| "loss": 1.2708, | |
| "mean_token_accuracy": 0.6858115192502737, | |
| "num_tokens": 20718121.0, | |
| "step": 1060 | |
| }, | |
| { | |
| "entropy": 1.3415468953549863, | |
| "epoch": 1.712, | |
| "grad_norm": 3.40625, | |
| "learning_rate": 1.9846666666666668e-05, | |
| "loss": 1.2797, | |
| "mean_token_accuracy": 0.684544776007533, | |
| "num_tokens": 20921419.0, | |
| "step": 1070 | |
| }, | |
| { | |
| "entropy": 1.320882337912917, | |
| "epoch": 1.728, | |
| "grad_norm": 3.859375, | |
| "learning_rate": 1.9824444444444445e-05, | |
| "loss": 1.267, | |
| "mean_token_accuracy": 0.6839414596557617, | |
| "num_tokens": 21118581.0, | |
| "step": 1080 | |
| }, | |
| { | |
| "entropy": 1.2898854076862336, | |
| "epoch": 1.744, | |
| "grad_norm": 3.5625, | |
| "learning_rate": 1.9802222222222226e-05, | |
| "loss": 1.2375, | |
| "mean_token_accuracy": 0.6910523246973753, | |
| "num_tokens": 21315128.0, | |
| "step": 1090 | |
| }, | |
| { | |
| "entropy": 1.31239253282547, | |
| "epoch": 1.76, | |
| "grad_norm": 3.265625, | |
| "learning_rate": 1.978e-05, | |
| "loss": 1.2603, | |
| "mean_token_accuracy": 0.6899993035942316, | |
| "num_tokens": 21511066.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "eval_biology_entropy": 1.2963400874137878, | |
| "eval_biology_loss": 1.2855476140975952, | |
| "eval_biology_mean_token_accuracy": 0.6923068904876709, | |
| "eval_biology_num_tokens": 21511066.0, | |
| "eval_biology_runtime": 38.6761, | |
| "eval_biology_samples_per_second": 12.928, | |
| "eval_biology_steps_per_second": 3.232, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "eval_physics_entropy": 0.6632856893539428, | |
| "eval_physics_loss": 0.6495886445045471, | |
| "eval_physics_mean_token_accuracy": 0.8213236899375915, | |
| "eval_physics_num_tokens": 21511066.0, | |
| "eval_physics_runtime": 56.9409, | |
| "eval_physics_samples_per_second": 8.781, | |
| "eval_physics_steps_per_second": 2.195, | |
| "step": 1100 | |
| }, | |
| { | |
| "entropy": 1.3157803509384394, | |
| "epoch": 1.776, | |
| "grad_norm": 3.59375, | |
| "learning_rate": 1.975777777777778e-05, | |
| "loss": 1.2599, | |
| "mean_token_accuracy": 0.6879700906574726, | |
| "num_tokens": 21704368.0, | |
| "step": 1110 | |
| }, | |
| { | |
| "entropy": 1.2888959780335427, | |
| "epoch": 1.792, | |
| "grad_norm": 3.3125, | |
| "learning_rate": 1.9735555555555556e-05, | |
| "loss": 1.2331, | |
| "mean_token_accuracy": 0.6911204513162374, | |
| "num_tokens": 21905784.0, | |
| "step": 1120 | |
| }, | |
| { | |
| "entropy": 1.3017573833465577, | |
| "epoch": 1.808, | |
| "grad_norm": 3.34375, | |
| "learning_rate": 1.9713333333333337e-05, | |
| "loss": 1.2521, | |
| "mean_token_accuracy": 0.6918430212885142, | |
| "num_tokens": 22094159.0, | |
| "step": 1130 | |
| }, | |
| { | |
| "entropy": 1.329062307998538, | |
| "epoch": 1.8239999999999998, | |
| "grad_norm": 3.078125, | |
| "learning_rate": 1.969111111111111e-05, | |
| "loss": 1.2592, | |
| "mean_token_accuracy": 0.684392886608839, | |
| "num_tokens": 22290654.0, | |
| "step": 1140 | |
| }, | |
| { | |
| "entropy": 1.339531321078539, | |
| "epoch": 1.8399999999999999, | |
| "grad_norm": 3.21875, | |
| "learning_rate": 1.9668888888888892e-05, | |
| "loss": 1.2846, | |
| "mean_token_accuracy": 0.6835382975637913, | |
| "num_tokens": 22485675.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "entropy": 1.295450346916914, | |
| "epoch": 1.8559999999999999, | |
| "grad_norm": 3.3125, | |
| "learning_rate": 1.9646666666666666e-05, | |
| "loss": 1.2508, | |
| "mean_token_accuracy": 0.6886295765638352, | |
| "num_tokens": 22689900.0, | |
| "step": 1160 | |
| }, | |
| { | |
| "entropy": 1.2797737825661897, | |
| "epoch": 1.8719999999999999, | |
| "grad_norm": 2.90625, | |
| "learning_rate": 1.9624444444444447e-05, | |
| "loss": 1.2204, | |
| "mean_token_accuracy": 0.6948459330946207, | |
| "num_tokens": 22887275.0, | |
| "step": 1170 | |
| }, | |
| { | |
| "entropy": 1.3289342291653157, | |
| "epoch": 1.888, | |
| "grad_norm": 3.390625, | |
| "learning_rate": 1.9602222222222225e-05, | |
| "loss": 1.2744, | |
| "mean_token_accuracy": 0.6839813269674778, | |
| "num_tokens": 23081241.0, | |
| "step": 1180 | |
| }, | |
| { | |
| "entropy": 1.2998227212578057, | |
| "epoch": 1.904, | |
| "grad_norm": 3.3125, | |
| "learning_rate": 1.9580000000000002e-05, | |
| "loss": 1.2522, | |
| "mean_token_accuracy": 0.6898551240563393, | |
| "num_tokens": 23276417.0, | |
| "step": 1190 | |
| }, | |
| { | |
| "entropy": 1.300378565862775, | |
| "epoch": 1.92, | |
| "grad_norm": 3.390625, | |
| "learning_rate": 1.955777777777778e-05, | |
| "loss": 1.2471, | |
| "mean_token_accuracy": 0.6882933590561151, | |
| "num_tokens": 23476340.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "eval_biology_entropy": 1.2630186223983764, | |
| "eval_biology_loss": 1.2595529556274414, | |
| "eval_biology_mean_token_accuracy": 0.6963475484848023, | |
| "eval_biology_num_tokens": 23476340.0, | |
| "eval_biology_runtime": 38.6878, | |
| "eval_biology_samples_per_second": 12.924, | |
| "eval_biology_steps_per_second": 3.231, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "eval_physics_entropy": 0.6685261228084565, | |
| "eval_physics_loss": 0.6503589749336243, | |
| "eval_physics_mean_token_accuracy": 0.8208199772834778, | |
| "eval_physics_num_tokens": 23476340.0, | |
| "eval_physics_runtime": 56.9337, | |
| "eval_physics_samples_per_second": 8.782, | |
| "eval_physics_steps_per_second": 2.196, | |
| "step": 1200 | |
| }, | |
| { | |
| "entropy": 1.2681011691689492, | |
| "epoch": 1.936, | |
| "grad_norm": 3.171875, | |
| "learning_rate": 1.9535555555555557e-05, | |
| "loss": 1.2121, | |
| "mean_token_accuracy": 0.6959899343550205, | |
| "num_tokens": 23673208.0, | |
| "step": 1210 | |
| }, | |
| { | |
| "entropy": 1.29563331566751, | |
| "epoch": 1.952, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 1.9513333333333335e-05, | |
| "loss": 1.2335, | |
| "mean_token_accuracy": 0.6913977723568678, | |
| "num_tokens": 23870993.0, | |
| "step": 1220 | |
| }, | |
| { | |
| "entropy": 1.315883708000183, | |
| "epoch": 1.968, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 1.9491111111111113e-05, | |
| "loss": 1.269, | |
| "mean_token_accuracy": 0.6871387537568807, | |
| "num_tokens": 24066234.0, | |
| "step": 1230 | |
| }, | |
| { | |
| "entropy": 1.2918797850608825, | |
| "epoch": 1.984, | |
| "grad_norm": 3.28125, | |
| "learning_rate": 1.946888888888889e-05, | |
| "loss": 1.2409, | |
| "mean_token_accuracy": 0.690430635213852, | |
| "num_tokens": 24262389.0, | |
| "step": 1240 | |
| }, | |
| { | |
| "entropy": 1.2895139936357736, | |
| "epoch": 2.0, | |
| "grad_norm": 3.171875, | |
| "learning_rate": 1.9446666666666668e-05, | |
| "loss": 1.2465, | |
| "mean_token_accuracy": 0.6922377996146679, | |
| "num_tokens": 24455544.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "entropy": 1.1053394179791212, | |
| "epoch": 2.016, | |
| "grad_norm": 4.0625, | |
| "learning_rate": 1.9424444444444446e-05, | |
| "loss": 1.0366, | |
| "mean_token_accuracy": 0.7227501507848502, | |
| "num_tokens": 24645425.0, | |
| "step": 1260 | |
| }, | |
| { | |
| "entropy": 1.0759795285761355, | |
| "epoch": 2.032, | |
| "grad_norm": 3.65625, | |
| "learning_rate": 1.9402222222222223e-05, | |
| "loss": 1.0324, | |
| "mean_token_accuracy": 0.7241251844912767, | |
| "num_tokens": 24835956.0, | |
| "step": 1270 | |
| }, | |
| { | |
| "entropy": 1.065844392031431, | |
| "epoch": 2.048, | |
| "grad_norm": 3.59375, | |
| "learning_rate": 1.938e-05, | |
| "loss": 0.9998, | |
| "mean_token_accuracy": 0.7277645654976368, | |
| "num_tokens": 25036470.0, | |
| "step": 1280 | |
| }, | |
| { | |
| "entropy": 1.0796132825315, | |
| "epoch": 2.064, | |
| "grad_norm": 3.4375, | |
| "learning_rate": 1.935777777777778e-05, | |
| "loss": 1.0147, | |
| "mean_token_accuracy": 0.7255880139768124, | |
| "num_tokens": 25230959.0, | |
| "step": 1290 | |
| }, | |
| { | |
| "entropy": 1.0651828214526176, | |
| "epoch": 2.08, | |
| "grad_norm": 3.25, | |
| "learning_rate": 1.9335555555555556e-05, | |
| "loss": 1.0108, | |
| "mean_token_accuracy": 0.727525070682168, | |
| "num_tokens": 25422625.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "eval_biology_entropy": 1.0677382831573485, | |
| "eval_biology_loss": 1.290307641029358, | |
| "eval_biology_mean_token_accuracy": 0.6951840238571168, | |
| "eval_biology_num_tokens": 25422625.0, | |
| "eval_biology_runtime": 38.6809, | |
| "eval_biology_samples_per_second": 12.926, | |
| "eval_biology_steps_per_second": 3.232, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "eval_physics_entropy": 0.5783973336219788, | |
| "eval_physics_loss": 0.6678071022033691, | |
| "eval_physics_mean_token_accuracy": 0.8184197754859924, | |
| "eval_physics_num_tokens": 25422625.0, | |
| "eval_physics_runtime": 56.9183, | |
| "eval_physics_samples_per_second": 8.785, | |
| "eval_physics_steps_per_second": 2.196, | |
| "step": 1300 | |
| }, | |
| { | |
| "entropy": 1.0735113076865672, | |
| "epoch": 2.096, | |
| "grad_norm": 3.484375, | |
| "learning_rate": 1.9313333333333334e-05, | |
| "loss": 1.0185, | |
| "mean_token_accuracy": 0.7264322709292174, | |
| "num_tokens": 25614286.0, | |
| "step": 1310 | |
| }, | |
| { | |
| "entropy": 1.0468817871063947, | |
| "epoch": 2.112, | |
| "grad_norm": 3.015625, | |
| "learning_rate": 1.9291111111111115e-05, | |
| "loss": 0.9933, | |
| "mean_token_accuracy": 0.7301209557801485, | |
| "num_tokens": 25810384.0, | |
| "step": 1320 | |
| }, | |
| { | |
| "entropy": 1.057022250816226, | |
| "epoch": 2.128, | |
| "grad_norm": 3.5, | |
| "learning_rate": 1.926888888888889e-05, | |
| "loss": 1.0037, | |
| "mean_token_accuracy": 0.7284754924476147, | |
| "num_tokens": 26005047.0, | |
| "step": 1330 | |
| }, | |
| { | |
| "entropy": 1.0537616554647684, | |
| "epoch": 2.144, | |
| "grad_norm": 4.09375, | |
| "learning_rate": 1.924666666666667e-05, | |
| "loss": 1.0101, | |
| "mean_token_accuracy": 0.7281042989343405, | |
| "num_tokens": 26207169.0, | |
| "step": 1340 | |
| }, | |
| { | |
| "entropy": 1.0744788832962513, | |
| "epoch": 2.16, | |
| "grad_norm": 3.546875, | |
| "learning_rate": 1.9224444444444444e-05, | |
| "loss": 1.0203, | |
| "mean_token_accuracy": 0.7265679869800806, | |
| "num_tokens": 26402150.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "entropy": 1.0806033357977867, | |
| "epoch": 2.176, | |
| "grad_norm": 3.546875, | |
| "learning_rate": 1.9202222222222225e-05, | |
| "loss": 1.0319, | |
| "mean_token_accuracy": 0.7219661567360163, | |
| "num_tokens": 26599464.0, | |
| "step": 1360 | |
| }, | |
| { | |
| "entropy": 1.0509729556739331, | |
| "epoch": 2.192, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 1.918e-05, | |
| "loss": 0.9872, | |
| "mean_token_accuracy": 0.7306252598762513, | |
| "num_tokens": 26790601.0, | |
| "step": 1370 | |
| }, | |
| { | |
| "entropy": 1.068820282816887, | |
| "epoch": 2.208, | |
| "grad_norm": 3.390625, | |
| "learning_rate": 1.915777777777778e-05, | |
| "loss": 1.0224, | |
| "mean_token_accuracy": 0.7255501691251993, | |
| "num_tokens": 26988598.0, | |
| "step": 1380 | |
| }, | |
| { | |
| "entropy": 1.0967998839914799, | |
| "epoch": 2.224, | |
| "grad_norm": 3.34375, | |
| "learning_rate": 1.9135555555555555e-05, | |
| "loss": 1.0393, | |
| "mean_token_accuracy": 0.7217595972120762, | |
| "num_tokens": 27181933.0, | |
| "step": 1390 | |
| }, | |
| { | |
| "entropy": 1.0403927110135556, | |
| "epoch": 2.24, | |
| "grad_norm": 3.28125, | |
| "learning_rate": 1.9113333333333336e-05, | |
| "loss": 0.982, | |
| "mean_token_accuracy": 0.7317505519837141, | |
| "num_tokens": 27378686.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "eval_biology_entropy": 1.053062720298767, | |
| "eval_biology_loss": 1.2769854068756104, | |
| "eval_biology_mean_token_accuracy": 0.6977940616607666, | |
| "eval_biology_num_tokens": 27378686.0, | |
| "eval_biology_runtime": 38.6188, | |
| "eval_biology_samples_per_second": 12.947, | |
| "eval_biology_steps_per_second": 3.237, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "eval_physics_entropy": 0.5554806351661682, | |
| "eval_physics_loss": 0.6756051182746887, | |
| "eval_physics_mean_token_accuracy": 0.8174587740898133, | |
| "eval_physics_num_tokens": 27378686.0, | |
| "eval_physics_runtime": 56.8627, | |
| "eval_physics_samples_per_second": 8.793, | |
| "eval_physics_steps_per_second": 2.198, | |
| "step": 1400 | |
| }, | |
| { | |
| "entropy": 1.0398972254246472, | |
| "epoch": 2.2560000000000002, | |
| "grad_norm": 3.296875, | |
| "learning_rate": 1.9091111111111113e-05, | |
| "loss": 0.9991, | |
| "mean_token_accuracy": 0.7316640116274357, | |
| "num_tokens": 27571908.0, | |
| "step": 1410 | |
| }, | |
| { | |
| "entropy": 1.0447341743856668, | |
| "epoch": 2.2720000000000002, | |
| "grad_norm": 3.28125, | |
| "learning_rate": 1.906888888888889e-05, | |
| "loss": 0.9972, | |
| "mean_token_accuracy": 0.728259452059865, | |
| "num_tokens": 27770478.0, | |
| "step": 1420 | |
| }, | |
| { | |
| "entropy": 1.0823504395782948, | |
| "epoch": 2.288, | |
| "grad_norm": 3.453125, | |
| "learning_rate": 1.904666666666667e-05, | |
| "loss": 1.0252, | |
| "mean_token_accuracy": 0.7220408860594034, | |
| "num_tokens": 27965558.0, | |
| "step": 1430 | |
| }, | |
| { | |
| "entropy": 1.0753419052809476, | |
| "epoch": 2.304, | |
| "grad_norm": 3.171875, | |
| "learning_rate": 1.9024444444444446e-05, | |
| "loss": 1.0203, | |
| "mean_token_accuracy": 0.7257058396935463, | |
| "num_tokens": 28165624.0, | |
| "step": 1440 | |
| }, | |
| { | |
| "entropy": 1.0463432766497136, | |
| "epoch": 2.32, | |
| "grad_norm": 3.234375, | |
| "learning_rate": 1.9002222222222224e-05, | |
| "loss": 0.9989, | |
| "mean_token_accuracy": 0.731269496306777, | |
| "num_tokens": 28363594.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "entropy": 1.0562681667506695, | |
| "epoch": 2.336, | |
| "grad_norm": 3.34375, | |
| "learning_rate": 1.898e-05, | |
| "loss": 1.0089, | |
| "mean_token_accuracy": 0.7288151986896991, | |
| "num_tokens": 28558104.0, | |
| "step": 1460 | |
| }, | |
| { | |
| "entropy": 1.07792302146554, | |
| "epoch": 2.352, | |
| "grad_norm": 3.296875, | |
| "learning_rate": 1.895777777777778e-05, | |
| "loss": 1.0219, | |
| "mean_token_accuracy": 0.7233119513839483, | |
| "num_tokens": 28747265.0, | |
| "step": 1470 | |
| }, | |
| { | |
| "entropy": 1.06618184261024, | |
| "epoch": 2.368, | |
| "grad_norm": 3.171875, | |
| "learning_rate": 1.8935555555555556e-05, | |
| "loss": 1.0027, | |
| "mean_token_accuracy": 0.7261252999305725, | |
| "num_tokens": 28944694.0, | |
| "step": 1480 | |
| }, | |
| { | |
| "entropy": 1.082535107806325, | |
| "epoch": 2.384, | |
| "grad_norm": 3.546875, | |
| "learning_rate": 1.8913333333333334e-05, | |
| "loss": 1.0256, | |
| "mean_token_accuracy": 0.7226470891386271, | |
| "num_tokens": 29136617.0, | |
| "step": 1490 | |
| }, | |
| { | |
| "entropy": 1.0529342222958804, | |
| "epoch": 2.4, | |
| "grad_norm": 3.28125, | |
| "learning_rate": 1.8891111111111115e-05, | |
| "loss": 0.9972, | |
| "mean_token_accuracy": 0.7272268772125244, | |
| "num_tokens": 29333526.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "eval_biology_entropy": 1.046908447265625, | |
| "eval_biology_loss": 1.2635581493377686, | |
| "eval_biology_mean_token_accuracy": 0.7004145407676696, | |
| "eval_biology_num_tokens": 29333526.0, | |
| "eval_biology_runtime": 38.7807, | |
| "eval_biology_samples_per_second": 12.893, | |
| "eval_biology_steps_per_second": 3.223, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "eval_physics_entropy": 0.5629296214580536, | |
| "eval_physics_loss": 0.6746402978897095, | |
| "eval_physics_mean_token_accuracy": 0.8170881199836731, | |
| "eval_physics_num_tokens": 29333526.0, | |
| "eval_physics_runtime": 56.9496, | |
| "eval_physics_samples_per_second": 8.78, | |
| "eval_physics_steps_per_second": 2.195, | |
| "step": 1500 | |
| }, | |
| { | |
| "entropy": 1.0541506320238114, | |
| "epoch": 2.416, | |
| "grad_norm": 3.65625, | |
| "learning_rate": 1.886888888888889e-05, | |
| "loss": 1.0065, | |
| "mean_token_accuracy": 0.7273991428315639, | |
| "num_tokens": 29534153.0, | |
| "step": 1510 | |
| }, | |
| { | |
| "entropy": 1.053679431974888, | |
| "epoch": 2.432, | |
| "grad_norm": 3.328125, | |
| "learning_rate": 1.884666666666667e-05, | |
| "loss": 0.9968, | |
| "mean_token_accuracy": 0.7284378662705422, | |
| "num_tokens": 29736571.0, | |
| "step": 1520 | |
| }, | |
| { | |
| "entropy": 1.0887313183397054, | |
| "epoch": 2.448, | |
| "grad_norm": 3.8125, | |
| "learning_rate": 1.8824444444444445e-05, | |
| "loss": 1.0429, | |
| "mean_token_accuracy": 0.7227712966501713, | |
| "num_tokens": 29932423.0, | |
| "step": 1530 | |
| }, | |
| { | |
| "entropy": 1.0347874112427236, | |
| "epoch": 2.464, | |
| "grad_norm": 3.640625, | |
| "learning_rate": 1.8802222222222226e-05, | |
| "loss": 0.9796, | |
| "mean_token_accuracy": 0.7323532313108444, | |
| "num_tokens": 30137125.0, | |
| "step": 1540 | |
| }, | |
| { | |
| "entropy": 1.048006160184741, | |
| "epoch": 2.48, | |
| "grad_norm": 3.390625, | |
| "learning_rate": 1.878e-05, | |
| "loss": 1.001, | |
| "mean_token_accuracy": 0.7286384627223015, | |
| "num_tokens": 30341656.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "entropy": 1.052099209651351, | |
| "epoch": 2.496, | |
| "grad_norm": 3.140625, | |
| "learning_rate": 1.875777777777778e-05, | |
| "loss": 0.9888, | |
| "mean_token_accuracy": 0.7288977019488811, | |
| "num_tokens": 30533732.0, | |
| "step": 1560 | |
| }, | |
| { | |
| "entropy": 1.0728581339120864, | |
| "epoch": 2.512, | |
| "grad_norm": 3.96875, | |
| "learning_rate": 1.873555555555556e-05, | |
| "loss": 1.0181, | |
| "mean_token_accuracy": 0.7265560120344162, | |
| "num_tokens": 30730599.0, | |
| "step": 1570 | |
| }, | |
| { | |
| "entropy": 1.0553664829581977, | |
| "epoch": 2.528, | |
| "grad_norm": 3.640625, | |
| "learning_rate": 1.8713333333333336e-05, | |
| "loss": 0.9993, | |
| "mean_token_accuracy": 0.7286464177072048, | |
| "num_tokens": 30927950.0, | |
| "step": 1580 | |
| }, | |
| { | |
| "entropy": 1.0592417079955339, | |
| "epoch": 2.544, | |
| "grad_norm": 3.46875, | |
| "learning_rate": 1.8691111111111114e-05, | |
| "loss": 1.0135, | |
| "mean_token_accuracy": 0.7272635698318481, | |
| "num_tokens": 31125667.0, | |
| "step": 1590 | |
| }, | |
| { | |
| "entropy": 1.0688998382538557, | |
| "epoch": 2.56, | |
| "grad_norm": 3.578125, | |
| "learning_rate": 1.866888888888889e-05, | |
| "loss": 1.0172, | |
| "mean_token_accuracy": 0.7267541728913784, | |
| "num_tokens": 31315742.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "eval_biology_entropy": 1.0306266536712647, | |
| "eval_biology_loss": 1.2540794610977173, | |
| "eval_biology_mean_token_accuracy": 0.7014414758682251, | |
| "eval_biology_num_tokens": 31315742.0, | |
| "eval_biology_runtime": 38.994, | |
| "eval_biology_samples_per_second": 12.822, | |
| "eval_biology_steps_per_second": 3.206, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "eval_physics_entropy": 0.5480240859985351, | |
| "eval_physics_loss": 0.6785013675689697, | |
| "eval_physics_mean_token_accuracy": 0.81651726770401, | |
| "eval_physics_num_tokens": 31315742.0, | |
| "eval_physics_runtime": 56.9063, | |
| "eval_physics_samples_per_second": 8.786, | |
| "eval_physics_steps_per_second": 2.197, | |
| "step": 1600 | |
| }, | |
| { | |
| "entropy": 1.072851437330246, | |
| "epoch": 2.576, | |
| "grad_norm": 3.3125, | |
| "learning_rate": 1.864666666666667e-05, | |
| "loss": 1.0177, | |
| "mean_token_accuracy": 0.7259860239923001, | |
| "num_tokens": 31505374.0, | |
| "step": 1610 | |
| }, | |
| { | |
| "entropy": 1.0911020763218402, | |
| "epoch": 2.592, | |
| "grad_norm": 3.375, | |
| "learning_rate": 1.8624444444444446e-05, | |
| "loss": 1.0375, | |
| "mean_token_accuracy": 0.7193623151630163, | |
| "num_tokens": 31701680.0, | |
| "step": 1620 | |
| }, | |
| { | |
| "entropy": 1.0759818445891143, | |
| "epoch": 2.608, | |
| "grad_norm": 3.265625, | |
| "learning_rate": 1.8602222222222224e-05, | |
| "loss": 1.024, | |
| "mean_token_accuracy": 0.7245077144354581, | |
| "num_tokens": 31893221.0, | |
| "step": 1630 | |
| }, | |
| { | |
| "entropy": 1.0434412751346827, | |
| "epoch": 2.624, | |
| "grad_norm": 3.453125, | |
| "learning_rate": 1.858e-05, | |
| "loss": 0.9884, | |
| "mean_token_accuracy": 0.7292033292353153, | |
| "num_tokens": 32087176.0, | |
| "step": 1640 | |
| }, | |
| { | |
| "entropy": 1.077140349522233, | |
| "epoch": 2.64, | |
| "grad_norm": 3.515625, | |
| "learning_rate": 1.855777777777778e-05, | |
| "loss": 1.0218, | |
| "mean_token_accuracy": 0.7251086611300707, | |
| "num_tokens": 32280689.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "entropy": 1.0449730832129718, | |
| "epoch": 2.656, | |
| "grad_norm": 3.234375, | |
| "learning_rate": 1.8535555555555557e-05, | |
| "loss": 0.9897, | |
| "mean_token_accuracy": 0.7329251658171415, | |
| "num_tokens": 32483392.0, | |
| "step": 1660 | |
| }, | |
| { | |
| "entropy": 1.0476958800107241, | |
| "epoch": 2.672, | |
| "grad_norm": 3.328125, | |
| "learning_rate": 1.8513333333333335e-05, | |
| "loss": 0.9996, | |
| "mean_token_accuracy": 0.7290864728391171, | |
| "num_tokens": 32683246.0, | |
| "step": 1670 | |
| }, | |
| { | |
| "entropy": 1.0343516409397124, | |
| "epoch": 2.6879999999999997, | |
| "grad_norm": 3.4375, | |
| "learning_rate": 1.8491111111111112e-05, | |
| "loss": 0.9794, | |
| "mean_token_accuracy": 0.7327864274382592, | |
| "num_tokens": 32884993.0, | |
| "step": 1680 | |
| }, | |
| { | |
| "entropy": 1.059150441735983, | |
| "epoch": 2.7039999999999997, | |
| "grad_norm": 3.25, | |
| "learning_rate": 1.846888888888889e-05, | |
| "loss": 1.004, | |
| "mean_token_accuracy": 0.7282994613051414, | |
| "num_tokens": 33088388.0, | |
| "step": 1690 | |
| }, | |
| { | |
| "entropy": 1.0319584112614393, | |
| "epoch": 2.7199999999999998, | |
| "grad_norm": 3.53125, | |
| "learning_rate": 1.8446666666666667e-05, | |
| "loss": 0.9858, | |
| "mean_token_accuracy": 0.7318238351494074, | |
| "num_tokens": 33287208.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "eval_biology_entropy": 1.0519089736938476, | |
| "eval_biology_loss": 1.2423198223114014, | |
| "eval_biology_mean_token_accuracy": 0.7037161078453064, | |
| "eval_biology_num_tokens": 33287208.0, | |
| "eval_biology_runtime": 38.8171, | |
| "eval_biology_samples_per_second": 12.881, | |
| "eval_biology_steps_per_second": 3.22, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "eval_physics_entropy": 0.5594763560295105, | |
| "eval_physics_loss": 0.677318811416626, | |
| "eval_physics_mean_token_accuracy": 0.8168268208503723, | |
| "eval_physics_num_tokens": 33287208.0, | |
| "eval_physics_runtime": 57.185, | |
| "eval_physics_samples_per_second": 8.744, | |
| "eval_physics_steps_per_second": 2.186, | |
| "step": 1700 | |
| }, | |
| { | |
| "entropy": 1.0736609652638436, | |
| "epoch": 2.7359999999999998, | |
| "grad_norm": 3.15625, | |
| "learning_rate": 1.842444444444445e-05, | |
| "loss": 1.017, | |
| "mean_token_accuracy": 0.7253531437367201, | |
| "num_tokens": 33474750.0, | |
| "step": 1710 | |
| }, | |
| { | |
| "entropy": 1.0549831412732602, | |
| "epoch": 2.752, | |
| "grad_norm": 3.625, | |
| "learning_rate": 1.8402222222222223e-05, | |
| "loss": 1.0054, | |
| "mean_token_accuracy": 0.7258293054997921, | |
| "num_tokens": 33664302.0, | |
| "step": 1720 | |
| }, | |
| { | |
| "entropy": 1.0876980781555177, | |
| "epoch": 2.768, | |
| "grad_norm": 3.65625, | |
| "learning_rate": 1.8380000000000004e-05, | |
| "loss": 1.0317, | |
| "mean_token_accuracy": 0.7200849443674088, | |
| "num_tokens": 33854081.0, | |
| "step": 1730 | |
| }, | |
| { | |
| "entropy": 1.046105058863759, | |
| "epoch": 2.784, | |
| "grad_norm": 3.09375, | |
| "learning_rate": 1.8357777777777778e-05, | |
| "loss": 0.9956, | |
| "mean_token_accuracy": 0.7304057106375694, | |
| "num_tokens": 34053149.0, | |
| "step": 1740 | |
| }, | |
| { | |
| "entropy": 1.0452948212623596, | |
| "epoch": 2.8, | |
| "grad_norm": 3.40625, | |
| "learning_rate": 1.833555555555556e-05, | |
| "loss": 0.9964, | |
| "mean_token_accuracy": 0.7318151947110891, | |
| "num_tokens": 34254105.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "entropy": 1.0454461514949798, | |
| "epoch": 2.816, | |
| "grad_norm": 3.1875, | |
| "learning_rate": 1.8313333333333333e-05, | |
| "loss": 0.9864, | |
| "mean_token_accuracy": 0.7298544138669968, | |
| "num_tokens": 34453342.0, | |
| "step": 1760 | |
| }, | |
| { | |
| "entropy": 1.0891260366886855, | |
| "epoch": 2.832, | |
| "grad_norm": 3.515625, | |
| "learning_rate": 1.8291111111111114e-05, | |
| "loss": 1.0274, | |
| "mean_token_accuracy": 0.7241302687674761, | |
| "num_tokens": 34644113.0, | |
| "step": 1770 | |
| }, | |
| { | |
| "entropy": 1.0674222145229577, | |
| "epoch": 2.848, | |
| "grad_norm": 3.4375, | |
| "learning_rate": 1.8268888888888888e-05, | |
| "loss": 1.0191, | |
| "mean_token_accuracy": 0.7263091344386339, | |
| "num_tokens": 34835062.0, | |
| "step": 1780 | |
| }, | |
| { | |
| "entropy": 1.047906568646431, | |
| "epoch": 2.864, | |
| "grad_norm": 3.234375, | |
| "learning_rate": 1.824666666666667e-05, | |
| "loss": 1.0015, | |
| "mean_token_accuracy": 0.7286620266735554, | |
| "num_tokens": 35032361.0, | |
| "step": 1790 | |
| }, | |
| { | |
| "entropy": 1.0795419167727232, | |
| "epoch": 2.88, | |
| "grad_norm": 3.421875, | |
| "learning_rate": 1.8224444444444447e-05, | |
| "loss": 1.0231, | |
| "mean_token_accuracy": 0.7250057961791754, | |
| "num_tokens": 35226653.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "eval_biology_entropy": 1.0592120084762573, | |
| "eval_biology_loss": 1.2307487726211548, | |
| "eval_biology_mean_token_accuracy": 0.7039423842430115, | |
| "eval_biology_num_tokens": 35226653.0, | |
| "eval_biology_runtime": 38.811, | |
| "eval_biology_samples_per_second": 12.883, | |
| "eval_biology_steps_per_second": 3.221, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "eval_physics_entropy": 0.5607087211608887, | |
| "eval_physics_loss": 0.6777584552764893, | |
| "eval_physics_mean_token_accuracy": 0.816848436832428, | |
| "eval_physics_num_tokens": 35226653.0, | |
| "eval_physics_runtime": 56.9878, | |
| "eval_physics_samples_per_second": 8.774, | |
| "eval_physics_steps_per_second": 2.193, | |
| "step": 1800 | |
| }, | |
| { | |
| "entropy": 1.0618976611644029, | |
| "epoch": 2.896, | |
| "grad_norm": 3.3125, | |
| "learning_rate": 1.8202222222222225e-05, | |
| "loss": 1.0068, | |
| "mean_token_accuracy": 0.7253085561096668, | |
| "num_tokens": 35421873.0, | |
| "step": 1810 | |
| }, | |
| { | |
| "entropy": 1.0810394674539565, | |
| "epoch": 2.912, | |
| "grad_norm": 3.6875, | |
| "learning_rate": 1.8180000000000002e-05, | |
| "loss": 1.0287, | |
| "mean_token_accuracy": 0.7242106914520263, | |
| "num_tokens": 35614590.0, | |
| "step": 1820 | |
| }, | |
| { | |
| "entropy": 1.0641514841467141, | |
| "epoch": 2.928, | |
| "grad_norm": 3.578125, | |
| "learning_rate": 1.815777777777778e-05, | |
| "loss": 1.0163, | |
| "mean_token_accuracy": 0.7251328159123659, | |
| "num_tokens": 35806494.0, | |
| "step": 1830 | |
| }, | |
| { | |
| "entropy": 1.0547591600567103, | |
| "epoch": 2.944, | |
| "grad_norm": 3.1875, | |
| "learning_rate": 1.8135555555555557e-05, | |
| "loss": 1.0087, | |
| "mean_token_accuracy": 0.7291445326060056, | |
| "num_tokens": 36001867.0, | |
| "step": 1840 | |
| }, | |
| { | |
| "entropy": 1.0459509972482919, | |
| "epoch": 2.96, | |
| "grad_norm": 3.484375, | |
| "learning_rate": 1.8113333333333335e-05, | |
| "loss": 0.9846, | |
| "mean_token_accuracy": 0.7305242072790861, | |
| "num_tokens": 36198806.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "entropy": 1.0438212916254996, | |
| "epoch": 2.976, | |
| "grad_norm": 3.296875, | |
| "learning_rate": 1.8091111111111113e-05, | |
| "loss": 0.9897, | |
| "mean_token_accuracy": 0.7307346038520336, | |
| "num_tokens": 36393098.0, | |
| "step": 1860 | |
| }, | |
| { | |
| "entropy": 1.0572807922959329, | |
| "epoch": 2.992, | |
| "grad_norm": 3.359375, | |
| "learning_rate": 1.806888888888889e-05, | |
| "loss": 1.005, | |
| "mean_token_accuracy": 0.728681518137455, | |
| "num_tokens": 36589074.0, | |
| "step": 1870 | |
| }, | |
| { | |
| "entropy": 0.9687618412077427, | |
| "epoch": 3.008, | |
| "grad_norm": 4.75, | |
| "learning_rate": 1.8046666666666668e-05, | |
| "loss": 0.8593, | |
| "mean_token_accuracy": 0.7585306584835052, | |
| "num_tokens": 36778710.0, | |
| "step": 1880 | |
| }, | |
| { | |
| "entropy": 0.7038324676454067, | |
| "epoch": 3.024, | |
| "grad_norm": 3.671875, | |
| "learning_rate": 1.8024444444444445e-05, | |
| "loss": 0.666, | |
| "mean_token_accuracy": 0.79880424849689, | |
| "num_tokens": 36966789.0, | |
| "step": 1890 | |
| }, | |
| { | |
| "entropy": 0.6823407102376222, | |
| "epoch": 3.04, | |
| "grad_norm": 4.1875, | |
| "learning_rate": 1.8002222222222223e-05, | |
| "loss": 0.621, | |
| "mean_token_accuracy": 0.8089036166667938, | |
| "num_tokens": 37156529.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "eval_biology_entropy": 0.7511428904533386, | |
| "eval_biology_loss": 1.4531614780426025, | |
| "eval_biology_mean_token_accuracy": 0.6934510517120361, | |
| "eval_biology_num_tokens": 37156529.0, | |
| "eval_biology_runtime": 38.62, | |
| "eval_biology_samples_per_second": 12.947, | |
| "eval_biology_steps_per_second": 3.237, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "eval_physics_entropy": 0.4265694625377655, | |
| "eval_physics_loss": 0.781083881855011, | |
| "eval_physics_mean_token_accuracy": 0.8090727143287658, | |
| "eval_physics_num_tokens": 37156529.0, | |
| "eval_physics_runtime": 56.8784, | |
| "eval_physics_samples_per_second": 8.791, | |
| "eval_physics_steps_per_second": 2.198, | |
| "step": 1900 | |
| }, | |
| { | |
| "entropy": 0.6869338469579815, | |
| "epoch": 3.056, | |
| "grad_norm": 4.15625, | |
| "learning_rate": 1.798e-05, | |
| "loss": 0.6247, | |
| "mean_token_accuracy": 0.8083337739109993, | |
| "num_tokens": 37351325.0, | |
| "step": 1910 | |
| }, | |
| { | |
| "entropy": 0.6717990661039949, | |
| "epoch": 3.072, | |
| "grad_norm": 4.34375, | |
| "learning_rate": 1.7957777777777778e-05, | |
| "loss": 0.6161, | |
| "mean_token_accuracy": 0.8123877976089716, | |
| "num_tokens": 37541069.0, | |
| "step": 1920 | |
| }, | |
| { | |
| "entropy": 0.6788237554952502, | |
| "epoch": 3.088, | |
| "grad_norm": 4.40625, | |
| "learning_rate": 1.7935555555555556e-05, | |
| "loss": 0.6233, | |
| "mean_token_accuracy": 0.8102868665009737, | |
| "num_tokens": 37733440.0, | |
| "step": 1930 | |
| }, | |
| { | |
| "entropy": 0.6710379313677549, | |
| "epoch": 3.104, | |
| "grad_norm": 3.8125, | |
| "learning_rate": 1.7913333333333337e-05, | |
| "loss": 0.6095, | |
| "mean_token_accuracy": 0.811876380816102, | |
| "num_tokens": 37933661.0, | |
| "step": 1940 | |
| }, | |
| { | |
| "entropy": 0.6801185900345444, | |
| "epoch": 3.12, | |
| "grad_norm": 4.09375, | |
| "learning_rate": 1.789111111111111e-05, | |
| "loss": 0.6255, | |
| "mean_token_accuracy": 0.8087910477072, | |
| "num_tokens": 38126905.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "entropy": 0.6823285862803459, | |
| "epoch": 3.136, | |
| "grad_norm": 4.4375, | |
| "learning_rate": 1.7868888888888892e-05, | |
| "loss": 0.619, | |
| "mean_token_accuracy": 0.8105869103223086, | |
| "num_tokens": 38316042.0, | |
| "step": 1960 | |
| }, | |
| { | |
| "entropy": 0.6686201065778732, | |
| "epoch": 3.152, | |
| "grad_norm": 4.53125, | |
| "learning_rate": 1.7846666666666666e-05, | |
| "loss": 0.6139, | |
| "mean_token_accuracy": 0.811134846508503, | |
| "num_tokens": 38514212.0, | |
| "step": 1970 | |
| }, | |
| { | |
| "entropy": 0.696059150248766, | |
| "epoch": 3.168, | |
| "grad_norm": 4.28125, | |
| "learning_rate": 1.7824444444444447e-05, | |
| "loss": 0.6377, | |
| "mean_token_accuracy": 0.8045505676418543, | |
| "num_tokens": 38708822.0, | |
| "step": 1980 | |
| }, | |
| { | |
| "entropy": 0.6871348209679127, | |
| "epoch": 3.184, | |
| "grad_norm": 3.9375, | |
| "learning_rate": 1.780222222222222e-05, | |
| "loss": 0.632, | |
| "mean_token_accuracy": 0.8063865445554257, | |
| "num_tokens": 38900708.0, | |
| "step": 1990 | |
| }, | |
| { | |
| "entropy": 0.6824637034907937, | |
| "epoch": 3.2, | |
| "grad_norm": 4.1875, | |
| "learning_rate": 1.7780000000000003e-05, | |
| "loss": 0.626, | |
| "mean_token_accuracy": 0.8084224767982959, | |
| "num_tokens": 39102325.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "eval_biology_entropy": 0.7477709612846375, | |
| "eval_biology_loss": 1.473409652709961, | |
| "eval_biology_mean_token_accuracy": 0.6936884145736695, | |
| "eval_biology_num_tokens": 39102325.0, | |
| "eval_biology_runtime": 38.6411, | |
| "eval_biology_samples_per_second": 12.94, | |
| "eval_biology_steps_per_second": 3.235, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "eval_physics_entropy": 0.4129572913646698, | |
| "eval_physics_loss": 0.8035010695457458, | |
| "eval_physics_mean_token_accuracy": 0.8071833181381226, | |
| "eval_physics_num_tokens": 39102325.0, | |
| "eval_physics_runtime": 56.8955, | |
| "eval_physics_samples_per_second": 8.788, | |
| "eval_physics_steps_per_second": 2.197, | |
| "step": 2000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 10000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 16, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.279038413131436e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |