Image-Text-to-Text
Transformers
Safetensors
qwen2_5_vl
llama-factory
full
qwen2.5-vl
stepcount
sft
conversational
text-generation-inference
Instructions to use SI-Lab/StepCount-7B-SFT-1M with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use SI-Lab/StepCount-7B-SFT-1M with Transformers:
# Use a pipeline as a high-level helper from transformers import pipeline pipe = pipeline("image-text-to-text", model="SI-Lab/StepCount-7B-SFT-1M") messages = [ { "role": "user", "content": [ {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"}, {"type": "text", "text": "What animal is on the candy?"} ] }, ] pipe(text=messages)# Load model directly from transformers import AutoProcessor, AutoModelForMultimodalLM processor = AutoProcessor.from_pretrained("SI-Lab/StepCount-7B-SFT-1M") model = AutoModelForMultimodalLM.from_pretrained("SI-Lab/StepCount-7B-SFT-1M") messages = [ { "role": "user", "content": [ {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"}, {"type": "text", "text": "What animal is on the candy?"} ] }, ] inputs = processor.apply_chat_template( messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt", ).to(model.device) outputs = model.generate(**inputs, max_new_tokens=40) print(processor.decode(outputs[0][inputs["input_ids"].shape[-1]:])) - Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use SI-Lab/StepCount-7B-SFT-1M with vLLM:
Install from pip and serve model
# Install vLLM from pip: pip install vllm # Start the vLLM server: vllm serve "SI-Lab/StepCount-7B-SFT-1M" # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:8000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "SI-Lab/StepCount-7B-SFT-1M", "messages": [ { "role": "user", "content": [ { "type": "text", "text": "Describe this image in one sentence." }, { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } } ] } ] }'Use Docker
docker model run hf.co/SI-Lab/StepCount-7B-SFT-1M
- SGLang
How to use SI-Lab/StepCount-7B-SFT-1M with SGLang:
Install from pip and serve model
# Install SGLang from pip: pip install sglang # Start the SGLang server: python3 -m sglang.launch_server \ --model-path "SI-Lab/StepCount-7B-SFT-1M" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "SI-Lab/StepCount-7B-SFT-1M", "messages": [ { "role": "user", "content": [ { "type": "text", "text": "Describe this image in one sentence." }, { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } } ] } ] }'Use Docker images
docker run --gpus all \ --shm-size 32g \ -p 30000:30000 \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HF_TOKEN=<secret>" \ --ipc=host \ lmsysorg/sglang:latest \ python3 -m sglang.launch_server \ --model-path "SI-Lab/StepCount-7B-SFT-1M" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "SI-Lab/StepCount-7B-SFT-1M", "messages": [ { "role": "user", "content": [ { "type": "text", "text": "Describe this image in one sentence." }, { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } } ] } ] }' - Docker Model Runner
How to use SI-Lab/StepCount-7B-SFT-1M with Docker Model Runner:
docker model run hf.co/SI-Lab/StepCount-7B-SFT-1M
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.24257867244234935, | |
| "eval_steps": 238, | |
| "global_step": 476, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.002038476239011339, | |
| "grad_norm": 0.6477837651252902, | |
| "learning_rate": 2.699735382882792e-06, | |
| "loss": 0.5815, | |
| "num_input_tokens_seen": 6936832, | |
| "step": 4, | |
| "train_runtime": 594.7401, | |
| "train_tokens_per_second": 11663.637 | |
| }, | |
| { | |
| "epoch": 0.004076952478022678, | |
| "grad_norm": 0.6182898594326122, | |
| "learning_rate": 2.698559515983203e-06, | |
| "loss": 0.5691, | |
| "num_input_tokens_seen": 14270848, | |
| "step": 8, | |
| "train_runtime": 1203.7054, | |
| "train_tokens_per_second": 11855.764 | |
| }, | |
| { | |
| "epoch": 0.006115428717034017, | |
| "grad_norm": 0.6559666459303339, | |
| "learning_rate": 2.696443816026242e-06, | |
| "loss": 0.5597, | |
| "num_input_tokens_seen": 21129216, | |
| "step": 12, | |
| "train_runtime": 1762.6999, | |
| "train_tokens_per_second": 11986.848 | |
| }, | |
| { | |
| "epoch": 0.008153904956045356, | |
| "grad_norm": 0.6460312307750392, | |
| "learning_rate": 2.693389757477617e-06, | |
| "loss": 0.564, | |
| "num_input_tokens_seen": 28254528, | |
| "step": 16, | |
| "train_runtime": 2338.7069, | |
| "train_tokens_per_second": 12081.261 | |
| }, | |
| { | |
| "epoch": 0.010192381195056695, | |
| "grad_norm": 0.7820470969086647, | |
| "learning_rate": 2.689399468760395e-06, | |
| "loss": 0.5386, | |
| "num_input_tokens_seen": 35623168, | |
| "step": 20, | |
| "train_runtime": 2958.6163, | |
| "train_tokens_per_second": 12040.482 | |
| }, | |
| { | |
| "epoch": 0.012230857434068034, | |
| "grad_norm": 0.7662986210694266, | |
| "learning_rate": 2.6844757307716743e-06, | |
| "loss": 0.5589, | |
| "num_input_tokens_seen": 42908288, | |
| "step": 24, | |
| "train_runtime": 3591.5632, | |
| "train_tokens_per_second": 11946.967 | |
| }, | |
| { | |
| "epoch": 0.014269333673079374, | |
| "grad_norm": 0.7264783091425356, | |
| "learning_rate": 2.678621974944531e-06, | |
| "loss": 0.5349, | |
| "num_input_tokens_seen": 49922816, | |
| "step": 28, | |
| "train_runtime": 4214.8304, | |
| "train_tokens_per_second": 11844.561 | |
| }, | |
| { | |
| "epoch": 0.01630780991209071, | |
| "grad_norm": 0.7961355791031945, | |
| "learning_rate": 2.6718422808565973e-06, | |
| "loss": 0.5432, | |
| "num_input_tokens_seen": 56972736, | |
| "step": 32, | |
| "train_runtime": 4806.2524, | |
| "train_tokens_per_second": 11853.879 | |
| }, | |
| { | |
| "epoch": 0.01834628615110205, | |
| "grad_norm": 0.8138139768273569, | |
| "learning_rate": 2.6641413733869353e-06, | |
| "loss": 0.538, | |
| "num_input_tokens_seen": 64012032, | |
| "step": 36, | |
| "train_runtime": 5375.3791, | |
| "train_tokens_per_second": 11908.375 | |
| }, | |
| { | |
| "epoch": 0.02038476239011339, | |
| "grad_norm": 0.7766770677480328, | |
| "learning_rate": 2.6555246194231957e-06, | |
| "loss": 0.5368, | |
| "num_input_tokens_seen": 71145984, | |
| "step": 40, | |
| "train_runtime": 5983.1229, | |
| "train_tokens_per_second": 11891.112 | |
| }, | |
| { | |
| "epoch": 0.02242323862912473, | |
| "grad_norm": 0.786674093556373, | |
| "learning_rate": 2.6459980241213444e-06, | |
| "loss": 0.5267, | |
| "num_input_tokens_seen": 78403392, | |
| "step": 44, | |
| "train_runtime": 6587.3777, | |
| "train_tokens_per_second": 11902.064 | |
| }, | |
| { | |
| "epoch": 0.02446171486813607, | |
| "grad_norm": 0.895151868173495, | |
| "learning_rate": 2.6355682267205705e-06, | |
| "loss": 0.5334, | |
| "num_input_tokens_seen": 85316672, | |
| "step": 48, | |
| "train_runtime": 7184.1032, | |
| "train_tokens_per_second": 11875.758 | |
| }, | |
| { | |
| "epoch": 0.02650019110714741, | |
| "grad_norm": 0.7705686339787958, | |
| "learning_rate": 2.6242424959162964e-06, | |
| "loss": 0.531, | |
| "num_input_tokens_seen": 92412736, | |
| "step": 52, | |
| "train_runtime": 7797.6138, | |
| "train_tokens_per_second": 11851.412 | |
| }, | |
| { | |
| "epoch": 0.028538667346158748, | |
| "grad_norm": 0.819091795569789, | |
| "learning_rate": 2.612028724794501e-06, | |
| "loss": 0.5213, | |
| "num_input_tokens_seen": 99543616, | |
| "step": 56, | |
| "train_runtime": 8349.8154, | |
| "train_tokens_per_second": 11921.655 | |
| }, | |
| { | |
| "epoch": 0.030577143585170087, | |
| "grad_norm": 0.8450961282572036, | |
| "learning_rate": 2.598935425330904e-06, | |
| "loss": 0.5254, | |
| "num_input_tokens_seen": 106489728, | |
| "step": 60, | |
| "train_runtime": 8960.4265, | |
| "train_tokens_per_second": 11884.449 | |
| }, | |
| { | |
| "epoch": 0.03261561982418142, | |
| "grad_norm": 0.9361937413311264, | |
| "learning_rate": 2.5849717224588284e-06, | |
| "loss": 0.5376, | |
| "num_input_tokens_seen": 113524032, | |
| "step": 64, | |
| "train_runtime": 9543.0991, | |
| "train_tokens_per_second": 11895.929 | |
| }, | |
| { | |
| "epoch": 0.034654096063192766, | |
| "grad_norm": 0.8847320129841505, | |
| "learning_rate": 2.5701473477098874e-06, | |
| "loss": 0.5651, | |
| "num_input_tokens_seen": 120475456, | |
| "step": 68, | |
| "train_runtime": 10086.9254, | |
| "train_tokens_per_second": 11943.724 | |
| }, | |
| { | |
| "epoch": 0.0366925723022041, | |
| "grad_norm": 0.8777390400419234, | |
| "learning_rate": 2.5544726324319225e-06, | |
| "loss": 0.5692, | |
| "num_input_tokens_seen": 127670080, | |
| "step": 72, | |
| "train_runtime": 10665.8117, | |
| "train_tokens_per_second": 11970.029 | |
| }, | |
| { | |
| "epoch": 0.038731048541215445, | |
| "grad_norm": 0.9576210756092853, | |
| "learning_rate": 2.5379585005889178e-06, | |
| "loss": 0.5625, | |
| "num_input_tokens_seen": 134768512, | |
| "step": 76, | |
| "train_runtime": 11283.1943, | |
| "train_tokens_per_second": 11944.181 | |
| }, | |
| { | |
| "epoch": 0.04076952478022678, | |
| "grad_norm": 0.8753248708570508, | |
| "learning_rate": 2.5206164611479122e-06, | |
| "loss": 0.5805, | |
| "num_input_tokens_seen": 141573760, | |
| "step": 80, | |
| "train_runtime": 11827.0251, | |
| "train_tokens_per_second": 11970.361 | |
| }, | |
| { | |
| "epoch": 0.042808001019238116, | |
| "grad_norm": 0.8684000412026938, | |
| "learning_rate": 2.502458600058214e-06, | |
| "loss": 0.5426, | |
| "num_input_tokens_seen": 148565440, | |
| "step": 84, | |
| "train_runtime": 12416.0172, | |
| "train_tokens_per_second": 11965.628 | |
| }, | |
| { | |
| "epoch": 0.04484647725824946, | |
| "grad_norm": 0.7849759417456085, | |
| "learning_rate": 2.4834975718285047e-06, | |
| "loss": 0.5676, | |
| "num_input_tokens_seen": 155693632, | |
| "step": 88, | |
| "train_runtime": 13027.3428, | |
| "train_tokens_per_second": 11951.296 | |
| }, | |
| { | |
| "epoch": 0.046884953497260795, | |
| "grad_norm": 0.8527233643540989, | |
| "learning_rate": 2.463746590707708e-06, | |
| "loss": 0.5706, | |
| "num_input_tokens_seen": 162884544, | |
| "step": 92, | |
| "train_runtime": 13621.8968, | |
| "train_tokens_per_second": 11957.552 | |
| }, | |
| { | |
| "epoch": 0.04892342973627214, | |
| "grad_norm": 0.8245948514984758, | |
| "learning_rate": 2.4432194214757634e-06, | |
| "loss": 0.5601, | |
| "num_input_tokens_seen": 169884480, | |
| "step": 96, | |
| "train_runtime": 14201.6379, | |
| "train_tokens_per_second": 11962.316 | |
| }, | |
| { | |
| "epoch": 0.050961905975283474, | |
| "grad_norm": 0.9463896487710859, | |
| "learning_rate": 2.4219303698507273e-06, | |
| "loss": 0.578, | |
| "num_input_tokens_seen": 177144768, | |
| "step": 100, | |
| "train_runtime": 14812.6402, | |
| "train_tokens_per_second": 11959.027 | |
| }, | |
| { | |
| "epoch": 0.05300038221429482, | |
| "grad_norm": 0.8528440593847427, | |
| "learning_rate": 2.399894272518887e-06, | |
| "loss": 0.5695, | |
| "num_input_tokens_seen": 184355904, | |
| "step": 104, | |
| "train_runtime": 15392.8793, | |
| "train_tokens_per_second": 11976.7 | |
| }, | |
| { | |
| "epoch": 0.05503885845330615, | |
| "grad_norm": 0.8983751041750461, | |
| "learning_rate": 2.3771264867948297e-06, | |
| "loss": 0.5643, | |
| "num_input_tokens_seen": 191461120, | |
| "step": 108, | |
| "train_runtime": 15962.3103, | |
| "train_tokens_per_second": 11994.574 | |
| }, | |
| { | |
| "epoch": 0.057077334692317495, | |
| "grad_norm": 0.9738501561599475, | |
| "learning_rate": 2.353642879918684e-06, | |
| "loss": 0.5611, | |
| "num_input_tokens_seen": 198512640, | |
| "step": 112, | |
| "train_runtime": 16542.875, | |
| "train_tokens_per_second": 11999.888 | |
| }, | |
| { | |
| "epoch": 0.05911581093132883, | |
| "grad_norm": 0.9069372359842974, | |
| "learning_rate": 2.329459817997979e-06, | |
| "loss": 0.555, | |
| "num_input_tokens_seen": 205184640, | |
| "step": 116, | |
| "train_runtime": 17132.5621, | |
| "train_tokens_per_second": 11976.296 | |
| }, | |
| { | |
| "epoch": 0.061154287170340174, | |
| "grad_norm": 0.8535229191703974, | |
| "learning_rate": 2.304594154601839e-06, | |
| "loss": 0.5802, | |
| "num_input_tokens_seen": 212192768, | |
| "step": 120, | |
| "train_runtime": 17695.1167, | |
| "train_tokens_per_second": 11991.6 | |
| }, | |
| { | |
| "epoch": 0.06319276340935151, | |
| "grad_norm": 0.9235254276307827, | |
| "learning_rate": 2.2790632190154588e-06, | |
| "loss": 0.5602, | |
| "num_input_tokens_seen": 219343424, | |
| "step": 124, | |
| "train_runtime": 18276.554, | |
| "train_tokens_per_second": 12001.356 | |
| }, | |
| { | |
| "epoch": 0.06523123964836285, | |
| "grad_norm": 0.8469907739766386, | |
| "learning_rate": 2.2528848041630394e-06, | |
| "loss": 0.5726, | |
| "num_input_tokens_seen": 226825024, | |
| "step": 128, | |
| "train_runtime": 18867.9805, | |
| "train_tokens_per_second": 12021.691 | |
| }, | |
| { | |
| "epoch": 0.06726971588737418, | |
| "grad_norm": 0.863302212361952, | |
| "learning_rate": 2.226077154207613e-06, | |
| "loss": 0.5458, | |
| "num_input_tokens_seen": 233873600, | |
| "step": 132, | |
| "train_runtime": 19428.0179, | |
| "train_tokens_per_second": 12037.955 | |
| }, | |
| { | |
| "epoch": 0.06930819212638553, | |
| "grad_norm": 0.890826252542842, | |
| "learning_rate": 2.1986589518363884e-06, | |
| "loss": 0.5683, | |
| "num_input_tokens_seen": 241046016, | |
| "step": 136, | |
| "train_runtime": 20027.9177, | |
| "train_tokens_per_second": 12035.501 | |
| }, | |
| { | |
| "epoch": 0.07134666836539687, | |
| "grad_norm": 0.8897818157637029, | |
| "learning_rate": 2.17064930524048e-06, | |
| "loss": 0.5753, | |
| "num_input_tokens_seen": 248445952, | |
| "step": 140, | |
| "train_runtime": 20655.1895, | |
| "train_tokens_per_second": 12028.258 | |
| }, | |
| { | |
| "epoch": 0.0733851446044082, | |
| "grad_norm": 0.7970770860102688, | |
| "learning_rate": 2.1420677347981022e-06, | |
| "loss": 0.5492, | |
| "num_input_tokens_seen": 255691072, | |
| "step": 144, | |
| "train_runtime": 21246.8799, | |
| "train_tokens_per_second": 12034.288 | |
| }, | |
| { | |
| "epoch": 0.07542362084341954, | |
| "grad_norm": 1.114229924043429, | |
| "learning_rate": 2.112934159470499e-06, | |
| "loss": 0.5696, | |
| "num_input_tokens_seen": 263081280, | |
| "step": 148, | |
| "train_runtime": 21841.1341, | |
| "train_tokens_per_second": 12045.221 | |
| }, | |
| { | |
| "epoch": 0.07746209708243089, | |
| "grad_norm": 0.8591461034263972, | |
| "learning_rate": 2.083268882920095e-06, | |
| "loss": 0.5765, | |
| "num_input_tokens_seen": 270227584, | |
| "step": 152, | |
| "train_runtime": 22468.8462, | |
| "train_tokens_per_second": 12026.767 | |
| }, | |
| { | |
| "epoch": 0.07950057332144222, | |
| "grad_norm": 0.8676767698372961, | |
| "learning_rate": 2.053092579360543e-06, | |
| "loss": 0.5706, | |
| "num_input_tokens_seen": 277353664, | |
| "step": 156, | |
| "train_runtime": 23053.2043, | |
| "train_tokens_per_second": 12031.024 | |
| }, | |
| { | |
| "epoch": 0.08153904956045356, | |
| "grad_norm": 0.9224293367262005, | |
| "learning_rate": 2.0224262791485315e-06, | |
| "loss": 0.5608, | |
| "num_input_tokens_seen": 284646784, | |
| "step": 160, | |
| "train_runtime": 23643.9972, | |
| "train_tokens_per_second": 12038.86 | |
| }, | |
| { | |
| "epoch": 0.0835775257994649, | |
| "grad_norm": 0.887751023260673, | |
| "learning_rate": 1.991291354127381e-06, | |
| "loss": 0.5636, | |
| "num_input_tokens_seen": 291840192, | |
| "step": 164, | |
| "train_runtime": 24251.2033, | |
| "train_tokens_per_second": 12034.05 | |
| }, | |
| { | |
| "epoch": 0.08561600203847623, | |
| "grad_norm": 0.8971166373496055, | |
| "learning_rate": 1.959709502732666e-06, | |
| "loss": 0.5624, | |
| "num_input_tokens_seen": 298899456, | |
| "step": 168, | |
| "train_runtime": 24847.2173, | |
| "train_tokens_per_second": 12029.494 | |
| }, | |
| { | |
| "epoch": 0.08765447827748758, | |
| "grad_norm": 0.884570439459004, | |
| "learning_rate": 1.927702734870216e-06, | |
| "loss": 0.5802, | |
| "num_input_tokens_seen": 305987520, | |
| "step": 172, | |
| "train_runtime": 25438.1181, | |
| "train_tokens_per_second": 12028.701 | |
| }, | |
| { | |
| "epoch": 0.08969295451649892, | |
| "grad_norm": 0.7905104567457971, | |
| "learning_rate": 1.895293356577058e-06, | |
| "loss": 0.5557, | |
| "num_input_tokens_seen": 312989312, | |
| "step": 176, | |
| "train_runtime": 26002.8183, | |
| "train_tokens_per_second": 12036.746 | |
| }, | |
| { | |
| "epoch": 0.09173143075551025, | |
| "grad_norm": 0.9424718927507695, | |
| "learning_rate": 1.8625039544759767e-06, | |
| "loss": 0.5663, | |
| "num_input_tokens_seen": 320097152, | |
| "step": 180, | |
| "train_runtime": 26586.1368, | |
| "train_tokens_per_second": 12040.002 | |
| }, | |
| { | |
| "epoch": 0.09376990699452159, | |
| "grad_norm": 0.8615504446489718, | |
| "learning_rate": 1.8293573800345261e-06, | |
| "loss": 0.5729, | |
| "num_input_tokens_seen": 327245696, | |
| "step": 184, | |
| "train_runtime": 27215.267, | |
| "train_tokens_per_second": 12024.343 | |
| }, | |
| { | |
| "epoch": 0.09580838323353294, | |
| "grad_norm": 0.9699743298030914, | |
| "learning_rate": 1.7958767336394758e-06, | |
| "loss": 0.5737, | |
| "num_input_tokens_seen": 334634880, | |
| "step": 188, | |
| "train_runtime": 27830.7489, | |
| "train_tokens_per_second": 12023.927 | |
| }, | |
| { | |
| "epoch": 0.09784685947254428, | |
| "grad_norm": 0.8699479610963319, | |
| "learning_rate": 1.7620853484977693e-06, | |
| "loss": 0.5578, | |
| "num_input_tokens_seen": 341804032, | |
| "step": 192, | |
| "train_runtime": 28418.8119, | |
| "train_tokens_per_second": 12027.386 | |
| }, | |
| { | |
| "epoch": 0.09988533571155561, | |
| "grad_norm": 0.9257049516328657, | |
| "learning_rate": 1.7280067743752384e-06, | |
| "loss": 0.5521, | |
| "num_input_tokens_seen": 348671424, | |
| "step": 196, | |
| "train_runtime": 29007.5671, | |
| "train_tokens_per_second": 12020.016 | |
| }, | |
| { | |
| "epoch": 0.10192381195056695, | |
| "grad_norm": 0.9131171296688921, | |
| "learning_rate": 1.6936647611843846e-06, | |
| "loss": 0.5682, | |
| "num_input_tokens_seen": 355675520, | |
| "step": 200, | |
| "train_runtime": 29580.1297, | |
| "train_tokens_per_second": 12024.137 | |
| }, | |
| { | |
| "epoch": 0.10396228818957828, | |
| "grad_norm": 0.8769808506815606, | |
| "learning_rate": 1.659083242432681e-06, | |
| "loss": 0.5584, | |
| "num_input_tokens_seen": 362651648, | |
| "step": 204, | |
| "train_runtime": 30152.9255, | |
| "train_tokens_per_second": 12027.08 | |
| }, | |
| { | |
| "epoch": 0.10600076442858963, | |
| "grad_norm": 0.9497619416730099, | |
| "learning_rate": 1.6242863185429212e-06, | |
| "loss": 0.5879, | |
| "num_input_tokens_seen": 369726720, | |
| "step": 208, | |
| "train_runtime": 30751.1104, | |
| "train_tokens_per_second": 12023.199 | |
| }, | |
| { | |
| "epoch": 0.10803924066760097, | |
| "grad_norm": 0.9230137333974207, | |
| "learning_rate": 1.5892982400572422e-06, | |
| "loss": 0.5681, | |
| "num_input_tokens_seen": 376862016, | |
| "step": 212, | |
| "train_runtime": 31362.1922, | |
| "train_tokens_per_second": 12016.444 | |
| }, | |
| { | |
| "epoch": 0.1100777169066123, | |
| "grad_norm": 0.8995334199668418, | |
| "learning_rate": 1.5541433907365264e-06, | |
| "loss": 0.5455, | |
| "num_input_tokens_seen": 383921152, | |
| "step": 216, | |
| "train_runtime": 31914.7502, | |
| "train_tokens_per_second": 12029.583 | |
| }, | |
| { | |
| "epoch": 0.11211619314562364, | |
| "grad_norm": 0.9043735392416916, | |
| "learning_rate": 1.5188462705669648e-06, | |
| "loss": 0.5641, | |
| "num_input_tokens_seen": 390988416, | |
| "step": 220, | |
| "train_runtime": 32509.882, | |
| "train_tokens_per_second": 12026.756 | |
| }, | |
| { | |
| "epoch": 0.11415466938463499, | |
| "grad_norm": 0.9729636398442392, | |
| "learning_rate": 1.4834314786856161e-06, | |
| "loss": 0.5607, | |
| "num_input_tokens_seen": 398152576, | |
| "step": 224, | |
| "train_runtime": 33085.7673, | |
| "train_tokens_per_second": 12033.953 | |
| }, | |
| { | |
| "epoch": 0.11619314562364633, | |
| "grad_norm": 0.9539597912248131, | |
| "learning_rate": 1.4479236962368684e-06, | |
| "loss": 0.5589, | |
| "num_input_tokens_seen": 405173888, | |
| "step": 228, | |
| "train_runtime": 33682.3182, | |
| "train_tokens_per_second": 12029.276 | |
| }, | |
| { | |
| "epoch": 0.11823162186265766, | |
| "grad_norm": 0.8598081231037438, | |
| "learning_rate": 1.4123476691717487e-06, | |
| "loss": 0.5518, | |
| "num_input_tokens_seen": 412070528, | |
| "step": 232, | |
| "train_runtime": 34252.7917, | |
| "train_tokens_per_second": 12030.276 | |
| }, | |
| { | |
| "epoch": 0.120270098101669, | |
| "grad_norm": 0.8860637699401077, | |
| "learning_rate": 1.376728191002066e-06, | |
| "loss": 0.558, | |
| "num_input_tokens_seen": 419423744, | |
| "step": 236, | |
| "train_runtime": 34844.2912, | |
| "train_tokens_per_second": 12037.086 | |
| }, | |
| { | |
| "epoch": 0.12128933622117467, | |
| "eval_loss": 0.7941220998764038, | |
| "eval_runtime": 213.2901, | |
| "eval_samples_per_second": 4.717, | |
| "eval_steps_per_second": 0.075, | |
| "num_input_tokens_seen": 422911872, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.12230857434068035, | |
| "grad_norm": 0.9547518291189157, | |
| "learning_rate": 1.3410900855214124e-06, | |
| "loss": 0.5732, | |
| "num_input_tokens_seen": 426493248, | |
| "step": 240, | |
| "train_runtime": 35670.5805, | |
| "train_tokens_per_second": 11956.443 | |
| }, | |
| { | |
| "epoch": 0.12434705057969168, | |
| "grad_norm": 0.8725219735566059, | |
| "learning_rate": 1.305458189505055e-06, | |
| "loss": 0.5605, | |
| "num_input_tokens_seen": 433674688, | |
| "step": 244, | |
| "train_runtime": 36270.2542, | |
| "train_tokens_per_second": 11956.759 | |
| }, | |
| { | |
| "epoch": 0.12638552681870302, | |
| "grad_norm": 0.9381950485311329, | |
| "learning_rate": 1.269857335400783e-06, | |
| "loss": 0.5758, | |
| "num_input_tokens_seen": 441063552, | |
| "step": 248, | |
| "train_runtime": 36895.0601, | |
| "train_tokens_per_second": 11954.542 | |
| }, | |
| { | |
| "epoch": 0.12842400305771437, | |
| "grad_norm": 0.8095871776257958, | |
| "learning_rate": 1.2343123340227753e-06, | |
| "loss": 0.5682, | |
| "num_input_tokens_seen": 448549632, | |
| "step": 252, | |
| "train_runtime": 37565.6709, | |
| "train_tokens_per_second": 11940.413 | |
| }, | |
| { | |
| "epoch": 0.1304624792967257, | |
| "grad_norm": 0.8656130470620739, | |
| "learning_rate": 1.1988479572605345e-06, | |
| "loss": 0.5545, | |
| "num_input_tokens_seen": 455502912, | |
| "step": 256, | |
| "train_runtime": 38151.4724, | |
| "train_tokens_per_second": 11939.327 | |
| }, | |
| { | |
| "epoch": 0.13250095553573704, | |
| "grad_norm": 0.9346351864084386, | |
| "learning_rate": 1.1634889208149594e-06, | |
| "loss": 0.5664, | |
| "num_input_tokens_seen": 462435200, | |
| "step": 260, | |
| "train_runtime": 38714.2142, | |
| "train_tokens_per_second": 11944.843 | |
| }, | |
| { | |
| "epoch": 0.13453943177474836, | |
| "grad_norm": 0.899249765870996, | |
| "learning_rate": 1.1282598669735671e-06, | |
| "loss": 0.558, | |
| "num_input_tokens_seen": 469336640, | |
| "step": 264, | |
| "train_runtime": 39254.3889, | |
| "train_tokens_per_second": 11956.284 | |
| }, | |
| { | |
| "epoch": 0.1365779080137597, | |
| "grad_norm": 0.8458171655451426, | |
| "learning_rate": 1.093185347436887e-06, | |
| "loss": 0.5538, | |
| "num_input_tokens_seen": 476310656, | |
| "step": 268, | |
| "train_runtime": 39827.3684, | |
| "train_tokens_per_second": 11959.381 | |
| }, | |
| { | |
| "epoch": 0.13861638425277106, | |
| "grad_norm": 0.9179027627040293, | |
| "learning_rate": 1.058289806207975e-06, | |
| "loss": 0.5698, | |
| "num_input_tokens_seen": 483771840, | |
| "step": 272, | |
| "train_runtime": 40459.3108, | |
| "train_tokens_per_second": 11956.997 | |
| }, | |
| { | |
| "epoch": 0.14065486049178239, | |
| "grad_norm": 0.9442703488926768, | |
| "learning_rate": 1.0235975625569967e-06, | |
| "loss": 0.5649, | |
| "num_input_tokens_seen": 490750336, | |
| "step": 276, | |
| "train_runtime": 41048.4541, | |
| "train_tokens_per_second": 11955.391 | |
| }, | |
| { | |
| "epoch": 0.14269333673079374, | |
| "grad_norm": 0.8974082301624304, | |
| "learning_rate": 9.891327940727266e-07, | |
| "loss": 0.5657, | |
| "num_input_tokens_seen": 497702912, | |
| "step": 280, | |
| "train_runtime": 41610.7519, | |
| "train_tokens_per_second": 11960.921 | |
| }, | |
| { | |
| "epoch": 0.14473181296980506, | |
| "grad_norm": 0.9898213671868435, | |
| "learning_rate": 9.549195198127994e-07, | |
| "loss": 0.58, | |
| "num_input_tokens_seen": 505099584, | |
| "step": 284, | |
| "train_runtime": 42219.4154, | |
| "train_tokens_per_second": 11963.68 | |
| }, | |
| { | |
| "epoch": 0.1467702892088164, | |
| "grad_norm": 0.9363039019647847, | |
| "learning_rate": 9.209815835644328e-07, | |
| "loss": 0.5512, | |
| "num_input_tokens_seen": 512152896, | |
| "step": 288, | |
| "train_runtime": 42766.508, | |
| "train_tokens_per_second": 11975.56 | |
| }, | |
| { | |
| "epoch": 0.14880876544782776, | |
| "grad_norm": 0.9589553041250718, | |
| "learning_rate": 8.873426372273072e-07, | |
| "loss": 0.5747, | |
| "num_input_tokens_seen": 519187584, | |
| "step": 292, | |
| "train_runtime": 43380.0418, | |
| "train_tokens_per_second": 11968.351 | |
| }, | |
| { | |
| "epoch": 0.15084724168683908, | |
| "grad_norm": 0.9433972448251747, | |
| "learning_rate": 8.540261243301721e-07, | |
| "loss": 0.5509, | |
| "num_input_tokens_seen": 526140736, | |
| "step": 296, | |
| "train_runtime": 43995.5053, | |
| "train_tokens_per_second": 11958.966 | |
| }, | |
| { | |
| "epoch": 0.15288571792585043, | |
| "grad_norm": 0.9393953597111547, | |
| "learning_rate": 8.210552636926686e-07, | |
| "loss": 0.5629, | |
| "num_input_tokens_seen": 532992384, | |
| "step": 300, | |
| "train_runtime": 44595.3555, | |
| "train_tokens_per_second": 11951.746 | |
| }, | |
| { | |
| "epoch": 0.15492419416486178, | |
| "grad_norm": 0.9091057919029727, | |
| "learning_rate": 7.884530332437565e-07, | |
| "loss": 0.5663, | |
| "num_input_tokens_seen": 540379392, | |
| "step": 304, | |
| "train_runtime": 45193.3818, | |
| "train_tokens_per_second": 11957.047 | |
| }, | |
| { | |
| "epoch": 0.1569626704038731, | |
| "grad_norm": 0.8832084768576383, | |
| "learning_rate": 7.562421540080231e-07, | |
| "loss": 0.5613, | |
| "num_input_tokens_seen": 547461440, | |
| "step": 308, | |
| "train_runtime": 45756.6672, | |
| "train_tokens_per_second": 11964.627 | |
| }, | |
| { | |
| "epoch": 0.15900114664288445, | |
| "grad_norm": 0.8587618292954599, | |
| "learning_rate": 7.244450742710321e-07, | |
| "loss": 0.561, | |
| "num_input_tokens_seen": 554617984, | |
| "step": 312, | |
| "train_runtime": 46381.4062, | |
| "train_tokens_per_second": 11957.766 | |
| }, | |
| { | |
| "epoch": 0.16103962288189577, | |
| "grad_norm": 0.9446856546867352, | |
| "learning_rate": 6.930839539347442e-07, | |
| "loss": 0.5532, | |
| "num_input_tokens_seen": 561693888, | |
| "step": 316, | |
| "train_runtime": 46994.8669, | |
| "train_tokens_per_second": 11952.239 | |
| }, | |
| { | |
| "epoch": 0.16307809912090712, | |
| "grad_norm": 0.9429683753374553, | |
| "learning_rate": 6.621806490739267e-07, | |
| "loss": 0.5595, | |
| "num_input_tokens_seen": 568833664, | |
| "step": 320, | |
| "train_runtime": 47586.3302, | |
| "train_tokens_per_second": 11953.72 | |
| }, | |
| { | |
| "epoch": 0.16511657535991847, | |
| "grad_norm": 0.9035859891449433, | |
| "learning_rate": 6.317566967042958e-07, | |
| "loss": 0.5504, | |
| "num_input_tokens_seen": 576161408, | |
| "step": 324, | |
| "train_runtime": 48187.0813, | |
| "train_tokens_per_second": 11956.761 | |
| }, | |
| { | |
| "epoch": 0.1671550515989298, | |
| "grad_norm": 0.9033151093018099, | |
| "learning_rate": 6.018332997730213e-07, | |
| "loss": 0.5605, | |
| "num_input_tokens_seen": 583395328, | |
| "step": 328, | |
| "train_runtime": 48788.5572, | |
| "train_tokens_per_second": 11957.626 | |
| }, | |
| { | |
| "epoch": 0.16919352783794114, | |
| "grad_norm": 0.9609542284697502, | |
| "learning_rate": 5.724313123820482e-07, | |
| "loss": 0.5557, | |
| "num_input_tokens_seen": 590636544, | |
| "step": 332, | |
| "train_runtime": 49400.1577, | |
| "train_tokens_per_second": 11956.167 | |
| }, | |
| { | |
| "epoch": 0.17123200407695247, | |
| "grad_norm": 0.8861118611359287, | |
| "learning_rate": 5.435712252545331e-07, | |
| "loss": 0.5608, | |
| "num_input_tokens_seen": 597723776, | |
| "step": 336, | |
| "train_runtime": 50003.6729, | |
| "train_tokens_per_second": 11953.597 | |
| }, | |
| { | |
| "epoch": 0.17327048031596382, | |
| "grad_norm": 0.9761110376120566, | |
| "learning_rate": 5.152731514545266e-07, | |
| "loss": 0.5478, | |
| "num_input_tokens_seen": 604770752, | |
| "step": 340, | |
| "train_runtime": 50553.9712, | |
| "train_tokens_per_second": 11962.873 | |
| }, | |
| { | |
| "epoch": 0.17530895655497516, | |
| "grad_norm": 0.9053070583607986, | |
| "learning_rate": 4.875568123698525e-07, | |
| "loss": 0.55, | |
| "num_input_tokens_seen": 612028800, | |
| "step": 344, | |
| "train_runtime": 51128.5271, | |
| "train_tokens_per_second": 11970.398 | |
| }, | |
| { | |
| "epoch": 0.1773474327939865, | |
| "grad_norm": 0.901496752129611, | |
| "learning_rate": 4.604415239679492e-07, | |
| "loss": 0.5682, | |
| "num_input_tokens_seen": 619137792, | |
| "step": 348, | |
| "train_runtime": 51752.3485, | |
| "train_tokens_per_second": 11963.472 | |
| }, | |
| { | |
| "epoch": 0.17938590903299784, | |
| "grad_norm": 0.9438945381441597, | |
| "learning_rate": 4.3394618333426135e-07, | |
| "loss": 0.5652, | |
| "num_input_tokens_seen": 625854848, | |
| "step": 352, | |
| "train_runtime": 52310.1207, | |
| "train_tokens_per_second": 11964.317 | |
| }, | |
| { | |
| "epoch": 0.1814243852720092, | |
| "grad_norm": 0.9543610627021044, | |
| "learning_rate": 4.080892555025522e-07, | |
| "loss": 0.5581, | |
| "num_input_tokens_seen": 632626752, | |
| "step": 356, | |
| "train_runtime": 52857.0375, | |
| "train_tokens_per_second": 11968.638 | |
| }, | |
| { | |
| "epoch": 0.1834628615110205, | |
| "grad_norm": 0.9118473255578559, | |
| "learning_rate": 3.8288876058632056e-07, | |
| "loss": 0.56, | |
| "num_input_tokens_seen": 639638080, | |
| "step": 360, | |
| "train_runtime": 53443.4994, | |
| "train_tokens_per_second": 11968.492 | |
| }, | |
| { | |
| "epoch": 0.18550133775003186, | |
| "grad_norm": 0.9212497452458377, | |
| "learning_rate": 3.5836226122029165e-07, | |
| "loss": 0.557, | |
| "num_input_tokens_seen": 646705152, | |
| "step": 364, | |
| "train_runtime": 54052.7201, | |
| "train_tokens_per_second": 11964.341 | |
| }, | |
| { | |
| "epoch": 0.18753981398904318, | |
| "grad_norm": 0.9056019341991234, | |
| "learning_rate": 3.34526850320731e-07, | |
| "loss": 0.5602, | |
| "num_input_tokens_seen": 653674880, | |
| "step": 368, | |
| "train_runtime": 54641.6155, | |
| "train_tokens_per_second": 11962.949 | |
| }, | |
| { | |
| "epoch": 0.18957829022805453, | |
| "grad_norm": 1.0036743775927282, | |
| "learning_rate": 3.1139913917311347e-07, | |
| "loss": 0.5787, | |
| "num_input_tokens_seen": 660674240, | |
| "step": 372, | |
| "train_runtime": 55222.9846, | |
| "train_tokens_per_second": 11963.755 | |
| }, | |
| { | |
| "epoch": 0.19161676646706588, | |
| "grad_norm": 0.9225731699510716, | |
| "learning_rate": 2.889952458554475e-07, | |
| "loss": 0.5515, | |
| "num_input_tokens_seen": 667601792, | |
| "step": 376, | |
| "train_runtime": 55825.7562, | |
| "train_tokens_per_second": 11958.67 | |
| }, | |
| { | |
| "epoch": 0.1936552427060772, | |
| "grad_norm": 0.9067845518246624, | |
| "learning_rate": 2.6733078400532475e-07, | |
| "loss": 0.5621, | |
| "num_input_tokens_seen": 674622976, | |
| "step": 380, | |
| "train_runtime": 56384.2096, | |
| "train_tokens_per_second": 11964.75 | |
| }, | |
| { | |
| "epoch": 0.19569371894508855, | |
| "grad_norm": 0.9543732534991831, | |
| "learning_rate": 2.464208519385228e-07, | |
| "loss": 0.5708, | |
| "num_input_tokens_seen": 681708672, | |
| "step": 384, | |
| "train_runtime": 56981.2121, | |
| "train_tokens_per_second": 11963.745 | |
| }, | |
| { | |
| "epoch": 0.19773219518409987, | |
| "grad_norm": 0.8996552227811947, | |
| "learning_rate": 2.2628002212674264e-07, | |
| "loss": 0.5671, | |
| "num_input_tokens_seen": 689109376, | |
| "step": 388, | |
| "train_runtime": 57593.2165, | |
| "train_tokens_per_second": 11965.114 | |
| }, | |
| { | |
| "epoch": 0.19977067142311122, | |
| "grad_norm": 0.955255917559042, | |
| "learning_rate": 2.0692233104181644e-07, | |
| "loss": 0.5449, | |
| "num_input_tokens_seen": 696091456, | |
| "step": 392, | |
| "train_runtime": 58166.8582, | |
| "train_tokens_per_second": 11967.149 | |
| }, | |
| { | |
| "epoch": 0.20180914766212257, | |
| "grad_norm": 0.9040308153396052, | |
| "learning_rate": 1.8836126937346177e-07, | |
| "loss": 0.5618, | |
| "num_input_tokens_seen": 703111168, | |
| "step": 396, | |
| "train_runtime": 58738.4377, | |
| "train_tokens_per_second": 11970.205 | |
| }, | |
| { | |
| "epoch": 0.2038476239011339, | |
| "grad_norm": 0.97944459658204, | |
| "learning_rate": 1.706097726274012e-07, | |
| "loss": 0.5622, | |
| "num_input_tokens_seen": 710207168, | |
| "step": 400, | |
| "train_runtime": 59356.9508, | |
| "train_tokens_per_second": 11965.021 | |
| }, | |
| { | |
| "epoch": 0.20588610014014525, | |
| "grad_norm": 0.9381590971188796, | |
| "learning_rate": 1.5368021211039678e-07, | |
| "loss": 0.5411, | |
| "num_input_tokens_seen": 717098944, | |
| "step": 404, | |
| "train_runtime": 59911.4198, | |
| "train_tokens_per_second": 11969.32 | |
| }, | |
| { | |
| "epoch": 0.20792457637915657, | |
| "grad_norm": 0.8878878520807448, | |
| "learning_rate": 1.3758438630848725e-07, | |
| "loss": 0.5585, | |
| "num_input_tokens_seen": 723993152, | |
| "step": 408, | |
| "train_runtime": 60475.8972, | |
| "train_tokens_per_second": 11971.598 | |
| }, | |
| { | |
| "epoch": 0.20996305261816792, | |
| "grad_norm": 0.8806187323037155, | |
| "learning_rate": 1.2233351266442794e-07, | |
| "loss": 0.5541, | |
| "num_input_tokens_seen": 731057472, | |
| "step": 412, | |
| "train_runtime": 61037.4217, | |
| "train_tokens_per_second": 11977.201 | |
| }, | |
| { | |
| "epoch": 0.21200152885717927, | |
| "grad_norm": 0.9150900155470338, | |
| "learning_rate": 1.0793821976007693e-07, | |
| "loss": 0.566, | |
| "num_input_tokens_seen": 737872960, | |
| "step": 416, | |
| "train_runtime": 61615.4698, | |
| "train_tokens_per_second": 11975.45 | |
| }, | |
| { | |
| "epoch": 0.2140400050961906, | |
| "grad_norm": 0.9022599301148059, | |
| "learning_rate": 9.440853990915897e-08, | |
| "loss": 0.5454, | |
| "num_input_tokens_seen": 744627648, | |
| "step": 420, | |
| "train_runtime": 62161.1667, | |
| "train_tokens_per_second": 11978.984 | |
| }, | |
| { | |
| "epoch": 0.21607848133520194, | |
| "grad_norm": 0.9035906517868586, | |
| "learning_rate": 8.17539021655864e-08, | |
| "loss": 0.5571, | |
| "num_input_tokens_seen": 751584960, | |
| "step": 424, | |
| "train_runtime": 62697.4235, | |
| "train_tokens_per_second": 11987.494 | |
| }, | |
| { | |
| "epoch": 0.2181169575742133, | |
| "grad_norm": 0.8993779939634194, | |
| "learning_rate": 6.99831257521961e-08, | |
| "loss": 0.5444, | |
| "num_input_tokens_seen": 758504960, | |
| "step": 428, | |
| "train_runtime": 63274.1778, | |
| "train_tokens_per_second": 11987.591 | |
| }, | |
| { | |
| "epoch": 0.2201554338132246, | |
| "grad_norm": 0.9210930898367543, | |
| "learning_rate": 5.9104413914490546e-08, | |
| "loss": 0.5625, | |
| "num_input_tokens_seen": 765707712, | |
| "step": 432, | |
| "train_runtime": 63862.7303, | |
| "train_tokens_per_second": 11989.899 | |
| }, | |
| { | |
| "epoch": 0.22219391005223596, | |
| "grad_norm": 0.9280791017512283, | |
| "learning_rate": 4.912534820366224e-08, | |
| "loss": 0.5613, | |
| "num_input_tokens_seen": 772952256, | |
| "step": 436, | |
| "train_runtime": 64479.6745, | |
| "train_tokens_per_second": 11987.533 | |
| }, | |
| { | |
| "epoch": 0.22423238629124728, | |
| "grad_norm": 0.907069920744252, | |
| "learning_rate": 4.005288319288777e-08, | |
| "loss": 0.566, | |
| "num_input_tokens_seen": 780056832, | |
| "step": 440, | |
| "train_runtime": 65078.7995, | |
| "train_tokens_per_second": 11986.343 | |
| }, | |
| { | |
| "epoch": 0.22627086253025863, | |
| "grad_norm": 0.9315099081560159, | |
| "learning_rate": 3.189334163057219e-08, | |
| "loss": 0.5666, | |
| "num_input_tokens_seen": 786978752, | |
| "step": 444, | |
| "train_runtime": 65667.9097, | |
| "train_tokens_per_second": 11984.221 | |
| }, | |
| { | |
| "epoch": 0.22830933876926998, | |
| "grad_norm": 0.9383224396822377, | |
| "learning_rate": 2.4652410033923543e-08, | |
| "loss": 0.5541, | |
| "num_input_tokens_seen": 793887616, | |
| "step": 448, | |
| "train_runtime": 66257.6668, | |
| "train_tokens_per_second": 11981.823 | |
| }, | |
| { | |
| "epoch": 0.2303478150082813, | |
| "grad_norm": 0.8944859848575912, | |
| "learning_rate": 1.8335134725925177e-08, | |
| "loss": 0.5526, | |
| "num_input_tokens_seen": 800645760, | |
| "step": 452, | |
| "train_runtime": 66822.0683, | |
| "train_tokens_per_second": 11981.757 | |
| }, | |
| { | |
| "epoch": 0.23238629124729265, | |
| "grad_norm": 0.9050253510819353, | |
| "learning_rate": 1.2945918318473138e-08, | |
| "loss": 0.5634, | |
| "num_input_tokens_seen": 807760064, | |
| "step": 456, | |
| "train_runtime": 67425.8238, | |
| "train_tokens_per_second": 11979.981 | |
| }, | |
| { | |
| "epoch": 0.23442476748630398, | |
| "grad_norm": 0.9352706962645487, | |
| "learning_rate": 8.488516644122484e-09, | |
| "loss": 0.5618, | |
| "num_input_tokens_seen": 814702080, | |
| "step": 460, | |
| "train_runtime": 67994.9171, | |
| "train_tokens_per_second": 11981.809 | |
| }, | |
| { | |
| "epoch": 0.23646324372531533, | |
| "grad_norm": 0.8645415996226098, | |
| "learning_rate": 4.966036138587982e-09, | |
| "loss": 0.5553, | |
| "num_input_tokens_seen": 821691008, | |
| "step": 464, | |
| "train_runtime": 68583.3469, | |
| "train_tokens_per_second": 11980.911 | |
| }, | |
| { | |
| "epoch": 0.23850171996432667, | |
| "grad_norm": 0.9706971370356248, | |
| "learning_rate": 2.380931675817649e-09, | |
| "loss": 0.5601, | |
| "num_input_tokens_seen": 828902464, | |
| "step": 468, | |
| "train_runtime": 69210.2105, | |
| "train_tokens_per_second": 11976.592 | |
| }, | |
| { | |
| "epoch": 0.240540196203338, | |
| "grad_norm": 0.9160395400629753, | |
| "learning_rate": 7.350048571510504e-10, | |
| "loss": 0.5567, | |
| "num_input_tokens_seen": 835703232, | |
| "step": 472, | |
| "train_runtime": 69768.608, | |
| "train_tokens_per_second": 11978.213 | |
| }, | |
| { | |
| "epoch": 0.24257867244234935, | |
| "grad_norm": 0.839011112693872, | |
| "learning_rate": 2.9402755754737166e-11, | |
| "loss": 0.5519, | |
| "num_input_tokens_seen": 842742656, | |
| "step": 476, | |
| "train_runtime": 70378.9536, | |
| "train_tokens_per_second": 11974.356 | |
| }, | |
| { | |
| "epoch": 0.24257867244234935, | |
| "eval_loss": 0.7951585054397583, | |
| "eval_runtime": 212.309, | |
| "eval_samples_per_second": 4.738, | |
| "eval_steps_per_second": 0.075, | |
| "num_input_tokens_seen": 842742656, | |
| "step": 476 | |
| } | |
| ], | |
| "logging_steps": 4, | |
| "max_steps": 476, | |
| "num_input_tokens_seen": 842742656, | |
| "num_train_epochs": 1, | |
| "save_steps": 238, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4291443279527936.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |