Text Generation
Transformers
TensorBoard
Safetensors
gemma2
Generated from Trainer
trl
sft
alignment-handbook
conversational
text-generation-inference
Instructions to use Gabe-Thomp/lr2.0e-06_data-mix with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use Gabe-Thomp/lr2.0e-06_data-mix with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="Gabe-Thomp/lr2.0e-06_data-mix")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("Gabe-Thomp/lr2.0e-06_data-mix")
model = AutoModelForCausalLM.from_pretrained("Gabe-Thomp/lr2.0e-06_data-mix")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use Gabe-Thomp/lr2.0e-06_data-mix with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "Gabe-Thomp/lr2.0e-06_data-mix"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Gabe-Thomp/lr2.0e-06_data-mix",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker
docker model run hf.co/Gabe-Thomp/lr2.0e-06_data-mix
- SGLang
How to use Gabe-Thomp/lr2.0e-06_data-mix with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "Gabe-Thomp/lr2.0e-06_data-mix" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Gabe-Thomp/lr2.0e-06_data-mix",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "Gabe-Thomp/lr2.0e-06_data-mix" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Gabe-Thomp/lr2.0e-06_data-mix",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
- Docker Model Runner
How to use Gabe-Thomp/lr2.0e-06_data-mix with Docker Model Runner:
docker model run hf.co/Gabe-Thomp/lr2.0e-06_data-mix
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 486, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.030959752321981424, | |
| "grad_norm": 37.88094364827914, | |
| "learning_rate": 1.6326530612244896e-07, | |
| "loss": 2.012, | |
| "mean_token_accuracy": 0.6721480131149292, | |
| "num_tokens": 356347.0, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.06191950464396285, | |
| "grad_norm": 33.96281709708476, | |
| "learning_rate": 3.673469387755102e-07, | |
| "loss": 1.9687, | |
| "mean_token_accuracy": 0.6716048419475555, | |
| "num_tokens": 710036.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.09287925696594428, | |
| "grad_norm": 22.65005612491039, | |
| "learning_rate": 5.714285714285714e-07, | |
| "loss": 1.7907, | |
| "mean_token_accuracy": 0.68106742699941, | |
| "num_tokens": 1066084.0, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.1238390092879257, | |
| "grad_norm": 12.825394787025013, | |
| "learning_rate": 7.755102040816326e-07, | |
| "loss": 1.3465, | |
| "mean_token_accuracy": 0.7263136367003123, | |
| "num_tokens": 1418691.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.15479876160990713, | |
| "grad_norm": 4.82390687349429, | |
| "learning_rate": 9.795918367346939e-07, | |
| "loss": 1.105, | |
| "mean_token_accuracy": 0.7517302592595418, | |
| "num_tokens": 1773596.0, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.18575851393188855, | |
| "grad_norm": 2.138743079455233, | |
| "learning_rate": 1.183673469387755e-06, | |
| "loss": 0.9184, | |
| "mean_token_accuracy": 0.7795046945412953, | |
| "num_tokens": 2129005.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.21671826625386997, | |
| "grad_norm": 1.56843353818363, | |
| "learning_rate": 1.3877551020408162e-06, | |
| "loss": 0.8507, | |
| "mean_token_accuracy": 0.7924608170986176, | |
| "num_tokens": 2484075.0, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.2476780185758514, | |
| "grad_norm": 1.3036043605324696, | |
| "learning_rate": 1.5918367346938775e-06, | |
| "loss": 0.7988, | |
| "mean_token_accuracy": 0.802043096224467, | |
| "num_tokens": 2838384.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.2786377708978328, | |
| "grad_norm": 1.1700385198630212, | |
| "learning_rate": 1.7959183673469386e-06, | |
| "loss": 0.7936, | |
| "mean_token_accuracy": 0.7999271392822266, | |
| "num_tokens": 3192492.0, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.30959752321981426, | |
| "grad_norm": 1.1487890193406949, | |
| "learning_rate": 2e-06, | |
| "loss": 0.7533, | |
| "mean_token_accuracy": 0.8103362222512563, | |
| "num_tokens": 3545097.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.34055727554179566, | |
| "grad_norm": 1.162010071560409, | |
| "learning_rate": 1.9993540481842407e-06, | |
| "loss": 0.8066, | |
| "mean_token_accuracy": 0.7923434535662334, | |
| "num_tokens": 3903788.0, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.3715170278637771, | |
| "grad_norm": 1.2349108637093285, | |
| "learning_rate": 1.9974170272444602e-06, | |
| "loss": 0.7724, | |
| "mean_token_accuracy": 0.8026398738225301, | |
| "num_tokens": 4259188.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.4024767801857585, | |
| "grad_norm": 1.1955598435745196, | |
| "learning_rate": 1.9941914396250445e-06, | |
| "loss": 0.7695, | |
| "mean_token_accuracy": 0.8047589619954427, | |
| "num_tokens": 4613362.0, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.43343653250773995, | |
| "grad_norm": 1.0813062584413962, | |
| "learning_rate": 1.9896814524743527e-06, | |
| "loss": 0.7418, | |
| "mean_token_accuracy": 0.8074904640515645, | |
| "num_tokens": 4968926.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.46439628482972134, | |
| "grad_norm": 1.2490538133285465, | |
| "learning_rate": 1.983892892261163e-06, | |
| "loss": 0.782, | |
| "mean_token_accuracy": 0.7980295320351919, | |
| "num_tokens": 5326662.0, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.4953560371517028, | |
| "grad_norm": 1.1648094821501256, | |
| "learning_rate": 1.9768332372474366e-06, | |
| "loss": 0.781, | |
| "mean_token_accuracy": 0.7981564621130626, | |
| "num_tokens": 5683326.0, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.5263157894736842, | |
| "grad_norm": 1.153658574148309, | |
| "learning_rate": 1.9685116078271223e-06, | |
| "loss": 0.7208, | |
| "mean_token_accuracy": 0.8101318061351777, | |
| "num_tokens": 6037360.0, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.5572755417956656, | |
| "grad_norm": 1.0499709465738478, | |
| "learning_rate": 1.958938754743489e-06, | |
| "loss": 0.7171, | |
| "mean_token_accuracy": 0.8120081921418508, | |
| "num_tokens": 6391493.0, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.5882352941176471, | |
| "grad_norm": 1.1336114333093723, | |
| "learning_rate": 1.9481270452001986e-06, | |
| "loss": 0.7145, | |
| "mean_token_accuracy": 0.8118303279081981, | |
| "num_tokens": 6744880.0, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.6191950464396285, | |
| "grad_norm": 1.1623292661065792, | |
| "learning_rate": 1.9360904468840735e-06, | |
| "loss": 0.7779, | |
| "mean_token_accuracy": 0.7967695931593577, | |
| "num_tokens": 7104675.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.6501547987616099, | |
| "grad_norm": 1.1258828367181777, | |
| "learning_rate": 1.92284450992019e-06, | |
| "loss": 0.6596, | |
| "mean_token_accuracy": 0.8252765933672587, | |
| "num_tokens": 7456185.0, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.6811145510835913, | |
| "grad_norm": 1.0529210280076258, | |
| "learning_rate": 1.9084063467826133e-06, | |
| "loss": 0.7169, | |
| "mean_token_accuracy": 0.807880413532257, | |
| "num_tokens": 7810532.0, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.7120743034055728, | |
| "grad_norm": 0.9981313212919131, | |
| "learning_rate": 1.8927946101867344e-06, | |
| "loss": 0.7207, | |
| "mean_token_accuracy": 0.8063897867997487, | |
| "num_tokens": 8166048.0, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.7430340557275542, | |
| "grad_norm": 1.1009837013665933, | |
| "learning_rate": 1.8760294689917554e-06, | |
| "loss": 0.7037, | |
| "mean_token_accuracy": 0.8074031293392181, | |
| "num_tokens": 8523864.0, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.7739938080495357, | |
| "grad_norm": 1.0150983601738315, | |
| "learning_rate": 1.858132582144469e-06, | |
| "loss": 0.6768, | |
| "mean_token_accuracy": 0.8155300041039785, | |
| "num_tokens": 8875663.0, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.804953560371517, | |
| "grad_norm": 0.9926457956250347, | |
| "learning_rate": 1.8391270706979861e-06, | |
| "loss": 0.6913, | |
| "mean_token_accuracy": 0.8103034933408101, | |
| "num_tokens": 9230905.0, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.8359133126934984, | |
| "grad_norm": 1.0928972897431848, | |
| "learning_rate": 1.819037487941563e-06, | |
| "loss": 0.7202, | |
| "mean_token_accuracy": 0.8045949776967366, | |
| "num_tokens": 9587132.0, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.8668730650154799, | |
| "grad_norm": 1.0669887753572231, | |
| "learning_rate": 1.7978897876801188e-06, | |
| "loss": 0.7363, | |
| "mean_token_accuracy": 0.8004952649275462, | |
| "num_tokens": 9943019.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.8978328173374613, | |
| "grad_norm": 0.9764132937473873, | |
| "learning_rate": 1.7757112907044198e-06, | |
| "loss": 0.7099, | |
| "mean_token_accuracy": 0.8056580940882365, | |
| "num_tokens": 10299363.0, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.9287925696594427, | |
| "grad_norm": 0.9521500960876352, | |
| "learning_rate": 1.7525306494952496e-06, | |
| "loss": 0.6723, | |
| "mean_token_accuracy": 0.8147999107837677, | |
| "num_tokens": 10653205.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.9597523219814241, | |
| "grad_norm": 0.9014288610770413, | |
| "learning_rate": 1.728377811207168e-06, | |
| "loss": 0.6682, | |
| "mean_token_accuracy": 0.8167306999365489, | |
| "num_tokens": 11006084.0, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.9907120743034056, | |
| "grad_norm": 0.9887395976242098, | |
| "learning_rate": 1.7032839789796709e-06, | |
| "loss": 0.6729, | |
| "mean_token_accuracy": 0.812424510717392, | |
| "num_tokens": 11362461.0, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.0185758513931888, | |
| "grad_norm": 1.0123568191837427, | |
| "learning_rate": 1.6772815716257411e-06, | |
| "loss": 0.6889, | |
| "mean_token_accuracy": 0.812437218648416, | |
| "num_tokens": 11684418.0, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 1.0495356037151702, | |
| "grad_norm": 1.2247818299294482, | |
| "learning_rate": 1.6504041817498676e-06, | |
| "loss": 0.6003, | |
| "mean_token_accuracy": 0.8325801193714142, | |
| "num_tokens": 12038916.0, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.0804953560371517, | |
| "grad_norm": 1.1309870189119007, | |
| "learning_rate": 1.622686532349637e-06, | |
| "loss": 0.595, | |
| "mean_token_accuracy": 0.8306204895178477, | |
| "num_tokens": 12392491.0, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 1.111455108359133, | |
| "grad_norm": 1.0261193684060383, | |
| "learning_rate": 1.5941644319569663e-06, | |
| "loss": 0.5745, | |
| "mean_token_accuracy": 0.8364119688669841, | |
| "num_tokens": 12746211.0, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.1424148606811146, | |
| "grad_norm": 1.065208049531987, | |
| "learning_rate": 1.5648747283769316e-06, | |
| "loss": 0.6384, | |
| "mean_token_accuracy": 0.821829471985499, | |
| "num_tokens": 13105142.0, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 1.173374613003096, | |
| "grad_norm": 1.049060212665513, | |
| "learning_rate": 1.5348552610839538e-06, | |
| "loss": 0.583, | |
| "mean_token_accuracy": 0.8339940627415975, | |
| "num_tokens": 13459616.0, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.2043343653250773, | |
| "grad_norm": 1.0646992389936152, | |
| "learning_rate": 1.5041448123368452e-06, | |
| "loss": 0.585, | |
| "mean_token_accuracy": 0.8339759588241578, | |
| "num_tokens": 13813912.0, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 1.2352941176470589, | |
| "grad_norm": 1.0502983258770635, | |
| "learning_rate": 1.4727830570758676e-06, | |
| "loss": 0.6229, | |
| "mean_token_accuracy": 0.8257229665915171, | |
| "num_tokens": 14170920.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.2662538699690402, | |
| "grad_norm": 1.0418470874061867, | |
| "learning_rate": 1.4408105116665333e-06, | |
| "loss": 0.5806, | |
| "mean_token_accuracy": 0.836287780602773, | |
| "num_tokens": 14524500.0, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 1.2972136222910216, | |
| "grad_norm": 1.087294144738772, | |
| "learning_rate": 1.4082684815563658e-06, | |
| "loss": 0.5956, | |
| "mean_token_accuracy": 0.8319136381149292, | |
| "num_tokens": 14880512.0, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.328173374613003, | |
| "grad_norm": 1.101276572209567, | |
| "learning_rate": 1.375199007912241e-06, | |
| "loss": 0.6054, | |
| "mean_token_accuracy": 0.8299936970074971, | |
| "num_tokens": 15237315.0, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 1.3591331269349844, | |
| "grad_norm": 1.010589916838959, | |
| "learning_rate": 1.3416448133072523e-06, | |
| "loss": 0.5881, | |
| "mean_token_accuracy": 0.8364007751146952, | |
| "num_tokens": 15589797.0, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.390092879256966, | |
| "grad_norm": 1.0622699241655902, | |
| "learning_rate": 1.307649246527263e-06, | |
| "loss": 0.5995, | |
| "mean_token_accuracy": 0.8305212179819743, | |
| "num_tokens": 15946015.0, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 1.4210526315789473, | |
| "grad_norm": 1.1594145716237165, | |
| "learning_rate": 1.273256226568451e-06, | |
| "loss": 0.6167, | |
| "mean_token_accuracy": 0.8282975077629089, | |
| "num_tokens": 16300916.0, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.4520123839009287, | |
| "grad_norm": 1.051260396120812, | |
| "learning_rate": 1.2385101858982004e-06, | |
| "loss": 0.6137, | |
| "mean_token_accuracy": 0.8268493433793386, | |
| "num_tokens": 16658826.0, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 1.4829721362229102, | |
| "grad_norm": 1.1511715900211696, | |
| "learning_rate": 1.203456013052634e-06, | |
| "loss": 0.6359, | |
| "mean_token_accuracy": 0.8252548217773438, | |
| "num_tokens": 17016876.0, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.5139318885448918, | |
| "grad_norm": 1.0811656373656746, | |
| "learning_rate": 1.1681389946449502e-06, | |
| "loss": 0.5956, | |
| "mean_token_accuracy": 0.8341775079568227, | |
| "num_tokens": 17371949.0, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 1.5448916408668731, | |
| "grad_norm": 1.0456087268969014, | |
| "learning_rate": 1.132604756859485e-06, | |
| "loss": 0.5821, | |
| "mean_token_accuracy": 0.836975779136022, | |
| "num_tokens": 17724910.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.5758513931888545, | |
| "grad_norm": 1.1464877768864348, | |
| "learning_rate": 1.0968992065070768e-06, | |
| "loss": 0.6304, | |
| "mean_token_accuracy": 0.8253893832365672, | |
| "num_tokens": 18082483.0, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 1.6068111455108358, | |
| "grad_norm": 1.0740762703373092, | |
| "learning_rate": 1.0610684717178905e-06, | |
| "loss": 0.6069, | |
| "mean_token_accuracy": 0.8298774818579356, | |
| "num_tokens": 18438484.0, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.6377708978328174, | |
| "grad_norm": 1.058918476787192, | |
| "learning_rate": 1.0251588423483204e-06, | |
| "loss": 0.5919, | |
| "mean_token_accuracy": 0.831935566663742, | |
| "num_tokens": 18794196.0, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 1.6687306501547987, | |
| "grad_norm": 1.0594158390719453, | |
| "learning_rate": 9.892167101789563e-07, | |
| "loss": 0.5799, | |
| "mean_token_accuracy": 0.8371186554431915, | |
| "num_tokens": 19147172.0, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.6996904024767803, | |
| "grad_norm": 1.0350217488731246, | |
| "learning_rate": 9.532885089808712e-07, | |
| "loss": 0.5707, | |
| "mean_token_accuracy": 0.8371800223986308, | |
| "num_tokens": 19500395.0, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 1.7306501547987616, | |
| "grad_norm": 1.1043252167652924, | |
| "learning_rate": 9.174206545276677e-07, | |
| "loss": 0.5919, | |
| "mean_token_accuracy": 0.833638709783554, | |
| "num_tokens": 19854642.0, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.761609907120743, | |
| "grad_norm": 1.0594957855862892, | |
| "learning_rate": 8.81659484630768e-07, | |
| "loss": 0.6144, | |
| "mean_token_accuracy": 0.8295779307683309, | |
| "num_tokens": 20210201.0, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 1.7925696594427245, | |
| "grad_norm": 1.0655593034762212, | |
| "learning_rate": 8.460511992754299e-07, | |
| "loss": 0.6008, | |
| "mean_token_accuracy": 0.8292633573214213, | |
| "num_tokens": 20564970.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.8235294117647058, | |
| "grad_norm": 1.0925951219291423, | |
| "learning_rate": 8.106418009348156e-07, | |
| "loss": 0.5495, | |
| "mean_token_accuracy": 0.8422843952973683, | |
| "num_tokens": 20916569.0, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 1.8544891640866874, | |
| "grad_norm": 1.066475477830169, | |
| "learning_rate": 7.75477035139231e-07, | |
| "loss": 0.592, | |
| "mean_token_accuracy": 0.8322900295257568, | |
| "num_tokens": 21270517.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.8854489164086687, | |
| "grad_norm": 1.0726398714625474, | |
| "learning_rate": 7.406023313773097e-07, | |
| "loss": 0.5846, | |
| "mean_token_accuracy": 0.833445531129837, | |
| "num_tokens": 21626435.0, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 1.91640866873065, | |
| "grad_norm": 1.0081168980943895, | |
| "learning_rate": 7.060627444054893e-07, | |
| "loss": 0.599, | |
| "mean_token_accuracy": 0.8309976756572723, | |
| "num_tokens": 21981684.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.9473684210526314, | |
| "grad_norm": 1.1440591625457015, | |
| "learning_rate": 6.719028960416098e-07, | |
| "loss": 0.5952, | |
| "mean_token_accuracy": 0.8320066591103872, | |
| "num_tokens": 22335994.0, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 1.978328173374613, | |
| "grad_norm": 1.0647370303589478, | |
| "learning_rate": 6.381669175178248e-07, | |
| "loss": 0.5769, | |
| "mean_token_accuracy": 0.8349888563156128, | |
| "num_tokens": 22690758.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 2.0061919504643964, | |
| "grad_norm": 1.4685059022343072, | |
| "learning_rate": 6.048983924673022e-07, | |
| "loss": 0.6154, | |
| "mean_token_accuracy": 0.8296286706571225, | |
| "num_tokens": 23012996.0, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 2.0371517027863777, | |
| "grad_norm": 1.0756981234282057, | |
| "learning_rate": 5.72140300618369e-07, | |
| "loss": 0.548, | |
| "mean_token_accuracy": 0.845786041021347, | |
| "num_tokens": 23369489.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 2.068111455108359, | |
| "grad_norm": 1.22987487372924, | |
| "learning_rate": 5.399349622688478e-07, | |
| "loss": 0.5536, | |
| "mean_token_accuracy": 0.8429702619711558, | |
| "num_tokens": 23727840.0, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 2.0990712074303404, | |
| "grad_norm": 1.0621757442951574, | |
| "learning_rate": 5.083239836123059e-07, | |
| "loss": 0.5258, | |
| "mean_token_accuracy": 0.8519696414470672, | |
| "num_tokens": 24082355.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 2.130030959752322, | |
| "grad_norm": 1.1425718017062751, | |
| "learning_rate": 4.773482029868656e-07, | |
| "loss": 0.5293, | |
| "mean_token_accuracy": 0.8492769340674082, | |
| "num_tokens": 24438493.0, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 2.1609907120743035, | |
| "grad_norm": 1.1312756633475332, | |
| "learning_rate": 4.4704763811600643e-07, | |
| "loss": 0.5308, | |
| "mean_token_accuracy": 0.8488172392050425, | |
| "num_tokens": 24792322.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 2.191950464396285, | |
| "grad_norm": 1.2842356289182306, | |
| "learning_rate": 4.174614344095213e-07, | |
| "loss": 0.5655, | |
| "mean_token_accuracy": 0.8417594293753307, | |
| "num_tokens": 25151009.0, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 2.222910216718266, | |
| "grad_norm": 1.1702297440144753, | |
| "learning_rate": 3.886278143914219e-07, | |
| "loss": 0.5364, | |
| "mean_token_accuracy": 0.8470952173074087, | |
| "num_tokens": 25507520.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 2.2538699690402475, | |
| "grad_norm": 1.3759095012811366, | |
| "learning_rate": 3.605840283201195e-07, | |
| "loss": 0.5599, | |
| "mean_token_accuracy": 0.8408537685871125, | |
| "num_tokens": 25864215.0, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 2.2848297213622293, | |
| "grad_norm": 1.1193461931741189, | |
| "learning_rate": 3.333663060646813e-07, | |
| "loss": 0.49, | |
| "mean_token_accuracy": 0.859304424126943, | |
| "num_tokens": 26216877.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 2.3157894736842106, | |
| "grad_norm": 1.1648036361258658, | |
| "learning_rate": 3.0700981029933016e-07, | |
| "loss": 0.4931, | |
| "mean_token_accuracy": 0.8578304747740427, | |
| "num_tokens": 26573016.0, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 2.346749226006192, | |
| "grad_norm": 1.1405575574665878, | |
| "learning_rate": 2.8154859107665987e-07, | |
| "loss": 0.4917, | |
| "mean_token_accuracy": 0.8590823928515117, | |
| "num_tokens": 26925170.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 2.3777089783281733, | |
| "grad_norm": 1.1724816632075952, | |
| "learning_rate": 2.5701554183824724e-07, | |
| "loss": 0.5237, | |
| "mean_token_accuracy": 0.8520345091819763, | |
| "num_tokens": 27279422.0, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 2.4086687306501546, | |
| "grad_norm": 1.1357266729474, | |
| "learning_rate": 2.3344235691949476e-07, | |
| "loss": 0.4672, | |
| "mean_token_accuracy": 0.8642761449019114, | |
| "num_tokens": 27630575.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 2.4396284829721364, | |
| "grad_norm": 1.2409266497871647, | |
| "learning_rate": 2.1085949060360653e-07, | |
| "loss": 0.5369, | |
| "mean_token_accuracy": 0.849452143907547, | |
| "num_tokens": 27984763.0, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 2.4705882352941178, | |
| "grad_norm": 1.188481006905719, | |
| "learning_rate": 1.8929611777758525e-07, | |
| "loss": 0.5212, | |
| "mean_token_accuracy": 0.8504625717798869, | |
| "num_tokens": 28339838.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.501547987616099, | |
| "grad_norm": 1.1055493889153472, | |
| "learning_rate": 1.6878009624109312e-07, | |
| "loss": 0.5062, | |
| "mean_token_accuracy": 0.8539404590924581, | |
| "num_tokens": 28694134.0, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 2.5325077399380804, | |
| "grad_norm": 1.0702929024634484, | |
| "learning_rate": 1.493379307168573e-07, | |
| "loss": 0.5392, | |
| "mean_token_accuracy": 0.8472303132216136, | |
| "num_tokens": 29049115.0, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 2.5634674922600618, | |
| "grad_norm": 1.2498891293489098, | |
| "learning_rate": 1.3099473860912325e-07, | |
| "loss": 0.5218, | |
| "mean_token_accuracy": 0.8514606674512227, | |
| "num_tokens": 29404233.0, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 2.594427244582043, | |
| "grad_norm": 1.135112532376098, | |
| "learning_rate": 1.1377421755438832e-07, | |
| "loss": 0.5072, | |
| "mean_token_accuracy": 0.8529640992482503, | |
| "num_tokens": 29759031.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 2.625386996904025, | |
| "grad_norm": 1.228152423413386, | |
| "learning_rate": 9.769861480633979e-08, | |
| "loss": 0.5377, | |
| "mean_token_accuracy": 0.8478512247403462, | |
| "num_tokens": 30116017.0, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 2.656346749226006, | |
| "grad_norm": 1.1448683622452198, | |
| "learning_rate": 8.278869849454717e-08, | |
| "loss": 0.5106, | |
| "mean_token_accuracy": 0.8535682797431946, | |
| "num_tokens": 30469452.0, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 2.6873065015479876, | |
| "grad_norm": 1.1448594360250504, | |
| "learning_rate": 6.906373079403849e-08, | |
| "loss": 0.4842, | |
| "mean_token_accuracy": 0.8605853617191315, | |
| "num_tokens": 30822195.0, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 2.718266253869969, | |
| "grad_norm": 1.1485381285009257, | |
| "learning_rate": 5.6541443040429295e-08, | |
| "loss": 0.5117, | |
| "mean_token_accuracy": 0.8541167537371318, | |
| "num_tokens": 31177700.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 2.7492260061919502, | |
| "grad_norm": 1.1004929101776235, | |
| "learning_rate": 4.523801282274286e-08, | |
| "loss": 0.5198, | |
| "mean_token_accuracy": 0.8509711424509684, | |
| "num_tokens": 31534499.0, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 2.780185758513932, | |
| "grad_norm": 1.235343825874549, | |
| "learning_rate": 3.5168043083526274e-08, | |
| "loss": 0.5151, | |
| "mean_token_accuracy": 0.8518358329931895, | |
| "num_tokens": 31890705.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 2.8111455108359134, | |
| "grad_norm": 1.2048990404018094, | |
| "learning_rate": 2.634454325325497e-08, | |
| "loss": 0.5085, | |
| "mean_token_accuracy": 0.8524168650309245, | |
| "num_tokens": 32246784.0, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 2.8421052631578947, | |
| "grad_norm": 1.1330314224549871, | |
| "learning_rate": 1.877891244340224e-08, | |
| "loss": 0.5141, | |
| "mean_token_accuracy": 0.853009025255839, | |
| "num_tokens": 32601413.0, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 2.873065015479876, | |
| "grad_norm": 1.1251032096069928, | |
| "learning_rate": 1.2480924719885932e-08, | |
| "loss": 0.4948, | |
| "mean_token_accuracy": 0.8585106293360393, | |
| "num_tokens": 32955057.0, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 2.9040247678018574, | |
| "grad_norm": 1.3402807034322533, | |
| "learning_rate": 7.45871647591756e-09, | |
| "loss": 0.5186, | |
| "mean_token_accuracy": 0.8510903239250183, | |
| "num_tokens": 33310606.0, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 2.934984520123839, | |
| "grad_norm": 1.1610151530941533, | |
| "learning_rate": 3.7187759205656864e-09, | |
| "loss": 0.5312, | |
| "mean_token_accuracy": 0.8494451999664306, | |
| "num_tokens": 33664584.0, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 2.9659442724458205, | |
| "grad_norm": 1.2159889934118617, | |
| "learning_rate": 1.2659346966152895e-09, | |
| "loss": 0.5202, | |
| "mean_token_accuracy": 0.8501146256923675, | |
| "num_tokens": 34019473.0, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 2.996904024767802, | |
| "grad_norm": 1.1268577396776651, | |
| "learning_rate": 1.0336163855129143e-10, | |
| "loss": 0.524, | |
| "mean_token_accuracy": 0.8500737905502319, | |
| "num_tokens": 34375925.0, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "mean_token_accuracy": 0.8452582756678263, | |
| "num_tokens": 34411845.0, | |
| "step": 486, | |
| "total_flos": 131252961812480.0, | |
| "train_loss": 0.6672706213998206, | |
| "train_runtime": 11583.057, | |
| "train_samples_per_second": 4.014, | |
| "train_steps_per_second": 0.042 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 486, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 131252961812480.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |