Text Generation
Transformers
Safetensors
qwen3
Generated from Trainer
trl
sft
conversational
text-generation-inference
Instructions to use cs-552-2026-ChatMODS/general_knowledge_model with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use cs-552-2026-ChatMODS/general_knowledge_model with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="cs-552-2026-ChatMODS/general_knowledge_model")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("cs-552-2026-ChatMODS/general_knowledge_model")
model = AutoModelForCausalLM.from_pretrained("cs-552-2026-ChatMODS/general_knowledge_model")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use cs-552-2026-ChatMODS/general_knowledge_model with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "cs-552-2026-ChatMODS/general_knowledge_model"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "cs-552-2026-ChatMODS/general_knowledge_model",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
Use Docker
docker model run hf.co/cs-552-2026-ChatMODS/general_knowledge_model
- SGLang
How to use cs-552-2026-ChatMODS/general_knowledge_model with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "cs-552-2026-ChatMODS/general_knowledge_model" \
  --host 0.0.0.0 \
  --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "cs-552-2026-ChatMODS/general_knowledge_model",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "cs-552-2026-ChatMODS/general_knowledge_model" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "cs-552-2026-ChatMODS/general_knowledge_model",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
- Docker Model Runner
How to use cs-552-2026-ChatMODS/general_knowledge_model with Docker Model Runner:
docker model run hf.co/cs-552-2026-ChatMODS/general_knowledge_model
| { | |
| "best_global_step": 3000, | |
| "best_metric": 0.43371766805648804, | |
| "best_model_checkpoint": "/scratch/gk_checkpoint_lora_v2/checkpoint-3000", | |
| "epoch": 0.9051821679112921, | |
| "eval_steps": 200, | |
| "global_step": 3000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.0372698324173688, | |
| "epoch": 0.015086369465188203, | |
| "grad_norm": 0.4944687783718109, | |
| "learning_rate": 3.266666666666667e-05, | |
| "loss": 2.1209957885742186, | |
| "mean_token_accuracy": 0.6639667785167694, | |
| "num_tokens": 213967.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 0.4767542722821236, | |
| "epoch": 0.030172738930376405, | |
| "grad_norm": 0.29953643679618835, | |
| "learning_rate": 6.6e-05, | |
| "loss": 0.5209395980834961, | |
| "mean_token_accuracy": 0.9028549310564995, | |
| "num_tokens": 426365.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 0.46151453502476214, | |
| "epoch": 0.04525910839556461, | |
| "grad_norm": 0.3118787109851837, | |
| "learning_rate": 9.933333333333334e-05, | |
| "loss": 0.508198356628418, | |
| "mean_token_accuracy": 0.9032981966435909, | |
| "num_tokens": 642837.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 0.4455497920885682, | |
| "epoch": 0.06034547786075281, | |
| "grad_norm": 0.3013271391391754, | |
| "learning_rate": 9.999382532513122e-05, | |
| "loss": 0.48826019287109373, | |
| "mean_token_accuracy": 0.907553653717041, | |
| "num_tokens": 856475.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.06034547786075281, | |
| "eval_entropy": 0.4689600637588114, | |
| "eval_loss": 0.4816349446773529, | |
| "eval_mean_token_accuracy": 0.8987587762934696, | |
| "eval_num_tokens": 856475.0, | |
| "eval_runtime": 37.5206, | |
| "eval_samples_per_second": 57.728, | |
| "eval_steps_per_second": 7.223, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 0.44372758489102127, | |
| "epoch": 0.075431847325941, | |
| "grad_norm": 0.31373220682144165, | |
| "learning_rate": 9.997479627263544e-05, | |
| "loss": 0.4843710327148438, | |
| "mean_token_accuracy": 0.9082412907481193, | |
| "num_tokens": 1069370.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 0.4475720078870654, | |
| "epoch": 0.09051821679112922, | |
| "grad_norm": 0.27380964159965515, | |
| "learning_rate": 9.994291516446573e-05, | |
| "loss": 0.491109733581543, | |
| "mean_token_accuracy": 0.9055162121355533, | |
| "num_tokens": 1286132.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 0.4463552813604474, | |
| "epoch": 0.10560458625631741, | |
| "grad_norm": 0.2614763677120209, | |
| "learning_rate": 9.989819019951048e-05, | |
| "loss": 0.4837772369384766, | |
| "mean_token_accuracy": 0.9081570096313953, | |
| "num_tokens": 1500851.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 0.41013720393180847, | |
| "epoch": 0.12069095572150562, | |
| "grad_norm": 0.4836612045764923, | |
| "learning_rate": 9.984063287972232e-05, | |
| "loss": 0.44385364532470706, | |
| "mean_token_accuracy": 0.9148843766748905, | |
| "num_tokens": 1708807.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.12069095572150562, | |
| "eval_entropy": 0.4811463643044123, | |
| "eval_loss": 0.4747391641139984, | |
| "eval_mean_token_accuracy": 0.8997697566268189, | |
| "eval_num_tokens": 1708807.0, | |
| "eval_runtime": 37.1123, | |
| "eval_samples_per_second": 58.363, | |
| "eval_steps_per_second": 7.302, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 0.42200190499424933, | |
| "epoch": 0.13577732518669383, | |
| "grad_norm": 0.19339531660079956, | |
| "learning_rate": 9.977025800716017e-05, | |
| "loss": 0.45712459564208985, | |
| "mean_token_accuracy": 0.9106362241506577, | |
| "num_tokens": 1923184.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 0.43271509755402804, | |
| "epoch": 0.150863694651882, | |
| "grad_norm": 0.299110472202301, | |
| "learning_rate": 9.968708368018253e-05, | |
| "loss": 0.4724855422973633, | |
| "mean_token_accuracy": 0.9078708891570568, | |
| "num_tokens": 2139609.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 0.43638833791017534, | |
| "epoch": 0.16595006411707022, | |
| "grad_norm": 0.2539425492286682, | |
| "learning_rate": 9.959113128879322e-05, | |
| "loss": 0.4785395050048828, | |
| "mean_token_accuracy": 0.907249256670475, | |
| "num_tokens": 2358080.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 0.43115664307028057, | |
| "epoch": 0.18103643358225843, | |
| "grad_norm": 0.2542003393173218, | |
| "learning_rate": 9.948242550914035e-05, | |
| "loss": 0.4740608215332031, | |
| "mean_token_accuracy": 0.9089943794906139, | |
| "num_tokens": 2574667.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.18103643358225843, | |
| "eval_entropy": 0.45346878287537074, | |
| "eval_loss": 0.46328845620155334, | |
| "eval_mean_token_accuracy": 0.9025413253210568, | |
| "eval_num_tokens": 2574667.0, | |
| "eval_runtime": 37.1254, | |
| "eval_samples_per_second": 58.343, | |
| "eval_steps_per_second": 7.3, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 0.4420048241317272, | |
| "epoch": 0.19612280304744664, | |
| "grad_norm": 0.26832085847854614, | |
| "learning_rate": 9.936099429717045e-05, | |
| "loss": 0.486652717590332, | |
| "mean_token_accuracy": 0.9074076810479164, | |
| "num_tokens": 2790489.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 0.4269510039314628, | |
| "epoch": 0.21120917251263482, | |
| "grad_norm": 0.20635050535202026, | |
| "learning_rate": 9.922686888143897e-05, | |
| "loss": 0.4619187927246094, | |
| "mean_token_accuracy": 0.910810690075159, | |
| "num_tokens": 3003881.0, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 0.42016164746135476, | |
| "epoch": 0.22629554197782303, | |
| "grad_norm": 0.2643264830112457, | |
| "learning_rate": 9.908008375507924e-05, | |
| "loss": 0.46344844818115233, | |
| "mean_token_accuracy": 0.9113752076029777, | |
| "num_tokens": 3218446.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 0.3974369211867452, | |
| "epoch": 0.24138191144301124, | |
| "grad_norm": 0.19392798840999603, | |
| "learning_rate": 9.89206766669318e-05, | |
| "loss": 0.42646697998046873, | |
| "mean_token_accuracy": 0.9165593402087688, | |
| "num_tokens": 3428256.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.24138191144301124, | |
| "eval_entropy": 0.43642714537157784, | |
| "eval_loss": 0.46031010150909424, | |
| "eval_mean_token_accuracy": 0.9025695645061366, | |
| "eval_num_tokens": 3428256.0, | |
| "eval_runtime": 37.1078, | |
| "eval_samples_per_second": 58.37, | |
| "eval_steps_per_second": 7.303, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 0.4217161551490426, | |
| "epoch": 0.25646828090819945, | |
| "grad_norm": 0.21452021598815918, | |
| "learning_rate": 9.874868861183658e-05, | |
| "loss": 0.4612973022460938, | |
| "mean_token_accuracy": 0.9114794608950615, | |
| "num_tokens": 3642529.0, | |
| "step": 850 | |
| }, | |
| { | |
| "entropy": 0.41439461953938006, | |
| "epoch": 0.27155465037338766, | |
| "grad_norm": 0.25079163908958435, | |
| "learning_rate": 9.856416382009006e-05, | |
| "loss": 0.4494070053100586, | |
| "mean_token_accuracy": 0.9127840812504292, | |
| "num_tokens": 3855962.0, | |
| "step": 900 | |
| }, | |
| { | |
| "entropy": 0.4270974922552705, | |
| "epoch": 0.2866410198385759, | |
| "grad_norm": 0.20817860960960388, | |
| "learning_rate": 9.836714974607077e-05, | |
| "loss": 0.46105358123779294, | |
| "mean_token_accuracy": 0.9099104046821594, | |
| "num_tokens": 4069157.0, | |
| "step": 950 | |
| }, | |
| { | |
| "entropy": 0.400859787017107, | |
| "epoch": 0.301727389303764, | |
| "grad_norm": 0.20943191647529602, | |
| "learning_rate": 9.815769705603521e-05, | |
| "loss": 0.4289055633544922, | |
| "mean_token_accuracy": 0.9167061321437359, | |
| "num_tokens": 4278743.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.301727389303764, | |
| "eval_entropy": 0.44312036031946483, | |
| "eval_loss": 0.4571220278739929, | |
| "eval_mean_token_accuracy": 0.9029428505809545, | |
| "eval_num_tokens": 4278743.0, | |
| "eval_runtime": 37.0803, | |
| "eval_samples_per_second": 58.414, | |
| "eval_steps_per_second": 7.308, | |
| "step": 1000 | |
| }, | |
| { | |
| "entropy": 0.4052092955261469, | |
| "epoch": 0.31681375876895224, | |
| "grad_norm": 0.21670734882354736, | |
| "learning_rate": 9.793585961508811e-05, | |
| "loss": 0.44187084197998044, | |
| "mean_token_accuracy": 0.9138142390549183, | |
| "num_tokens": 4495120.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "entropy": 0.3885428298264742, | |
| "epoch": 0.33190012823414045, | |
| "grad_norm": 0.2656344771385193, | |
| "learning_rate": 9.770169447332977e-05, | |
| "loss": 0.42026878356933595, | |
| "mean_token_accuracy": 0.9171342994272709, | |
| "num_tokens": 4707664.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "entropy": 0.41116086438298227, | |
| "epoch": 0.34698649769932866, | |
| "grad_norm": 0.25747165083885193, | |
| "learning_rate": 9.745526185118458e-05, | |
| "loss": 0.44418087005615237, | |
| "mean_token_accuracy": 0.9133055797219276, | |
| "num_tokens": 4921515.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "entropy": 0.40833847373723986, | |
| "epoch": 0.36207286716451687, | |
| "grad_norm": 0.22326916456222534, | |
| "learning_rate": 9.719662512391396e-05, | |
| "loss": 0.4394912338256836, | |
| "mean_token_accuracy": 0.9140990000963211, | |
| "num_tokens": 5133998.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.36207286716451687, | |
| "eval_entropy": 0.44459747828240764, | |
| "eval_loss": 0.4540960192680359, | |
| "eval_mean_token_accuracy": 0.9033675605080664, | |
| "eval_num_tokens": 5133998.0, | |
| "eval_runtime": 37.1014, | |
| "eval_samples_per_second": 58.381, | |
| "eval_steps_per_second": 7.304, | |
| "step": 1200 | |
| }, | |
| { | |
| "entropy": 0.39192866910248997, | |
| "epoch": 0.3771592366297051, | |
| "grad_norm": 0.20642907917499542, | |
| "learning_rate": 9.692585080531822e-05, | |
| "loss": 0.42892616271972656, | |
| "mean_token_accuracy": 0.9165673214197159, | |
| "num_tokens": 5348047.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "entropy": 0.40107650008052587, | |
| "epoch": 0.3922456060948933, | |
| "grad_norm": 0.2515887916088104, | |
| "learning_rate": 9.664300853063104e-05, | |
| "loss": 0.4329941558837891, | |
| "mean_token_accuracy": 0.9152751086652279, | |
| "num_tokens": 5562125.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "entropy": 0.3964537301659584, | |
| "epoch": 0.4073319755600815, | |
| "grad_norm": 0.20992055535316467, | |
| "learning_rate": 9.63481710386114e-05, | |
| "loss": 0.4275414276123047, | |
| "mean_token_accuracy": 0.9168652257323265, | |
| "num_tokens": 5773685.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "entropy": 0.39044841077178716, | |
| "epoch": 0.42241834502526965, | |
| "grad_norm": 0.30388110876083374, | |
| "learning_rate": 9.604141415283728e-05, | |
| "loss": 0.42324817657470704, | |
| "mean_token_accuracy": 0.9169075645506382, | |
| "num_tokens": 5986601.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.42241834502526965, | |
| "eval_entropy": 0.4361799991658693, | |
| "eval_loss": 0.45172417163848877, | |
| "eval_mean_token_accuracy": 0.9033232137725802, | |
| "eval_num_tokens": 5986601.0, | |
| "eval_runtime": 37.1897, | |
| "eval_samples_per_second": 58.242, | |
| "eval_steps_per_second": 7.287, | |
| "step": 1400 | |
| }, | |
| { | |
| "entropy": 0.4137679870799184, | |
| "epoch": 0.43750471449045786, | |
| "grad_norm": 0.23755542933940887, | |
| "learning_rate": 9.572281676220608e-05, | |
| "loss": 0.4478377532958984, | |
| "mean_token_accuracy": 0.911891212016344, | |
| "num_tokens": 6203048.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "entropy": 0.41279098089784383, | |
| "epoch": 0.45259108395564607, | |
| "grad_norm": 0.21780936419963837, | |
| "learning_rate": 9.539246080064659e-05, | |
| "loss": 0.45262195587158205, | |
| "mean_token_accuracy": 0.9123758906126023, | |
| "num_tokens": 6419624.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "entropy": 0.41412957072257994, | |
| "epoch": 0.4676774534208343, | |
| "grad_norm": 0.2430579662322998, | |
| "learning_rate": 9.505043122604818e-05, | |
| "loss": 0.45246307373046873, | |
| "mean_token_accuracy": 0.9122441673278808, | |
| "num_tokens": 6633965.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "entropy": 0.3798889485746622, | |
| "epoch": 0.4827638228860225, | |
| "grad_norm": 0.28653672337532043, | |
| "learning_rate": 9.469681599841192e-05, | |
| "loss": 0.41427810668945314, | |
| "mean_token_accuracy": 0.9184439463913441, | |
| "num_tokens": 6847358.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.4827638228860225, | |
| "eval_entropy": 0.43242653594025826, | |
| "eval_loss": 0.4488651752471924, | |
| "eval_mean_token_accuracy": 0.9040581949082688, | |
| "eval_num_tokens": 6847358.0, | |
| "eval_runtime": 37.1595, | |
| "eval_samples_per_second": 58.289, | |
| "eval_steps_per_second": 7.293, | |
| "step": 1600 | |
| }, | |
| { | |
| "entropy": 0.38126184083521364, | |
| "epoch": 0.4978501923512107, | |
| "grad_norm": 0.2068091183900833, | |
| "learning_rate": 9.433170605722996e-05, | |
| "loss": 0.40500320434570314, | |
| "mean_token_accuracy": 0.9181749866902829, | |
| "num_tokens": 7062005.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "entropy": 0.38879580337554215, | |
| "epoch": 0.5129365618163989, | |
| "grad_norm": 0.19908899068832397, | |
| "learning_rate": 9.395519529809848e-05, | |
| "loss": 0.41957916259765626, | |
| "mean_token_accuracy": 0.9183010324835778, | |
| "num_tokens": 7272082.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "entropy": 0.3867104376107454, | |
| "epoch": 0.5280229312815871, | |
| "grad_norm": 0.21681655943393707, | |
| "learning_rate": 9.356738054857057e-05, | |
| "loss": 0.41496986389160156, | |
| "mean_token_accuracy": 0.9176018598675728, | |
| "num_tokens": 7484751.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "entropy": 0.3900348538905382, | |
| "epoch": 0.5431093007467753, | |
| "grad_norm": 0.23264895379543304, | |
| "learning_rate": 9.316836154325494e-05, | |
| "loss": 0.4201799774169922, | |
| "mean_token_accuracy": 0.9161376728117466, | |
| "num_tokens": 7699385.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.5431093007467753, | |
| "eval_entropy": 0.4371595958941977, | |
| "eval_loss": 0.44555598497390747, | |
| "eval_mean_token_accuracy": 0.9047388078101887, | |
| "eval_num_tokens": 7699385.0, | |
| "eval_runtime": 37.2391, | |
| "eval_samples_per_second": 58.165, | |
| "eval_steps_per_second": 7.277, | |
| "step": 1800 | |
| }, | |
| { | |
| "entropy": 0.38737339399755, | |
| "epoch": 0.5581956702119635, | |
| "grad_norm": 0.21236486732959747, | |
| "learning_rate": 9.275824089816716e-05, | |
| "loss": 0.4186508941650391, | |
| "mean_token_accuracy": 0.9184837466478348, | |
| "num_tokens": 7912846.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "entropy": 0.38791515786200764, | |
| "epoch": 0.5732820396771517, | |
| "grad_norm": 0.22874821722507477, | |
| "learning_rate": 9.233712408433972e-05, | |
| "loss": 0.42144878387451173, | |
| "mean_token_accuracy": 0.9170675221085548, | |
| "num_tokens": 8126645.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "entropy": 0.3831383780390024, | |
| "epoch": 0.5883684091423399, | |
| "grad_norm": 0.3072109818458557, | |
| "learning_rate": 9.190511940069813e-05, | |
| "loss": 0.407428092956543, | |
| "mean_token_accuracy": 0.9184182004630566, | |
| "num_tokens": 8341447.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "entropy": 0.38130090072751044, | |
| "epoch": 0.603454778607528, | |
| "grad_norm": 0.2783527374267578, | |
| "learning_rate": 9.146233794620944e-05, | |
| "loss": 0.41518512725830076, | |
| "mean_token_accuracy": 0.9192077203094959, | |
| "num_tokens": 8553915.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.603454778607528, | |
| "eval_entropy": 0.41359548831557874, | |
| "eval_loss": 0.4426310062408447, | |
| "eval_mean_token_accuracy": 0.9062746316744392, | |
| "eval_num_tokens": 8553915.0, | |
| "eval_runtime": 37.0772, | |
| "eval_samples_per_second": 58.419, | |
| "eval_steps_per_second": 7.309, | |
| "step": 2000 | |
| }, | |
| { | |
| "entropy": 0.40945424281060694, | |
| "epoch": 0.6185411480727163, | |
| "grad_norm": 0.2751815915107727, | |
| "learning_rate": 9.100889359131093e-05, | |
| "loss": 0.44279281616210936, | |
| "mean_token_accuracy": 0.9126340833306312, | |
| "num_tokens": 8773030.0, | |
| "step": 2050 | |
| }, | |
| { | |
| "entropy": 0.37910934548825026, | |
| "epoch": 0.6336275175379045, | |
| "grad_norm": 0.24518635869026184, | |
| "learning_rate": 9.054490294862594e-05, | |
| "loss": 0.41019065856933595, | |
| "mean_token_accuracy": 0.9180504800379277, | |
| "num_tokens": 8987621.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "entropy": 0.38141273133456705, | |
| "epoch": 0.6487138870030927, | |
| "grad_norm": 0.2637041211128235, | |
| "learning_rate": 9.00704853429745e-05, | |
| "loss": 0.41344562530517576, | |
| "mean_token_accuracy": 0.9188940741121769, | |
| "num_tokens": 9202085.0, | |
| "step": 2150 | |
| }, | |
| { | |
| "entropy": 0.37150444712489844, | |
| "epoch": 0.6638002564682809, | |
| "grad_norm": 0.23061881959438324, | |
| "learning_rate": 8.958576278068655e-05, | |
| "loss": 0.4002714157104492, | |
| "mean_token_accuracy": 0.9211013509333134, | |
| "num_tokens": 9414528.0, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.6638002564682809, | |
| "eval_entropy": 0.4198416776652706, | |
| "eval_loss": 0.44056928157806396, | |
| "eval_mean_token_accuracy": 0.9060557997094749, | |
| "eval_num_tokens": 9414528.0, | |
| "eval_runtime": 37.0429, | |
| "eval_samples_per_second": 58.473, | |
| "eval_steps_per_second": 7.316, | |
| "step": 2200 | |
| }, | |
| { | |
| "entropy": 0.3824780482426286, | |
| "epoch": 0.6788866259334692, | |
| "grad_norm": 0.29528528451919556, | |
| "learning_rate": 8.909085991822532e-05, | |
| "loss": 0.4100413513183594, | |
| "mean_token_accuracy": 0.9181285245716572, | |
| "num_tokens": 9631160.0, | |
| "step": 2250 | |
| }, | |
| { | |
| "entropy": 0.37036353170871733, | |
| "epoch": 0.6939729953986573, | |
| "grad_norm": 0.2833597958087921, | |
| "learning_rate": 8.858590403012954e-05, | |
| "loss": 0.39582439422607424, | |
| "mean_token_accuracy": 0.9203065976500511, | |
| "num_tokens": 9844323.0, | |
| "step": 2300 | |
| }, | |
| { | |
| "entropy": 0.377471005320549, | |
| "epoch": 0.7090593648638455, | |
| "grad_norm": 0.2559050917625427, | |
| "learning_rate": 8.807102497628199e-05, | |
| "loss": 0.4039160919189453, | |
| "mean_token_accuracy": 0.9185835334658623, | |
| "num_tokens": 10060066.0, | |
| "step": 2350 | |
| }, | |
| { | |
| "entropy": 0.38689912386238573, | |
| "epoch": 0.7241457343290337, | |
| "grad_norm": 0.3571145236492157, | |
| "learning_rate": 8.754635516851342e-05, | |
| "loss": 0.41998291015625, | |
| "mean_token_accuracy": 0.9171991994976998, | |
| "num_tokens": 10275374.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.7241457343290337, | |
| "eval_entropy": 0.4061841280148038, | |
| "eval_loss": 0.4392658472061157, | |
| "eval_mean_token_accuracy": 0.9060493254573583, | |
| "eval_num_tokens": 10275374.0, | |
| "eval_runtime": 37.185, | |
| "eval_samples_per_second": 58.249, | |
| "eval_steps_per_second": 7.288, | |
| "step": 2400 | |
| }, | |
| { | |
| "entropy": 0.3773344187065959, | |
| "epoch": 0.7392321037942219, | |
| "grad_norm": 0.23827174305915833, | |
| "learning_rate": 8.701202953655006e-05, | |
| "loss": 0.4055968475341797, | |
| "mean_token_accuracy": 0.9189482787251473, | |
| "num_tokens": 10495301.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "entropy": 0.3638977843523026, | |
| "epoch": 0.7543184732594101, | |
| "grad_norm": 0.247745543718338, | |
| "learning_rate": 8.646818549331366e-05, | |
| "loss": 0.38891139984130857, | |
| "mean_token_accuracy": 0.9226090031862259, | |
| "num_tokens": 10706938.0, | |
| "step": 2500 | |
| }, | |
| { | |
| "entropy": 0.358336652033031, | |
| "epoch": 0.7694048427245983, | |
| "grad_norm": 0.24292156100273132, | |
| "learning_rate": 8.591496289958292e-05, | |
| "loss": 0.3846548461914063, | |
| "mean_token_accuracy": 0.923456951379776, | |
| "num_tokens": 10918302.0, | |
| "step": 2550 | |
| }, | |
| { | |
| "entropy": 0.37086400829255584, | |
| "epoch": 0.7844912121897866, | |
| "grad_norm": 0.2979118525981903, | |
| "learning_rate": 8.535250402802536e-05, | |
| "loss": 0.39662261962890627, | |
| "mean_token_accuracy": 0.9212297305464745, | |
| "num_tokens": 11131056.0, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.7844912121897866, | |
| "eval_entropy": 0.4161290250361186, | |
| "eval_loss": 0.43674495816230774, | |
| "eval_mean_token_accuracy": 0.9060781219788583, | |
| "eval_num_tokens": 11131056.0, | |
| "eval_runtime": 37.0488, | |
| "eval_samples_per_second": 58.463, | |
| "eval_steps_per_second": 7.315, | |
| "step": 2600 | |
| }, | |
| { | |
| "entropy": 0.36887906536459925, | |
| "epoch": 0.7995775816549747, | |
| "grad_norm": 0.25673073530197144, | |
| "learning_rate": 8.478095352660897e-05, | |
| "loss": 0.3948686218261719, | |
| "mean_token_accuracy": 0.9204315200448037, | |
| "num_tokens": 11345648.0, | |
| "step": 2650 | |
| }, | |
| { | |
| "entropy": 0.36981521353125574, | |
| "epoch": 0.814663951120163, | |
| "grad_norm": 0.2649747133255005, | |
| "learning_rate": 8.4200458381403e-05, | |
| "loss": 0.3937848663330078, | |
| "mean_token_accuracy": 0.9218536545336247, | |
| "num_tokens": 11559009.0, | |
| "step": 2700 | |
| }, | |
| { | |
| "entropy": 0.37904939975589513, | |
| "epoch": 0.8297503205853511, | |
| "grad_norm": 0.20989011228084564, | |
| "learning_rate": 8.361116787877736e-05, | |
| "loss": 0.4084677505493164, | |
| "mean_token_accuracy": 0.9188165719807148, | |
| "num_tokens": 11776255.0, | |
| "step": 2750 | |
| }, | |
| { | |
| "entropy": 0.3781035339459777, | |
| "epoch": 0.8448366900505393, | |
| "grad_norm": 0.2979874908924103, | |
| "learning_rate": 8.301323356701069e-05, | |
| "loss": 0.40767410278320315, | |
| "mean_token_accuracy": 0.9183482979238033, | |
| "num_tokens": 11994830.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.8448366900505393, | |
| "eval_entropy": 0.3918299580962016, | |
| "eval_loss": 0.43606311082839966, | |
| "eval_mean_token_accuracy": 0.9074829088805786, | |
| "eval_num_tokens": 11994830.0, | |
| "eval_runtime": 37.1502, | |
| "eval_samples_per_second": 58.304, | |
| "eval_steps_per_second": 7.295, | |
| "step": 2800 | |
| }, | |
| { | |
| "entropy": 0.3669764836877584, | |
| "epoch": 0.8599230595157276, | |
| "grad_norm": 0.3718933165073395, | |
| "learning_rate": 8.240680921731639e-05, | |
| "loss": 0.39511192321777344, | |
| "mean_token_accuracy": 0.9215331043303013, | |
| "num_tokens": 12210990.0, | |
| "step": 2850 | |
| }, | |
| { | |
| "entropy": 0.36516126081347466, | |
| "epoch": 0.8750094289809157, | |
| "grad_norm": 0.2584734559059143, | |
| "learning_rate": 8.179205078429728e-05, | |
| "loss": 0.3858111572265625, | |
| "mean_token_accuracy": 0.9223315984010696, | |
| "num_tokens": 12425768.0, | |
| "step": 2900 | |
| }, | |
| { | |
| "entropy": 0.36489626977592704, | |
| "epoch": 0.890095798446104, | |
| "grad_norm": 0.260593980550766, | |
| "learning_rate": 8.116911636583866e-05, | |
| "loss": 0.3904818344116211, | |
| "mean_token_accuracy": 0.921723841279745, | |
| "num_tokens": 12644047.0, | |
| "step": 2950 | |
| }, | |
| { | |
| "entropy": 0.35986222576349974, | |
| "epoch": 0.9051821679112921, | |
| "grad_norm": 0.2872949540615082, | |
| "learning_rate": 8.053816616245007e-05, | |
| "loss": 0.3802699661254883, | |
| "mean_token_accuracy": 0.922919643521309, | |
| "num_tokens": 12858612.0, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.9051821679112921, | |
| "eval_entropy": 0.39114147694128465, | |
| "eval_loss": 0.43371766805648804, | |
| "eval_mean_token_accuracy": 0.9085190791045608, | |
| "eval_num_tokens": 12858612.0, | |
| "eval_runtime": 37.1981, | |
| "eval_samples_per_second": 58.229, | |
| "eval_steps_per_second": 7.285, | |
| "step": 3000 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 9945, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 200, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.3087949758650778e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |