Text Generation
Transformers
Safetensors
qwen3
Generated from Trainer
trl
sft
conversational
text-generation-inference
Instructions to use cs-552-2026-ChatMODS/general_knowledge_model with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use cs-552-2026-ChatMODS/general_knowledge_model with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="cs-552-2026-ChatMODS/general_knowledge_model")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```
```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("cs-552-2026-ChatMODS/general_knowledge_model")
model = AutoModelForCausalLM.from_pretrained("cs-552-2026-ChatMODS/general_knowledge_model")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use cs-552-2026-ChatMODS/general_knowledge_model with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "cs-552-2026-ChatMODS/general_knowledge_model"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "cs-552-2026-ChatMODS/general_knowledge_model",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker
docker model run hf.co/cs-552-2026-ChatMODS/general_knowledge_model
- SGLang
How to use cs-552-2026-ChatMODS/general_knowledge_model with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "cs-552-2026-ChatMODS/general_knowledge_model" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "cs-552-2026-ChatMODS/general_knowledge_model",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "cs-552-2026-ChatMODS/general_knowledge_model" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "cs-552-2026-ChatMODS/general_knowledge_model",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
- Docker Model Runner
How to use cs-552-2026-ChatMODS/general_knowledge_model with Docker Model Runner:
docker model run hf.co/cs-552-2026-ChatMODS/general_knowledge_model
| { | |
| "best_global_step": 3470, | |
| "best_metric": 0.5210279822349548, | |
| "best_model_checkpoint": "/scratch/gk_checkpoint_lora/checkpoint-3470", | |
| "epoch": 2.0, | |
| "eval_steps": 200, | |
| "global_step": 3470, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 0.8664254155755043, | |
| "epoch": 0.02882259691598213, | |
| "grad_norm": 2.775977611541748, | |
| "learning_rate": 9.800000000000001e-06, | |
| "loss": 3.2154922485351562, | |
| "mean_token_accuracy": 0.5048915630578995, | |
| "num_tokens": 270598.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 1.210615372657776, | |
| "epoch": 0.05764519383196426, | |
| "grad_norm": 1.1605831384658813, | |
| "learning_rate": 1.98e-05, | |
| "loss": 2.068811798095703, | |
| "mean_token_accuracy": 0.5692685562372207, | |
| "num_tokens": 543717.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 0.7421895080804825, | |
| "epoch": 0.08646779074794639, | |
| "grad_norm": 0.25959399342536926, | |
| "learning_rate": 1.9989568984484556e-05, | |
| "loss": 0.8849951171875, | |
| "mean_token_accuracy": 0.8145365649461747, | |
| "num_tokens": 813682.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 0.6125312650203705, | |
| "epoch": 0.11529038766392852, | |
| "grad_norm": 0.24997037649154663, | |
| "learning_rate": 1.9957442896851584e-05, | |
| "loss": 0.691104507446289, | |
| "mean_token_accuracy": 0.862597424685955, | |
| "num_tokens": 1085263.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.11529038766392852, | |
| "eval_entropy": 0.5956721862680033, | |
| "eval_loss": 0.6279548406600952, | |
| "eval_mean_token_accuracy": 0.8725619774115713, | |
| "eval_num_tokens": 1085263.0, | |
| "eval_runtime": 25.3455, | |
| "eval_samples_per_second": 59.695, | |
| "eval_steps_per_second": 7.496, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 0.6464120636880398, | |
| "epoch": 0.14411298457991065, | |
| "grad_norm": 0.2776671051979065, | |
| "learning_rate": 1.9903687176430222e-05, | |
| "loss": 0.6937020874023437, | |
| "mean_token_accuracy": 0.8613211107254028, | |
| "num_tokens": 1359193.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 0.6303374738991261, | |
| "epoch": 0.17293558149589278, | |
| "grad_norm": 0.23006124794483185, | |
| "learning_rate": 1.9828418591803025e-05, | |
| "loss": 0.674161376953125, | |
| "mean_token_accuracy": 0.865567267537117, | |
| "num_tokens": 1633225.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 0.6267825645208359, | |
| "epoch": 0.2017581784118749, | |
| "grad_norm": 0.264813095331192, | |
| "learning_rate": 1.973180064195894e-05, | |
| "loss": 0.6597396850585937, | |
| "mean_token_accuracy": 0.8673095554113388, | |
| "num_tokens": 1905535.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 0.6219879929721356, | |
| "epoch": 0.23058077532785703, | |
| "grad_norm": 0.3037996292114258, | |
| "learning_rate": 1.9614043201139513e-05, | |
| "loss": 0.6504788208007812, | |
| "mean_token_accuracy": 0.868158842921257, | |
| "num_tokens": 2177042.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.23058077532785703, | |
| "eval_entropy": 0.5888996227791435, | |
| "eval_loss": 0.5949175357818604, | |
| "eval_mean_token_accuracy": 0.8765835507919914, | |
| "eval_num_tokens": 2177042.0, | |
| "eval_runtime": 25.3953, | |
| "eval_samples_per_second": 59.578, | |
| "eval_steps_per_second": 7.482, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 0.6223398548364639, | |
| "epoch": 0.2594033722438392, | |
| "grad_norm": 0.27440914511680603, | |
| "learning_rate": 1.9475402062948533e-05, | |
| "loss": 0.6479128265380859, | |
| "mean_token_accuracy": 0.8684594085812569, | |
| "num_tokens": 2452073.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 0.610146201401949, | |
| "epoch": 0.2882259691598213, | |
| "grad_norm": 0.3176165521144867, | |
| "learning_rate": 1.9316178384715195e-05, | |
| "loss": 0.6257384109497071, | |
| "mean_token_accuracy": 0.8711106261610985, | |
| "num_tokens": 2724641.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 0.5934953857213259, | |
| "epoch": 0.31704856607580345, | |
| "grad_norm": 0.2879047095775604, | |
| "learning_rate": 1.9136718033317887e-05, | |
| "loss": 0.6003322601318359, | |
| "mean_token_accuracy": 0.8751304519176483, | |
| "num_tokens": 2994313.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 0.5552539291977883, | |
| "epoch": 0.34587116299178555, | |
| "grad_norm": 0.31590884923934937, | |
| "learning_rate": 1.8937410833889517e-05, | |
| "loss": 0.5751915740966796, | |
| "mean_token_accuracy": 0.8799619308114052, | |
| "num_tokens": 3262166.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.34587116299178555, | |
| "eval_entropy": 0.5373091018513629, | |
| "eval_loss": 0.5472979545593262, | |
| "eval_mean_token_accuracy": 0.882329721827256, | |
| "eval_num_tokens": 3262166.0, | |
| "eval_runtime": 25.4147, | |
| "eval_samples_per_second": 59.533, | |
| "eval_steps_per_second": 7.476, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 0.6037677505612373, | |
| "epoch": 0.3746937599077677, | |
| "grad_norm": 0.24785549938678741, | |
| "learning_rate": 1.871868972303645e-05, | |
| "loss": 0.6383477020263671, | |
| "mean_token_accuracy": 0.868063251376152, | |
| "num_tokens": 3540715.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 0.5363423094898462, | |
| "epoch": 0.4035163568237498, | |
| "grad_norm": 0.2744317650794983, | |
| "learning_rate": 1.848102980841029e-05, | |
| "loss": 0.5591300964355469, | |
| "mean_token_accuracy": 0.8841247257590293, | |
| "num_tokens": 3809747.0, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 0.5646945191174746, | |
| "epoch": 0.43233895373973197, | |
| "grad_norm": 0.22471845149993896, | |
| "learning_rate": 1.8224947336675485e-05, | |
| "loss": 0.5917744064331054, | |
| "mean_token_accuracy": 0.8773607212305069, | |
| "num_tokens": 4084262.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 0.5565017917752266, | |
| "epoch": 0.46116155065571407, | |
| "grad_norm": 0.24923403561115265, | |
| "learning_rate": 1.79509985721144e-05, | |
| "loss": 0.581727409362793, | |
| "mean_token_accuracy": 0.8778996297717094, | |
| "num_tokens": 4358564.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.46116155065571407, | |
| "eval_entropy": 0.5298120318274749, | |
| "eval_loss": 0.5324572324752808, | |
| "eval_mean_token_accuracy": 0.8855449488288478, | |
| "eval_num_tokens": 4358564.0, | |
| "eval_runtime": 25.3391, | |
| "eval_samples_per_second": 59.71, | |
| "eval_steps_per_second": 7.498, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 0.5551759076118469, | |
| "epoch": 0.4899841475716962, | |
| "grad_norm": 0.2890514135360718, | |
| "learning_rate": 1.765977858830583e-05, | |
| "loss": 0.5757025909423829, | |
| "mean_token_accuracy": 0.8784776413440705, | |
| "num_tokens": 4631480.0, | |
| "step": 850 | |
| }, | |
| { | |
| "entropy": 0.5454864390939475, | |
| "epoch": 0.5188067444876784, | |
| "grad_norm": 0.23076863586902618, | |
| "learning_rate": 1.735191997550167e-05, | |
| "loss": 0.5680808258056641, | |
| "mean_token_accuracy": 0.88047192633152, | |
| "num_tokens": 4904344.0, | |
| "step": 900 | |
| }, | |
| { | |
| "entropy": 0.5318841424584388, | |
| "epoch": 0.5476293414036605, | |
| "grad_norm": 0.23075300455093384, | |
| "learning_rate": 1.7028091466509602e-05, | |
| "loss": 0.555275650024414, | |
| "mean_token_accuracy": 0.884403744339943, | |
| "num_tokens": 5172046.0, | |
| "step": 950 | |
| }, | |
| { | |
| "entropy": 0.5584682691097259, | |
| "epoch": 0.5764519383196426, | |
| "grad_norm": 0.22157496213912964, | |
| "learning_rate": 1.668899648406662e-05, | |
| "loss": 0.5754995346069336, | |
| "mean_token_accuracy": 0.877709536254406, | |
| "num_tokens": 5445539.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.5764519383196426, | |
| "eval_entropy": 0.5218244160476484, | |
| "eval_loss": 0.530579686164856, | |
| "eval_mean_token_accuracy": 0.8847430721709603, | |
| "eval_num_tokens": 5445539.0, | |
| "eval_runtime": 25.2955, | |
| "eval_samples_per_second": 59.813, | |
| "eval_steps_per_second": 7.511, | |
| "step": 1000 | |
| }, | |
| { | |
| "entropy": 0.5296673697978258, | |
| "epoch": 0.6052745352356247, | |
| "grad_norm": 0.2421996146440506, | |
| "learning_rate": 1.6335371612858827e-05, | |
| "loss": 0.552278709411621, | |
| "mean_token_accuracy": 0.884430148601532, | |
| "num_tokens": 5713428.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "entropy": 0.5427714378386735, | |
| "epoch": 0.6340971321516069, | |
| "grad_norm": 0.22103162109851837, | |
| "learning_rate": 1.5967984999506623e-05, | |
| "loss": 0.5644734954833984, | |
| "mean_token_accuracy": 0.8810071355104446, | |
| "num_tokens": 5985117.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "entropy": 0.5319583508372306, | |
| "epoch": 0.662919729067589, | |
| "grad_norm": 0.25406044721603394, | |
| "learning_rate": 1.558763468399081e-05, | |
| "loss": 0.551100959777832, | |
| "mean_token_accuracy": 0.8851462480425835, | |
| "num_tokens": 6254015.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "entropy": 0.5286855664849281, | |
| "epoch": 0.6917423259835711, | |
| "grad_norm": 0.24316425621509552, | |
| "learning_rate": 1.5195146866144093e-05, | |
| "loss": 0.5527534484863281, | |
| "mean_token_accuracy": 0.8843164274096489, | |
| "num_tokens": 6524236.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.6917423259835711, | |
| "eval_entropy": 0.5235515671341042, | |
| "eval_loss": 0.5270145535469055, | |
| "eval_mean_token_accuracy": 0.8862221178255583, | |
| "eval_num_tokens": 6524236.0, | |
| "eval_runtime": 25.2946, | |
| "eval_samples_per_second": 59.815, | |
| "eval_steps_per_second": 7.511, | |
| "step": 1200 | |
| }, | |
| { | |
| "entropy": 0.5438067949563264, | |
| "epoch": 0.7205649228995532, | |
| "grad_norm": 0.2850089967250824, | |
| "learning_rate": 1.4791374110973555e-05, | |
| "loss": 0.5662718963623047, | |
| "mean_token_accuracy": 0.8811582899093628, | |
| "num_tokens": 6798500.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "entropy": 0.5394456747919321, | |
| "epoch": 0.7493875198155354, | |
| "grad_norm": 0.2898014783859253, | |
| "learning_rate": 1.4377193496712517e-05, | |
| "loss": 0.5550812149047851, | |
| "mean_token_accuracy": 0.8827075427770614, | |
| "num_tokens": 7068572.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "entropy": 0.5455511239916087, | |
| "epoch": 0.7782101167315175, | |
| "grad_norm": 0.22777394950389862, | |
| "learning_rate": 1.395350470962454e-05, | |
| "loss": 0.5698577117919922, | |
| "mean_token_accuracy": 0.8805486962199212, | |
| "num_tokens": 7338828.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "entropy": 0.5357480451464653, | |
| "epoch": 0.8070327136474996, | |
| "grad_norm": 0.22745366394519806, | |
| "learning_rate": 1.3521228089698138e-05, | |
| "loss": 0.555338134765625, | |
| "mean_token_accuracy": 0.8834168764948845, | |
| "num_tokens": 7608634.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.8070327136474996, | |
| "eval_entropy": 0.5221125764282126, | |
| "eval_loss": 0.5261030197143555, | |
| "eval_mean_token_accuracy": 0.8857603835432153, | |
| "eval_num_tokens": 7608634.0, | |
| "eval_runtime": 25.3733, | |
| "eval_samples_per_second": 59.63, | |
| "eval_steps_per_second": 7.488, | |
| "step": 1400 | |
| }, | |
| { | |
| "entropy": 0.5366402574628591, | |
| "epoch": 0.8358553105634817, | |
| "grad_norm": 0.3329956531524658, | |
| "learning_rate": 1.3081302631477272e-05, | |
| "loss": 0.5528204345703125, | |
| "mean_token_accuracy": 0.883393512070179, | |
| "num_tokens": 7876795.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "entropy": 0.5442750995606184, | |
| "epoch": 0.8646779074794639, | |
| "grad_norm": 0.22092807292938232, | |
| "learning_rate": 1.263468394437032e-05, | |
| "loss": 0.5658491897583008, | |
| "mean_token_accuracy": 0.8810335186123848, | |
| "num_tokens": 8149031.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "entropy": 0.5575385902076959, | |
| "epoch": 0.893500504395446, | |
| "grad_norm": 0.22535692155361176, | |
| "learning_rate": 1.218234217686808e-05, | |
| "loss": 0.577353744506836, | |
| "mean_token_accuracy": 0.878656555712223, | |
| "num_tokens": 8420950.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "entropy": 0.5398243299871683, | |
| "epoch": 0.9223231013114281, | |
| "grad_norm": 0.2995174527168274, | |
| "learning_rate": 1.1725259909179875e-05, | |
| "loss": 0.5575567626953125, | |
| "mean_token_accuracy": 0.882205625474453, | |
| "num_tokens": 8690831.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.9223231013114281, | |
| "eval_entropy": 0.5169236369823155, | |
| "eval_loss": 0.5248522162437439, | |
| "eval_mean_token_accuracy": 0.8859441503098137, | |
| "eval_num_tokens": 8690831.0, | |
| "eval_runtime": 25.3236, | |
| "eval_samples_per_second": 59.747, | |
| "eval_steps_per_second": 7.503, | |
| "step": 1600 | |
| }, | |
| { | |
| "entropy": 0.5338192899525166, | |
| "epoch": 0.9511456982274102, | |
| "grad_norm": 0.3139007091522217, | |
| "learning_rate": 1.1264430018865391e-05, | |
| "loss": 0.5552957916259765, | |
| "mean_token_accuracy": 0.8834528475999832, | |
| "num_tokens": 8961415.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "entropy": 0.5275231996178626, | |
| "epoch": 0.9799682951433925, | |
| "grad_norm": 0.2292962670326233, | |
| "learning_rate": 1.0800853524098543e-05, | |
| "loss": 0.5416835021972656, | |
| "mean_token_accuracy": 0.8851530715823174, | |
| "num_tokens": 9229637.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "entropy": 0.5638248626161461, | |
| "epoch": 1.0086467790747946, | |
| "grad_norm": 0.23777392506599426, | |
| "learning_rate": 1.0335537409248204e-05, | |
| "loss": 0.5851130676269531, | |
| "mean_token_accuracy": 0.8776284435286594, | |
| "num_tokens": 9499446.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "entropy": 0.5698649657517671, | |
| "epoch": 1.0374693759907767, | |
| "grad_norm": 0.2397402822971344, | |
| "learning_rate": 9.869492437499167e-06, | |
| "loss": 0.5934230804443359, | |
| "mean_token_accuracy": 0.8746954175829887, | |
| "num_tokens": 9775732.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.0374693759907767, | |
| "eval_entropy": 0.5189189992452923, | |
| "eval_loss": 0.5238012671470642, | |
| "eval_mean_token_accuracy": 0.8859998445761831, | |
| "eval_num_tokens": 9775732.0, | |
| "eval_runtime": 25.2908, | |
| "eval_samples_per_second": 59.824, | |
| "eval_steps_per_second": 7.513, | |
| "step": 1800 | |
| }, | |
| { | |
| "entropy": 0.5613676090538502, | |
| "epoch": 1.0662919729067588, | |
| "grad_norm": 0.2651011645793915, | |
| "learning_rate": 9.403730955264677e-06, | |
| "loss": 0.5792824935913086, | |
| "mean_token_accuracy": 0.8779774031043053, | |
| "num_tokens": 10051416.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "entropy": 0.564930793941021, | |
| "epoch": 1.0951145698227411, | |
| "grad_norm": 0.2934422194957733, | |
| "learning_rate": 8.939264693159926e-06, | |
| "loss": 0.5857321166992188, | |
| "mean_token_accuracy": 0.8772886765003204, | |
| "num_tokens": 10325112.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "entropy": 0.5554238288849592, | |
| "epoch": 1.1239371667387232, | |
| "grad_norm": 0.2599495053291321, | |
| "learning_rate": 8.477102568313138e-06, | |
| "loss": 0.5767181015014649, | |
| "mean_token_accuracy": 0.8785749426484109, | |
| "num_tokens": 10599513.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "entropy": 0.5364625386148691, | |
| "epoch": 1.1527597636547053, | |
| "grad_norm": 0.2258245348930359, | |
| "learning_rate": 8.01824849278814e-06, | |
| "loss": 0.555483169555664, | |
| "mean_token_accuracy": 0.883213449716568, | |
| "num_tokens": 10869586.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.1527597636547053, | |
| "eval_entropy": 0.519798114582112, | |
| "eval_loss": 0.5227712988853455, | |
| "eval_mean_token_accuracy": 0.8866353715720929, | |
| "eval_num_tokens": 10869586.0, | |
| "eval_runtime": 25.2263, | |
| "eval_samples_per_second": 59.977, | |
| "eval_steps_per_second": 7.532, | |
| "step": 2000 | |
| }, | |
| { | |
| "entropy": 0.5413903272151948, | |
| "epoch": 1.1815823605706874, | |
| "grad_norm": 0.2601049542427063, | |
| "learning_rate": 7.5636991928790226e-06, | |
| "loss": 0.5650748443603516, | |
| "mean_token_accuracy": 0.8822926163673401, | |
| "num_tokens": 11142115.0, | |
| "step": 2050 | |
| }, | |
| { | |
| "entropy": 0.5441782039403915, | |
| "epoch": 1.2104049574866695, | |
| "grad_norm": 0.23305267095565796, | |
| "learning_rate": 7.1144420440136945e-06, | |
| "loss": 0.5608541870117187, | |
| "mean_token_accuracy": 0.8806314519047738, | |
| "num_tokens": 11414459.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "entropy": 0.5373694878071547, | |
| "epoch": 1.2392275544026516, | |
| "grad_norm": 0.2710422873497009, | |
| "learning_rate": 6.671452925969549e-06, | |
| "loss": 0.5601076126098633, | |
| "mean_token_accuracy": 0.8817597103118896, | |
| "num_tokens": 11686439.0, | |
| "step": 2150 | |
| }, | |
| { | |
| "entropy": 0.5512064357846975, | |
| "epoch": 1.2680501513186337, | |
| "grad_norm": 0.24875891208648682, | |
| "learning_rate": 6.2356941030600036e-06, | |
| "loss": 0.572663803100586, | |
| "mean_token_accuracy": 0.8795112228393555, | |
| "num_tokens": 11959949.0, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.2680501513186337, | |
| "eval_entropy": 0.5157758243774113, | |
| "eval_loss": 0.5222153663635254, | |
| "eval_mean_token_accuracy": 0.8866087377071381, | |
| "eval_num_tokens": 11959949.0, | |
| "eval_runtime": 25.3435, | |
| "eval_samples_per_second": 59.7, | |
| "eval_steps_per_second": 7.497, | |
| "step": 2200 | |
| }, | |
| { | |
| "entropy": 0.5333986005187035, | |
| "epoch": 1.296872748234616, | |
| "grad_norm": 0.2774944603443146, | |
| "learning_rate": 5.808112133896682e-06, | |
| "loss": 0.5507477569580078, | |
| "mean_token_accuracy": 0.8833928933739662, | |
| "num_tokens": 12230467.0, | |
| "step": 2250 | |
| }, | |
| { | |
| "entropy": 0.5385575620830059, | |
| "epoch": 1.325695345150598, | |
| "grad_norm": 0.24219754338264465, | |
| "learning_rate": 5.38963581526766e-06, | |
| "loss": 0.5618357849121094, | |
| "mean_token_accuracy": 0.8824037438631058, | |
| "num_tokens": 12500631.0, | |
| "step": 2300 | |
| }, | |
| { | |
| "entropy": 0.5562852944433689, | |
| "epoch": 1.3545179420665803, | |
| "grad_norm": 0.23605461418628693, | |
| "learning_rate": 4.981174164598023e-06, | |
| "loss": 0.5740032577514649, | |
| "mean_token_accuracy": 0.8787400788068771, | |
| "num_tokens": 12773793.0, | |
| "step": 2350 | |
| }, | |
| { | |
| "entropy": 0.5452473207563162, | |
| "epoch": 1.3833405389825624, | |
| "grad_norm": 0.23601187765598297, | |
| "learning_rate": 4.5836144453753595e-06, | |
| "loss": 0.5657626724243164, | |
| "mean_token_accuracy": 0.8807224997878075, | |
| "num_tokens": 13046257.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.3833405389825624, | |
| "eval_entropy": 0.5180542894884159, | |
| "eval_loss": 0.5217919945716858, | |
| "eval_mean_token_accuracy": 0.8867926757586629, | |
| "eval_num_tokens": 13046257.0, | |
| "eval_runtime": 25.2941, | |
| "eval_samples_per_second": 59.816, | |
| "eval_steps_per_second": 7.512, | |
| "step": 2400 | |
| }, | |
| { | |
| "entropy": 0.5406669420003891, | |
| "epoch": 1.4121631358985445, | |
| "grad_norm": 0.27414751052856445, | |
| "learning_rate": 4.197820239829295e-06, | |
| "loss": 0.5581526184082031, | |
| "mean_token_accuracy": 0.8821376091241837, | |
| "num_tokens": 13318117.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "entropy": 0.5123971965163946, | |
| "epoch": 1.4409857328145266, | |
| "grad_norm": 0.27002713084220886, | |
| "learning_rate": 3.8246295730516455e-06, | |
| "loss": 0.5292396545410156, | |
| "mean_token_accuracy": 0.8868949916958809, | |
| "num_tokens": 13585112.0, | |
| "step": 2500 | |
| }, | |
| { | |
| "entropy": 0.5250877778977155, | |
| "epoch": 1.4698083297305087, | |
| "grad_norm": 0.2513103187084198, | |
| "learning_rate": 3.4648530926319634e-06, | |
| "loss": 0.5443946075439453, | |
| "mean_token_accuracy": 0.8851349979639054, | |
| "num_tokens": 13854892.0, | |
| "step": 2550 | |
| }, | |
| { | |
| "entropy": 0.5314137779176236, | |
| "epoch": 1.4986309266464908, | |
| "grad_norm": 0.21981389820575714, | |
| "learning_rate": 3.1192723077627163e-06, | |
| "loss": 0.5500518798828125, | |
| "mean_token_accuracy": 0.8838223886489868, | |
| "num_tokens": 14126513.0, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.4986309266464908, | |
| "eval_entropy": 0.5145505380473638, | |
| "eval_loss": 0.5215311050415039, | |
| "eval_mean_token_accuracy": 0.8864174689117231, | |
| "eval_num_tokens": 14126513.0, | |
| "eval_runtime": 25.2961, | |
| "eval_samples_per_second": 59.812, | |
| "eval_steps_per_second": 7.511, | |
| "step": 2600 | |
| }, | |
| { | |
| "entropy": 0.5299251443892717, | |
| "epoch": 1.5274535235624729, | |
| "grad_norm": 0.2680607736110687, | |
| "learning_rate": 2.78863789163911e-06, | |
| "loss": 0.5511317825317383, | |
| "mean_token_accuracy": 0.8838860777020454, | |
| "num_tokens": 14396535.0, | |
| "step": 2650 | |
| }, | |
| { | |
| "entropy": 0.5394382092356682, | |
| "epoch": 1.5562761204784552, | |
| "grad_norm": 0.25587576627731323, | |
| "learning_rate": 2.4736680508410902e-06, | |
| "loss": 0.5560498809814454, | |
| "mean_token_accuracy": 0.8820542943477631, | |
| "num_tokens": 14666012.0, | |
| "step": 2700 | |
| }, | |
| { | |
| "entropy": 0.5438171474635601, | |
| "epoch": 1.585098717394437, | |
| "grad_norm": 0.22950419783592224, | |
| "learning_rate": 2.1750469652395777e-06, | |
| "loss": 0.5646057891845703, | |
| "mean_token_accuracy": 0.8806953200697899, | |
| "num_tokens": 14938072.0, | |
| "step": 2750 | |
| }, | |
| { | |
| "entropy": 0.5157144083827734, | |
| "epoch": 1.6139213143104194, | |
| "grad_norm": 0.23236404359340668, | |
| "learning_rate": 1.8934233018157822e-06, | |
| "loss": 0.5314432525634766, | |
| "mean_token_accuracy": 0.8868013408780098, | |
| "num_tokens": 15206239.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.6139213143104194, | |
| "eval_entropy": 0.514612096783362, | |
| "eval_loss": 0.5212787985801697, | |
| "eval_mean_token_accuracy": 0.8865814996393103, | |
| "eval_num_tokens": 15206239.0, | |
| "eval_runtime": 25.284, | |
| "eval_samples_per_second": 59.84, | |
| "eval_steps_per_second": 7.515, | |
| "step": 2800 | |
| }, | |
| { | |
| "entropy": 0.5607952538132668, | |
| "epoch": 1.6427439112264015, | |
| "grad_norm": 0.27139943838119507, | |
| "learning_rate": 1.6294088056218705e-06, | |
| "loss": 0.5802758026123047, | |
| "mean_token_accuracy": 0.8771729645133018, | |
| "num_tokens": 15479383.0, | |
| "step": 2850 | |
| }, | |
| { | |
| "entropy": 0.5458529234677553, | |
| "epoch": 1.6715665081423836, | |
| "grad_norm": 0.22497807443141937, | |
| "learning_rate": 1.3835769709437307e-06, | |
| "loss": 0.5699198913574218, | |
| "mean_token_accuracy": 0.8804216027259827, | |
| "num_tokens": 15754571.0, | |
| "step": 2900 | |
| }, | |
| { | |
| "entropy": 0.5275375150889158, | |
| "epoch": 1.7003891050583657, | |
| "grad_norm": 0.25096791982650757, | |
| "learning_rate": 1.1564617955523716e-06, | |
| "loss": 0.5493584060668946, | |
| "mean_token_accuracy": 0.8840730246901513, | |
| "num_tokens": 16024591.0, | |
| "step": 2950 | |
| }, | |
| { | |
| "entropy": 0.529434435814619, | |
| "epoch": 1.7292117019743478, | |
| "grad_norm": 0.20803657174110413, | |
| "learning_rate": 9.485566207498986e-07, | |
| "loss": 0.552624740600586, | |
| "mean_token_accuracy": 0.8842183569073677, | |
| "num_tokens": 16291960.0, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.7292117019743478, | |
| "eval_entropy": 0.5173903987595909, | |
| "eval_loss": 0.5210983753204346, | |
| "eval_mean_token_accuracy": 0.8867466380721645, | |
| "eval_num_tokens": 16291960.0, | |
| "eval_runtime": 25.2973, | |
| "eval_samples_per_second": 59.809, | |
| "eval_steps_per_second": 7.511, | |
| "step": 3000 | |
| }, | |
| { | |
| "entropy": 0.5379937102645636, | |
| "epoch": 1.7580342988903301, | |
| "grad_norm": 0.279082715511322, | |
| "learning_rate": 7.603130597298147e-07, | |
| "loss": 0.5518331146240234, | |
| "mean_token_accuracy": 0.8835244515538215, | |
| "num_tokens": 16562192.0, | |
| "step": 3050 | |
| }, | |
| { | |
| "entropy": 0.5540396096557378, | |
| "epoch": 1.786856895806312, | |
| "grad_norm": 0.23533675074577332, | |
| "learning_rate": 5.921400165794255e-07, | |
| "loss": 0.5790340423583984, | |
| "mean_token_accuracy": 0.8788801202178002, | |
| "num_tokens": 16834646.0, | |
| "step": 3100 | |
| }, | |
| { | |
| "entropy": 0.5152763035148382, | |
| "epoch": 1.8156794927222943, | |
| "grad_norm": 0.2803124487400055, | |
| "learning_rate": 4.444027980552901e-07, | |
| "loss": 0.5328571701049805, | |
| "mean_token_accuracy": 0.8869862693548203, | |
| "num_tokens": 17104758.0, | |
| "step": 3150 | |
| }, | |
| { | |
| "entropy": 0.5612848294526338, | |
| "epoch": 1.8445020896382764, | |
| "grad_norm": 0.26171237230300903, | |
| "learning_rate": 3.1742232006111374e-07, | |
| "loss": 0.5824931716918945, | |
| "mean_token_accuracy": 0.8772889456152916, | |
| "num_tokens": 17379954.0, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.8445020896382764, | |
| "eval_entropy": 0.516960418381189, | |
| "eval_loss": 0.5211681723594666, | |
| "eval_mean_token_accuracy": 0.8866549037004772, | |
| "eval_num_tokens": 17379954.0, | |
| "eval_runtime": 25.3046, | |
| "eval_samples_per_second": 59.791, | |
| "eval_steps_per_second": 7.509, | |
| "step": 3200 | |
| }, | |
| { | |
| "entropy": 0.5196366369724273, | |
| "epoch": 1.8733246865542585, | |
| "grad_norm": 0.6400351524353027, | |
| "learning_rate": 2.1147441055180074e-07, | |
| "loss": 0.5364640808105469, | |
| "mean_token_accuracy": 0.8855624732375145, | |
| "num_tokens": 17647943.0, | |
| "step": 3250 | |
| }, | |
| { | |
| "entropy": 0.5125804611295461, | |
| "epoch": 1.9021472834702406, | |
| "grad_norm": 0.24278956651687622, | |
| "learning_rate": 1.2678921037788118e-07, | |
| "loss": 0.5320493316650391, | |
| "mean_token_accuracy": 0.8888507178425789, | |
| "num_tokens": 17914413.0, | |
| "step": 3300 | |
| }, | |
| { | |
| "entropy": 0.5364588350057602, | |
| "epoch": 1.9309698803862227, | |
| "grad_norm": 0.2414465695619583, | |
| "learning_rate": 6.355067337181497e-08, | |
| "loss": 0.5448334121704101, | |
| "mean_token_accuracy": 0.8835894984006881, | |
| "num_tokens": 18184605.0, | |
| "step": 3350 | |
| }, | |
| { | |
| "entropy": 0.5623560689389706, | |
| "epoch": 1.959792477302205, | |
| "grad_norm": 0.31695693731307983, | |
| "learning_rate": 2.189616676208428e-08, | |
| "loss": 0.5812443923950196, | |
| "mean_token_accuracy": 0.877460196018219, | |
| "num_tokens": 18459900.0, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.959792477302205, | |
| "eval_entropy": 0.5174976312016186, | |
| "eval_loss": 0.5211665630340576, | |
| "eval_mean_token_accuracy": 0.8866490850323125, | |
| "eval_num_tokens": 18459900.0, | |
| "eval_runtime": 25.3065, | |
| "eval_samples_per_second": 59.787, | |
| "eval_steps_per_second": 7.508, | |
| "step": 3400 | |
| }, | |
| { | |
| "entropy": 0.5258527837693692, | |
| "epoch": 1.988615074218187, | |
| "grad_norm": 0.2398333102464676, | |
| "learning_rate": 1.916172783061887e-09, | |
| "loss": 0.5451886367797851, | |
| "mean_token_accuracy": 0.8846419337391853, | |
| "num_tokens": 18729680.0, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_entropy": 0.5169560465373491, | |
| "eval_loss": 0.5210279822349548, | |
| "eval_mean_token_accuracy": 0.8867915294672314, | |
| "eval_num_tokens": 18836626.0, | |
| "eval_runtime": 25.327, | |
| "eval_samples_per_second": 59.739, | |
| "eval_steps_per_second": 7.502, | |
| "step": 3470 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 3470, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 200, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.0420402599061914e+17, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |