Text Generation
Transformers
Safetensors
qwen3
Generated from Trainer
sft
trl
conversational
text-generation-inference
Instructions to use codingmonster1234/chess-sft-modelv2 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use codingmonster1234/chess-sft-modelv2 with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="codingmonster1234/chess-sft-modelv2")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("codingmonster1234/chess-sft-modelv2")
model = AutoModelForCausalLM.from_pretrained("codingmonster1234/chess-sft-modelv2")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use codingmonster1234/chess-sft-modelv2 with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "codingmonster1234/chess-sft-modelv2"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "codingmonster1234/chess-sft-modelv2",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker
docker model run hf.co/codingmonster1234/chess-sft-modelv2
- SGLang
How to use codingmonster1234/chess-sft-modelv2 with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "codingmonster1234/chess-sft-modelv2" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "codingmonster1234/chess-sft-modelv2",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
Use Docker images
```bash
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "codingmonster1234/chess-sft-modelv2" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "codingmonster1234/chess-sft-modelv2",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'
```
- Docker Model Runner
How to use codingmonster1234/chess-sft-modelv2 with Docker Model Runner:
docker model run hf.co/codingmonster1234/chess-sft-modelv2
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.0, | |
| "eval_steps": 500, | |
| "global_step": 168, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.0061692222952843, | |
| "epoch": 0.011904761904761904, | |
| "grad_norm": 3.390625, | |
| "learning_rate": 2e-05, | |
| "loss": 2.293125867843628, | |
| "mean_token_accuracy": 0.5738132819533348, | |
| "num_tokens": 29832.0, | |
| "step": 1 | |
| }, | |
| { | |
| "entropy": 1.385195016860962, | |
| "epoch": 0.023809523809523808, | |
| "grad_norm": 0.78125, | |
| "learning_rate": 1.999922292480975e-05, | |
| "loss": 1.5697591304779053, | |
| "mean_token_accuracy": 0.6427712365984917, | |
| "num_tokens": 58835.0, | |
| "step": 2 | |
| }, | |
| { | |
| "entropy": 1.5784537345170975, | |
| "epoch": 0.03571428571428571, | |
| "grad_norm": 0.62890625, | |
| "learning_rate": 1.9996891820008165e-05, | |
| "loss": 1.5061622858047485, | |
| "mean_token_accuracy": 0.654805600643158, | |
| "num_tokens": 88089.0, | |
| "step": 3 | |
| }, | |
| { | |
| "entropy": 1.5019408017396927, | |
| "epoch": 0.047619047619047616, | |
| "grad_norm": 0.43359375, | |
| "learning_rate": 1.9993007047883988e-05, | |
| "loss": 1.3531173467636108, | |
| "mean_token_accuracy": 0.6810621172189713, | |
| "num_tokens": 116996.0, | |
| "step": 4 | |
| }, | |
| { | |
| "entropy": 1.442432388663292, | |
| "epoch": 0.05952380952380952, | |
| "grad_norm": 0.369140625, | |
| "learning_rate": 1.9987569212189224e-05, | |
| "loss": 1.2870382070541382, | |
| "mean_token_accuracy": 0.6946646422147751, | |
| "num_tokens": 146502.0, | |
| "step": 5 | |
| }, | |
| { | |
| "entropy": 1.383298322558403, | |
| "epoch": 0.07142857142857142, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 1.9980579158045322e-05, | |
| "loss": 1.2606914043426514, | |
| "mean_token_accuracy": 0.6914810612797737, | |
| "num_tokens": 175000.0, | |
| "step": 6 | |
| }, | |
| { | |
| "entropy": 1.3554321229457855, | |
| "epoch": 0.08333333333333333, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 1.9972037971811802e-05, | |
| "loss": 1.2325180768966675, | |
| "mean_token_accuracy": 0.6992553323507309, | |
| "num_tokens": 203581.0, | |
| "step": 7 | |
| }, | |
| { | |
| "entropy": 1.301919937133789, | |
| "epoch": 0.09523809523809523, | |
| "grad_norm": 0.3046875, | |
| "learning_rate": 1.9961946980917457e-05, | |
| "loss": 1.1691060066223145, | |
| "mean_token_accuracy": 0.714451938867569, | |
| "num_tokens": 233225.0, | |
| "step": 8 | |
| }, | |
| { | |
| "entropy": 1.3274528235197067, | |
| "epoch": 0.10714285714285714, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 1.9950307753654016e-05, | |
| "loss": 1.22238290309906, | |
| "mean_token_accuracy": 0.6991388499736786, | |
| "num_tokens": 261557.0, | |
| "step": 9 | |
| }, | |
| { | |
| "entropy": 1.3020492941141129, | |
| "epoch": 0.11904761904761904, | |
| "grad_norm": 0.279296875, | |
| "learning_rate": 1.9937122098932428e-05, | |
| "loss": 1.1407413482666016, | |
| "mean_token_accuracy": 0.7115657702088356, | |
| "num_tokens": 290843.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 1.2911820262670517, | |
| "epoch": 0.13095238095238096, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 1.9922392066001724e-05, | |
| "loss": 1.1007871627807617, | |
| "mean_token_accuracy": 0.7251745313405991, | |
| "num_tokens": 320963.0, | |
| "step": 11 | |
| }, | |
| { | |
| "entropy": 1.305821493268013, | |
| "epoch": 0.14285714285714285, | |
| "grad_norm": 0.2890625, | |
| "learning_rate": 1.9906119944130527e-05, | |
| "loss": 1.0885382890701294, | |
| "mean_token_accuracy": 0.7273061871528625, | |
| "num_tokens": 350648.0, | |
| "step": 12 | |
| }, | |
| { | |
| "entropy": 1.3162220120429993, | |
| "epoch": 0.15476190476190477, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 1.9888308262251286e-05, | |
| "loss": 1.0963213443756104, | |
| "mean_token_accuracy": 0.7211973443627357, | |
| "num_tokens": 380096.0, | |
| "step": 13 | |
| }, | |
| { | |
| "entropy": 1.3141592741012573, | |
| "epoch": 0.16666666666666666, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 1.9868959788567213e-05, | |
| "loss": 1.0897754430770874, | |
| "mean_token_accuracy": 0.7258400693535805, | |
| "num_tokens": 407435.0, | |
| "step": 14 | |
| }, | |
| { | |
| "entropy": 1.3073242455720901, | |
| "epoch": 0.17857142857142858, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 1.9848077530122083e-05, | |
| "loss": 1.0494160652160645, | |
| "mean_token_accuracy": 0.7293207123875618, | |
| "num_tokens": 435734.0, | |
| "step": 15 | |
| }, | |
| { | |
| "entropy": 1.3367096036672592, | |
| "epoch": 0.19047619047619047, | |
| "grad_norm": 0.255859375, | |
| "learning_rate": 1.9825664732332886e-05, | |
| "loss": 1.1211317777633667, | |
| "mean_token_accuracy": 0.7143202275037766, | |
| "num_tokens": 464973.0, | |
| "step": 16 | |
| }, | |
| { | |
| "entropy": 1.3097643703222275, | |
| "epoch": 0.20238095238095238, | |
| "grad_norm": 0.25, | |
| "learning_rate": 1.9801724878485438e-05, | |
| "loss": 1.0753662586212158, | |
| "mean_token_accuracy": 0.7259641215205193, | |
| "num_tokens": 493135.0, | |
| "step": 17 | |
| }, | |
| { | |
| "entropy": 1.2622702419757843, | |
| "epoch": 0.21428571428571427, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1.977626168919305e-05, | |
| "loss": 1.007223129272461, | |
| "mean_token_accuracy": 0.744126707315445, | |
| "num_tokens": 522656.0, | |
| "step": 18 | |
| }, | |
| { | |
| "entropy": 1.2859665155410767, | |
| "epoch": 0.2261904761904762, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1.9749279121818235e-05, | |
| "loss": 1.0457340478897095, | |
| "mean_token_accuracy": 0.7328037023544312, | |
| "num_tokens": 551875.0, | |
| "step": 19 | |
| }, | |
| { | |
| "entropy": 1.275212675333023, | |
| "epoch": 0.23809523809523808, | |
| "grad_norm": 0.251953125, | |
| "learning_rate": 1.9720781369857747e-05, | |
| "loss": 1.0395888090133667, | |
| "mean_token_accuracy": 0.7307759299874306, | |
| "num_tokens": 580523.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 1.3000101447105408, | |
| "epoch": 0.25, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1.969077286229078e-05, | |
| "loss": 1.0626932382583618, | |
| "mean_token_accuracy": 0.7271415144205093, | |
| "num_tokens": 609771.0, | |
| "step": 21 | |
| }, | |
| { | |
| "entropy": 1.242678239941597, | |
| "epoch": 0.2619047619047619, | |
| "grad_norm": 0.2470703125, | |
| "learning_rate": 1.9659258262890683e-05, | |
| "loss": 0.9827122092247009, | |
| "mean_token_accuracy": 0.7448626458644867, | |
| "num_tokens": 639104.0, | |
| "step": 22 | |
| }, | |
| { | |
| "entropy": 1.2583424746990204, | |
| "epoch": 0.27380952380952384, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1.962624246950012e-05, | |
| "loss": 1.0062870979309082, | |
| "mean_token_accuracy": 0.7375933676958084, | |
| "num_tokens": 667792.0, | |
| "step": 23 | |
| }, | |
| { | |
| "entropy": 1.2531014680862427, | |
| "epoch": 0.2857142857142857, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1.9591730613269878e-05, | |
| "loss": 1.0229589939117432, | |
| "mean_token_accuracy": 0.7366377785801888, | |
| "num_tokens": 696742.0, | |
| "step": 24 | |
| }, | |
| { | |
| "entropy": 1.2342166602611542, | |
| "epoch": 0.2976190476190476, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1.955572805786141e-05, | |
| "loss": 0.9788997769355774, | |
| "mean_token_accuracy": 0.7421486154198647, | |
| "num_tokens": 725968.0, | |
| "step": 25 | |
| }, | |
| { | |
| "entropy": 1.2210585623979568, | |
| "epoch": 0.30952380952380953, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 1.9518240398613226e-05, | |
| "loss": 0.987277090549469, | |
| "mean_token_accuracy": 0.7420973554253578, | |
| "num_tokens": 755689.0, | |
| "step": 26 | |
| }, | |
| { | |
| "entropy": 1.24309404194355, | |
| "epoch": 0.32142857142857145, | |
| "grad_norm": 0.2421875, | |
| "learning_rate": 1.947927346167132e-05, | |
| "loss": 1.0301053524017334, | |
| "mean_token_accuracy": 0.7300752699375153, | |
| "num_tokens": 784977.0, | |
| "step": 27 | |
| }, | |
| { | |
| "entropy": 1.2028213143348694, | |
| "epoch": 0.3333333333333333, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1.9438833303083677e-05, | |
| "loss": 0.9393562078475952, | |
| "mean_token_accuracy": 0.7491495907306671, | |
| "num_tokens": 814048.0, | |
| "step": 28 | |
| }, | |
| { | |
| "entropy": 1.2287103980779648, | |
| "epoch": 0.34523809523809523, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1.9396926207859085e-05, | |
| "loss": 1.0168366432189941, | |
| "mean_token_accuracy": 0.7329602986574173, | |
| "num_tokens": 843602.0, | |
| "step": 29 | |
| }, | |
| { | |
| "entropy": 1.2081626951694489, | |
| "epoch": 0.35714285714285715, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1.935355868899034e-05, | |
| "loss": 0.958310604095459, | |
| "mean_token_accuracy": 0.7456908002495766, | |
| "num_tokens": 871915.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 1.2221457809209824, | |
| "epoch": 0.36904761904761907, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1.9308737486442045e-05, | |
| "loss": 0.9946644902229309, | |
| "mean_token_accuracy": 0.7383344992995262, | |
| "num_tokens": 900851.0, | |
| "step": 31 | |
| }, | |
| { | |
| "entropy": 1.1801428943872452, | |
| "epoch": 0.38095238095238093, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 1.926246956610309e-05, | |
| "loss": 0.9103766083717346, | |
| "mean_token_accuracy": 0.7624464929103851, | |
| "num_tokens": 929498.0, | |
| "step": 32 | |
| }, | |
| { | |
| "entropy": 1.2152698189020157, | |
| "epoch": 0.39285714285714285, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1.921476211870408e-05, | |
| "loss": 0.9737407565116882, | |
| "mean_token_accuracy": 0.7427262291312218, | |
| "num_tokens": 958933.0, | |
| "step": 33 | |
| }, | |
| { | |
| "entropy": 1.2030568569898605, | |
| "epoch": 0.40476190476190477, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1.9165622558699763e-05, | |
| "loss": 0.9593278169631958, | |
| "mean_token_accuracy": 0.7506603300571442, | |
| "num_tokens": 987731.0, | |
| "step": 34 | |
| }, | |
| { | |
| "entropy": 1.1957021951675415, | |
| "epoch": 0.4166666666666667, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 1.9115058523116734e-05, | |
| "loss": 0.9239043593406677, | |
| "mean_token_accuracy": 0.7555749863386154, | |
| "num_tokens": 1017002.0, | |
| "step": 35 | |
| }, | |
| { | |
| "entropy": 1.2133885324001312, | |
| "epoch": 0.42857142857142855, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1.9063077870366504e-05, | |
| "loss": 0.9809866547584534, | |
| "mean_token_accuracy": 0.7437998279929161, | |
| "num_tokens": 1046678.0, | |
| "step": 36 | |
| }, | |
| { | |
| "entropy": 1.2098581492900848, | |
| "epoch": 0.44047619047619047, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1.900968867902419e-05, | |
| "loss": 0.938984215259552, | |
| "mean_token_accuracy": 0.7494841367006302, | |
| "num_tokens": 1074445.0, | |
| "step": 37 | |
| }, | |
| { | |
| "entropy": 1.1815967112779617, | |
| "epoch": 0.4523809523809524, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 1.895489924657301e-05, | |
| "loss": 0.8934326767921448, | |
| "mean_token_accuracy": 0.7595476359128952, | |
| "num_tokens": 1103620.0, | |
| "step": 38 | |
| }, | |
| { | |
| "entropy": 1.2028009444475174, | |
| "epoch": 0.4642857142857143, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1.8898718088114688e-05, | |
| "loss": 0.922984778881073, | |
| "mean_token_accuracy": 0.7540801167488098, | |
| "num_tokens": 1132637.0, | |
| "step": 39 | |
| }, | |
| { | |
| "entropy": 1.2034422308206558, | |
| "epoch": 0.47619047619047616, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 1.8841153935046098e-05, | |
| "loss": 0.9033240675926208, | |
| "mean_token_accuracy": 0.7560576424002647, | |
| "num_tokens": 1161527.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 1.1716476827859879, | |
| "epoch": 0.4880952380952381, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 1.8782215733702286e-05, | |
| "loss": 0.8880018591880798, | |
| "mean_token_accuracy": 0.7613470479846001, | |
| "num_tokens": 1190701.0, | |
| "step": 41 | |
| }, | |
| { | |
| "entropy": 1.2157341986894608, | |
| "epoch": 0.5, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1.8721912643966055e-05, | |
| "loss": 0.9609653949737549, | |
| "mean_token_accuracy": 0.7453824803233147, | |
| "num_tokens": 1218835.0, | |
| "step": 42 | |
| }, | |
| { | |
| "entropy": 1.197568565607071, | |
| "epoch": 0.5119047619047619, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1.866025403784439e-05, | |
| "loss": 0.9219189882278442, | |
| "mean_token_accuracy": 0.7547592371702194, | |
| "num_tokens": 1248679.0, | |
| "step": 43 | |
| }, | |
| { | |
| "entropy": 1.1708803623914719, | |
| "epoch": 0.5238095238095238, | |
| "grad_norm": 0.373046875, | |
| "learning_rate": 1.8597249498011906e-05, | |
| "loss": 0.8802202343940735, | |
| "mean_token_accuracy": 0.7667126134037971, | |
| "num_tokens": 1277106.0, | |
| "step": 44 | |
| }, | |
| { | |
| "entropy": 1.191767856478691, | |
| "epoch": 0.5357142857142857, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 1.8532908816321557e-05, | |
| "loss": 0.9313769936561584, | |
| "mean_token_accuracy": 0.7529165670275688, | |
| "num_tokens": 1305983.0, | |
| "step": 45 | |
| }, | |
| { | |
| "entropy": 1.2066084146499634, | |
| "epoch": 0.5476190476190477, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1.8467241992282842e-05, | |
| "loss": 0.9347527027130127, | |
| "mean_token_accuracy": 0.7446473762392998, | |
| "num_tokens": 1334578.0, | |
| "step": 46 | |
| }, | |
| { | |
| "entropy": 1.177584484219551, | |
| "epoch": 0.5595238095238095, | |
| "grad_norm": 0.25, | |
| "learning_rate": 1.8400259231507716e-05, | |
| "loss": 0.8884726166725159, | |
| "mean_token_accuracy": 0.7611024901270866, | |
| "num_tokens": 1362873.0, | |
| "step": 47 | |
| }, | |
| { | |
| "entropy": 1.1629594564437866, | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1.833197094412449e-05, | |
| "loss": 0.8861435651779175, | |
| "mean_token_accuracy": 0.76307063549757, | |
| "num_tokens": 1391315.0, | |
| "step": 48 | |
| }, | |
| { | |
| "entropy": 1.168922871351242, | |
| "epoch": 0.5833333333333334, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 1.826238774315995e-05, | |
| "loss": 0.8765286207199097, | |
| "mean_token_accuracy": 0.76119015365839, | |
| "num_tokens": 1419829.0, | |
| "step": 49 | |
| }, | |
| { | |
| "entropy": 1.1843004375696182, | |
| "epoch": 0.5952380952380952, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 1.819152044288992e-05, | |
| "loss": 0.9242440462112427, | |
| "mean_token_accuracy": 0.7494527697563171, | |
| "num_tokens": 1447790.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 1.1673331260681152, | |
| "epoch": 0.6071428571428571, | |
| "grad_norm": 0.2490234375, | |
| "learning_rate": 1.811938005715857e-05, | |
| "loss": 0.8822228312492371, | |
| "mean_token_accuracy": 0.7585421577095985, | |
| "num_tokens": 1476278.0, | |
| "step": 51 | |
| }, | |
| { | |
| "entropy": 1.2116869688034058, | |
| "epoch": 0.6190476190476191, | |
| "grad_norm": 0.2421875, | |
| "learning_rate": 1.8045977797666685e-05, | |
| "loss": 0.9784308671951294, | |
| "mean_token_accuracy": 0.7404012456536293, | |
| "num_tokens": 1503947.0, | |
| "step": 52 | |
| }, | |
| { | |
| "entropy": 1.162365809082985, | |
| "epoch": 0.6309523809523809, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1.7971325072229227e-05, | |
| "loss": 0.9283543825149536, | |
| "mean_token_accuracy": 0.7499738857150078, | |
| "num_tokens": 1533531.0, | |
| "step": 53 | |
| }, | |
| { | |
| "entropy": 1.1863622218370438, | |
| "epoch": 0.6428571428571429, | |
| "grad_norm": 0.2421875, | |
| "learning_rate": 1.7895433483002356e-05, | |
| "loss": 0.9471738934516907, | |
| "mean_token_accuracy": 0.7532860413193703, | |
| "num_tokens": 1561412.0, | |
| "step": 54 | |
| }, | |
| { | |
| "entropy": 1.1698070168495178, | |
| "epoch": 0.6547619047619048, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1.78183148246803e-05, | |
| "loss": 0.9019606709480286, | |
| "mean_token_accuracy": 0.7543124184012413, | |
| "num_tokens": 1590336.0, | |
| "step": 55 | |
| }, | |
| { | |
| "entropy": 1.1683688312768936, | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 0.208984375, | |
| "learning_rate": 1.7739981082662275e-05, | |
| "loss": 0.9020405411720276, | |
| "mean_token_accuracy": 0.7580606490373611, | |
| "num_tokens": 1620442.0, | |
| "step": 56 | |
| }, | |
| { | |
| "entropy": 1.1867523938417435, | |
| "epoch": 0.6785714285714286, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1.766044443118978e-05, | |
| "loss": 0.917300283908844, | |
| "mean_token_accuracy": 0.7553394213318825, | |
| "num_tokens": 1648762.0, | |
| "step": 57 | |
| }, | |
| { | |
| "entropy": 1.1505564451217651, | |
| "epoch": 0.6904761904761905, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1.757971723145453e-05, | |
| "loss": 0.8627029061317444, | |
| "mean_token_accuracy": 0.7657916098833084, | |
| "num_tokens": 1677464.0, | |
| "step": 58 | |
| }, | |
| { | |
| "entropy": 1.1766629666090012, | |
| "epoch": 0.7023809523809523, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 1.7497812029677344e-05, | |
| "loss": 0.8795939087867737, | |
| "mean_token_accuracy": 0.7613174989819527, | |
| "num_tokens": 1704994.0, | |
| "step": 59 | |
| }, | |
| { | |
| "entropy": 1.1731744706630707, | |
| "epoch": 0.7142857142857143, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 1.741474155515827e-05, | |
| "loss": 0.8988810777664185, | |
| "mean_token_accuracy": 0.7579676881432533, | |
| "num_tokens": 1734202.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 1.1697156727313995, | |
| "epoch": 0.7261904761904762, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1.7330518718298263e-05, | |
| "loss": 0.9070097804069519, | |
| "mean_token_accuracy": 0.7564781159162521, | |
| "num_tokens": 1763541.0, | |
| "step": 61 | |
| }, | |
| { | |
| "entropy": 1.1686383485794067, | |
| "epoch": 0.7380952380952381, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1.7245156608592727e-05, | |
| "loss": 0.8804867267608643, | |
| "mean_token_accuracy": 0.7639917582273483, | |
| "num_tokens": 1793196.0, | |
| "step": 62 | |
| }, | |
| { | |
| "entropy": 1.195967510342598, | |
| "epoch": 0.75, | |
| "grad_norm": 0.25390625, | |
| "learning_rate": 1.7158668492597186e-05, | |
| "loss": 0.9389015436172485, | |
| "mean_token_accuracy": 0.747251845896244, | |
| "num_tokens": 1821023.0, | |
| "step": 63 | |
| }, | |
| { | |
| "entropy": 1.1664810329675674, | |
| "epoch": 0.7619047619047619, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1.7071067811865477e-05, | |
| "loss": 0.9056146740913391, | |
| "mean_token_accuracy": 0.7550350353121758, | |
| "num_tokens": 1849586.0, | |
| "step": 64 | |
| }, | |
| { | |
| "entropy": 1.171183928847313, | |
| "epoch": 0.7738095238095238, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1.698236818086073e-05, | |
| "loss": 0.929341197013855, | |
| "mean_token_accuracy": 0.7491638883948326, | |
| "num_tokens": 1878622.0, | |
| "step": 65 | |
| }, | |
| { | |
| "entropy": 1.1465008854866028, | |
| "epoch": 0.7857142857142857, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 1.689258338483947e-05, | |
| "loss": 0.8692110776901245, | |
| "mean_token_accuracy": 0.765314869582653, | |
| "num_tokens": 1907725.0, | |
| "step": 66 | |
| }, | |
| { | |
| "entropy": 1.1706128865480423, | |
| "epoch": 0.7976190476190477, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1.6801727377709195e-05, | |
| "loss": 0.886278510093689, | |
| "mean_token_accuracy": 0.7576193287968636, | |
| "num_tokens": 1936209.0, | |
| "step": 67 | |
| }, | |
| { | |
| "entropy": 1.1479064524173737, | |
| "epoch": 0.8095238095238095, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 1.67098142798597e-05, | |
| "loss": 0.8587610125541687, | |
| "mean_token_accuracy": 0.7682890966534615, | |
| "num_tokens": 1964915.0, | |
| "step": 68 | |
| }, | |
| { | |
| "entropy": 1.1495172083377838, | |
| "epoch": 0.8214285714285714, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 1.6616858375968596e-05, | |
| "loss": 0.8885282874107361, | |
| "mean_token_accuracy": 0.7598370909690857, | |
| "num_tokens": 1993606.0, | |
| "step": 69 | |
| }, | |
| { | |
| "entropy": 1.1534761786460876, | |
| "epoch": 0.8333333333333334, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 1.6522874112781213e-05, | |
| "loss": 0.8863916993141174, | |
| "mean_token_accuracy": 0.7640347108244896, | |
| "num_tokens": 2022472.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 1.14171202480793, | |
| "epoch": 0.8452380952380952, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1.6427876096865394e-05, | |
| "loss": 0.8785849809646606, | |
| "mean_token_accuracy": 0.7604316994547844, | |
| "num_tokens": 2052746.0, | |
| "step": 71 | |
| }, | |
| { | |
| "entropy": 1.1478676050901413, | |
| "epoch": 0.8571428571428571, | |
| "grad_norm": 0.212890625, | |
| "learning_rate": 1.6331879092341402e-05, | |
| "loss": 0.8796285390853882, | |
| "mean_token_accuracy": 0.7586944848299026, | |
| "num_tokens": 2081889.0, | |
| "step": 72 | |
| }, | |
| { | |
| "entropy": 1.1222540885210037, | |
| "epoch": 0.8690476190476191, | |
| "grad_norm": 0.193359375, | |
| "learning_rate": 1.6234898018587336e-05, | |
| "loss": 0.8146858811378479, | |
| "mean_token_accuracy": 0.7756616845726967, | |
| "num_tokens": 2111616.0, | |
| "step": 73 | |
| }, | |
| { | |
| "entropy": 1.153001144528389, | |
| "epoch": 0.8809523809523809, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1.6136947947920477e-05, | |
| "loss": 0.8884707689285278, | |
| "mean_token_accuracy": 0.7565625682473183, | |
| "num_tokens": 2140433.0, | |
| "step": 74 | |
| }, | |
| { | |
| "entropy": 1.1275182217359543, | |
| "epoch": 0.8928571428571429, | |
| "grad_norm": 0.2099609375, | |
| "learning_rate": 1.6038044103254775e-05, | |
| "loss": 0.8272450566291809, | |
| "mean_token_accuracy": 0.7704622000455856, | |
| "num_tokens": 2170414.0, | |
| "step": 75 | |
| }, | |
| { | |
| "entropy": 1.1576026529073715, | |
| "epoch": 0.9047619047619048, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1.5938201855735017e-05, | |
| "loss": 0.9035623669624329, | |
| "mean_token_accuracy": 0.7542874589562416, | |
| "num_tokens": 2198868.0, | |
| "step": 76 | |
| }, | |
| { | |
| "entropy": 1.1199318170547485, | |
| "epoch": 0.9166666666666666, | |
| "grad_norm": 0.20703125, | |
| "learning_rate": 1.5837436722347902e-05, | |
| "loss": 0.8039325475692749, | |
| "mean_token_accuracy": 0.783287987112999, | |
| "num_tokens": 2228134.0, | |
| "step": 77 | |
| }, | |
| { | |
| "entropy": 1.1484037339687347, | |
| "epoch": 0.9285714285714286, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 1.573576436351046e-05, | |
| "loss": 0.8699290752410889, | |
| "mean_token_accuracy": 0.7641323357820511, | |
| "num_tokens": 2257447.0, | |
| "step": 78 | |
| }, | |
| { | |
| "entropy": 1.1295416802167892, | |
| "epoch": 0.9404761904761905, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 1.563320058063622e-05, | |
| "loss": 0.8303874731063843, | |
| "mean_token_accuracy": 0.7720286920666695, | |
| "num_tokens": 2286749.0, | |
| "step": 79 | |
| }, | |
| { | |
| "entropy": 1.1563286185264587, | |
| "epoch": 0.9523809523809523, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1.5529761313679396e-05, | |
| "loss": 0.8524646759033203, | |
| "mean_token_accuracy": 0.7633371129631996, | |
| "num_tokens": 2315039.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 1.1543449014425278, | |
| "epoch": 0.9642857142857143, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 1.5425462638657597e-05, | |
| "loss": 0.9120794534683228, | |
| "mean_token_accuracy": 0.756316527724266, | |
| "num_tokens": 2344737.0, | |
| "step": 81 | |
| }, | |
| { | |
| "entropy": 1.13828843832016, | |
| "epoch": 0.9761904761904762, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1.5320320765153367e-05, | |
| "loss": 0.824118971824646, | |
| "mean_token_accuracy": 0.7736462280154228, | |
| "num_tokens": 2373710.0, | |
| "step": 82 | |
| }, | |
| { | |
| "entropy": 1.145560473203659, | |
| "epoch": 0.9880952380952381, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1.5214352033794981e-05, | |
| "loss": 0.8729808926582336, | |
| "mean_token_accuracy": 0.7629412487149239, | |
| "num_tokens": 2402610.0, | |
| "step": 83 | |
| }, | |
| { | |
| "entropy": 1.1476428806781769, | |
| "epoch": 1.0, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1.5107572913716859e-05, | |
| "loss": 0.8972144722938538, | |
| "mean_token_accuracy": 0.757901057600975, | |
| "num_tokens": 2430019.0, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_entropy": 1.1429666471481323, | |
| "eval_loss": 0.8658801317214966, | |
| "eval_mean_token_accuracy": 0.7630383356412251, | |
| "eval_model_preparation_time": 0.0051, | |
| "eval_num_tokens": 2430019.0, | |
| "eval_runtime": 19.169, | |
| "eval_samples_per_second": 7.825, | |
| "eval_steps_per_second": 7.825, | |
| "step": 84 | |
| }, | |
| { | |
| "entropy": 1.1193113178014755, | |
| "epoch": 1.0119047619047619, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 1.5000000000000002e-05, | |
| "loss": 0.8069751858711243, | |
| "mean_token_accuracy": 0.7752386555075645, | |
| "num_tokens": 2459653.0, | |
| "step": 85 | |
| }, | |
| { | |
| "entropy": 1.14054836332798, | |
| "epoch": 1.0238095238095237, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 1.4891650011092896e-05, | |
| "loss": 0.8288445472717285, | |
| "mean_token_accuracy": 0.7729767188429832, | |
| "num_tokens": 2488217.0, | |
| "step": 86 | |
| }, | |
| { | |
| "entropy": 1.1414664089679718, | |
| "epoch": 1.0357142857142858, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 1.4782539786213184e-05, | |
| "loss": 0.8254880905151367, | |
| "mean_token_accuracy": 0.7727913111448288, | |
| "num_tokens": 2517578.0, | |
| "step": 87 | |
| }, | |
| { | |
| "entropy": 1.1179616451263428, | |
| "epoch": 1.0476190476190477, | |
| "grad_norm": 0.205078125, | |
| "learning_rate": 1.4672686282730622e-05, | |
| "loss": 0.8098872303962708, | |
| "mean_token_accuracy": 0.7769448384642601, | |
| "num_tokens": 2546116.0, | |
| "step": 88 | |
| }, | |
| { | |
| "entropy": 1.1239117681980133, | |
| "epoch": 1.0595238095238095, | |
| "grad_norm": 0.318359375, | |
| "learning_rate": 1.4562106573531632e-05, | |
| "loss": 0.8263017535209656, | |
| "mean_token_accuracy": 0.7758133932948112, | |
| "num_tokens": 2574681.0, | |
| "step": 89 | |
| }, | |
| { | |
| "entropy": 1.1026111543178558, | |
| "epoch": 1.0714285714285714, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 1.4450817844365924e-05, | |
| "loss": 0.8099116086959839, | |
| "mean_token_accuracy": 0.7731629684567451, | |
| "num_tokens": 2603807.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 1.1024491339921951, | |
| "epoch": 1.0833333333333333, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 1.4338837391175582e-05, | |
| "loss": 0.8093633055686951, | |
| "mean_token_accuracy": 0.7739714533090591, | |
| "num_tokens": 2632614.0, | |
| "step": 91 | |
| }, | |
| { | |
| "entropy": 1.1085499972105026, | |
| "epoch": 1.0952380952380953, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1.4226182617406996e-05, | |
| "loss": 0.8473532199859619, | |
| "mean_token_accuracy": 0.7683232203125954, | |
| "num_tokens": 2661538.0, | |
| "step": 92 | |
| }, | |
| { | |
| "entropy": 1.0892803370952606, | |
| "epoch": 1.1071428571428572, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 1.4112871031306118e-05, | |
| "loss": 0.8294469118118286, | |
| "mean_token_accuracy": 0.7713945508003235, | |
| "num_tokens": 2690777.0, | |
| "step": 93 | |
| }, | |
| { | |
| "entropy": 1.1031535863876343, | |
| "epoch": 1.119047619047619, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1.3998920243197408e-05, | |
| "loss": 0.8391809463500977, | |
| "mean_token_accuracy": 0.7676805257797241, | |
| "num_tokens": 2719730.0, | |
| "step": 94 | |
| }, | |
| { | |
| "entropy": 1.0815589874982834, | |
| "epoch": 1.130952380952381, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 1.3884347962746949e-05, | |
| "loss": 0.7862935066223145, | |
| "mean_token_accuracy": 0.7806214541196823, | |
| "num_tokens": 2749156.0, | |
| "step": 95 | |
| }, | |
| { | |
| "entropy": 1.084671527147293, | |
| "epoch": 1.1428571428571428, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 1.3769171996210053e-05, | |
| "loss": 0.840523898601532, | |
| "mean_token_accuracy": 0.7695459797978401, | |
| "num_tokens": 2778531.0, | |
| "step": 96 | |
| }, | |
| { | |
| "entropy": 1.0894652903079987, | |
| "epoch": 1.1547619047619047, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 1.3653410243663953e-05, | |
| "loss": 0.7974240779876709, | |
| "mean_token_accuracy": 0.7744667157530785, | |
| "num_tokens": 2806462.0, | |
| "step": 97 | |
| }, | |
| { | |
| "entropy": 1.0971969813108444, | |
| "epoch": 1.1666666666666667, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1.3537080696225815e-05, | |
| "loss": 0.8246796131134033, | |
| "mean_token_accuracy": 0.7684177905321121, | |
| "num_tokens": 2835497.0, | |
| "step": 98 | |
| }, | |
| { | |
| "entropy": 1.1123791635036469, | |
| "epoch": 1.1785714285714286, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1.342020143325669e-05, | |
| "loss": 0.8859103322029114, | |
| "mean_token_accuracy": 0.7534352988004684, | |
| "num_tokens": 2865231.0, | |
| "step": 99 | |
| }, | |
| { | |
| "entropy": 1.075607344508171, | |
| "epoch": 1.1904761904761905, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1.3302790619551673e-05, | |
| "loss": 0.7980949878692627, | |
| "mean_token_accuracy": 0.7762870118021965, | |
| "num_tokens": 2894329.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 1.1072215735912323, | |
| "epoch": 1.2023809523809523, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 1.3184866502516846e-05, | |
| "loss": 0.8650733232498169, | |
| "mean_token_accuracy": 0.764843761920929, | |
| "num_tokens": 2923660.0, | |
| "step": 101 | |
| }, | |
| { | |
| "entropy": 1.0887151509523392, | |
| "epoch": 1.2142857142857142, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1.3066447409333345e-05, | |
| "loss": 0.790311336517334, | |
| "mean_token_accuracy": 0.7792445793747902, | |
| "num_tokens": 2952054.0, | |
| "step": 102 | |
| }, | |
| { | |
| "entropy": 1.1025346666574478, | |
| "epoch": 1.2261904761904763, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 1.2947551744109044e-05, | |
| "loss": 0.8180376887321472, | |
| "mean_token_accuracy": 0.7729773372411728, | |
| "num_tokens": 2981426.0, | |
| "step": 103 | |
| }, | |
| { | |
| "entropy": 1.0916212499141693, | |
| "epoch": 1.2380952380952381, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 1.2828197985018276e-05, | |
| "loss": 0.7971659898757935, | |
| "mean_token_accuracy": 0.7799450904130936, | |
| "num_tokens": 3009579.0, | |
| "step": 104 | |
| }, | |
| { | |
| "entropy": 1.1104163080453873, | |
| "epoch": 1.25, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1.2708404681430054e-05, | |
| "loss": 0.8455361127853394, | |
| "mean_token_accuracy": 0.7681760489940643, | |
| "num_tokens": 3038292.0, | |
| "step": 105 | |
| }, | |
| { | |
| "entropy": 1.1180581152439117, | |
| "epoch": 1.2619047619047619, | |
| "grad_norm": 0.2431640625, | |
| "learning_rate": 1.2588190451025209e-05, | |
| "loss": 0.8946309685707092, | |
| "mean_token_accuracy": 0.755538322031498, | |
| "num_tokens": 3068231.0, | |
| "step": 106 | |
| }, | |
| { | |
| "entropy": 1.0994994044303894, | |
| "epoch": 1.2738095238095237, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 1.2467573976902936e-05, | |
| "loss": 0.7855837345123291, | |
| "mean_token_accuracy": 0.7798345908522606, | |
| "num_tokens": 3096640.0, | |
| "step": 107 | |
| }, | |
| { | |
| "entropy": 1.0958448350429535, | |
| "epoch": 1.2857142857142856, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1.2346574004677154e-05, | |
| "loss": 0.8080664277076721, | |
| "mean_token_accuracy": 0.775592751801014, | |
| "num_tokens": 3125619.0, | |
| "step": 108 | |
| }, | |
| { | |
| "entropy": 1.1057351678609848, | |
| "epoch": 1.2976190476190477, | |
| "grad_norm": 0.375, | |
| "learning_rate": 1.2225209339563144e-05, | |
| "loss": 0.8222600817680359, | |
| "mean_token_accuracy": 0.7683183401823044, | |
| "num_tokens": 3155256.0, | |
| "step": 109 | |
| }, | |
| { | |
| "entropy": 1.1132191121578217, | |
| "epoch": 1.3095238095238095, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 1.210349884345496e-05, | |
| "loss": 0.8248376250267029, | |
| "mean_token_accuracy": 0.7687205746769905, | |
| "num_tokens": 3183948.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 1.0987165123224258, | |
| "epoch": 1.3214285714285714, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 1.1981461431993978e-05, | |
| "loss": 0.8191619515419006, | |
| "mean_token_accuracy": 0.772399052977562, | |
| "num_tokens": 3212463.0, | |
| "step": 111 | |
| }, | |
| { | |
| "entropy": 1.1073571592569351, | |
| "epoch": 1.3333333333333333, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 1.1859116071629148e-05, | |
| "loss": 0.8318334221839905, | |
| "mean_token_accuracy": 0.7649757117033005, | |
| "num_tokens": 3241487.0, | |
| "step": 112 | |
| }, | |
| { | |
| "entropy": 1.102282091975212, | |
| "epoch": 1.3452380952380953, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 1.1736481776669307e-05, | |
| "loss": 0.8375995755195618, | |
| "mean_token_accuracy": 0.7682436108589172, | |
| "num_tokens": 3270436.0, | |
| "step": 113 | |
| }, | |
| { | |
| "entropy": 1.0837299078702927, | |
| "epoch": 1.3571428571428572, | |
| "grad_norm": 0.2158203125, | |
| "learning_rate": 1.1613577606328068e-05, | |
| "loss": 0.7833430767059326, | |
| "mean_token_accuracy": 0.7823601812124252, | |
| "num_tokens": 3299814.0, | |
| "step": 114 | |
| }, | |
| { | |
| "entropy": 1.0879952907562256, | |
| "epoch": 1.369047619047619, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 1.1490422661761744e-05, | |
| "loss": 0.7993915677070618, | |
| "mean_token_accuracy": 0.7771986275911331, | |
| "num_tokens": 3328509.0, | |
| "step": 115 | |
| }, | |
| { | |
| "entropy": 1.112231805920601, | |
| "epoch": 1.380952380952381, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 1.1367036083100735e-05, | |
| "loss": 0.8307598233222961, | |
| "mean_token_accuracy": 0.7695401236414909, | |
| "num_tokens": 3356953.0, | |
| "step": 116 | |
| }, | |
| { | |
| "entropy": 1.0973141938447952, | |
| "epoch": 1.3928571428571428, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 1.1243437046474854e-05, | |
| "loss": 0.8001049757003784, | |
| "mean_token_accuracy": 0.7750882878899574, | |
| "num_tokens": 3385659.0, | |
| "step": 117 | |
| }, | |
| { | |
| "entropy": 1.1106764674186707, | |
| "epoch": 1.4047619047619047, | |
| "grad_norm": 0.228515625, | |
| "learning_rate": 1.1119644761033079e-05, | |
| "loss": 0.820791482925415, | |
| "mean_token_accuracy": 0.7748995646834373, | |
| "num_tokens": 3414046.0, | |
| "step": 118 | |
| }, | |
| { | |
| "entropy": 1.0989094227552414, | |
| "epoch": 1.4166666666666667, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 1.0995678465958168e-05, | |
| "loss": 0.8132579326629639, | |
| "mean_token_accuracy": 0.7685609012842178, | |
| "num_tokens": 3442153.0, | |
| "step": 119 | |
| }, | |
| { | |
| "entropy": 1.123728185892105, | |
| "epoch": 1.4285714285714286, | |
| "grad_norm": 0.2265625, | |
| "learning_rate": 1.0871557427476585e-05, | |
| "loss": 0.8655298948287964, | |
| "mean_token_accuracy": 0.7622044086456299, | |
| "num_tokens": 3471630.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 1.1011102497577667, | |
| "epoch": 1.4404761904761905, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 1.0747300935864245e-05, | |
| "loss": 0.8160438537597656, | |
| "mean_token_accuracy": 0.7715617045760155, | |
| "num_tokens": 3500341.0, | |
| "step": 121 | |
| }, | |
| { | |
| "entropy": 1.0802496522665024, | |
| "epoch": 1.4523809523809523, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 1.0622928302448523e-05, | |
| "loss": 0.795846700668335, | |
| "mean_token_accuracy": 0.7745838463306427, | |
| "num_tokens": 3530737.0, | |
| "step": 122 | |
| }, | |
| { | |
| "entropy": 1.111521065235138, | |
| "epoch": 1.4642857142857144, | |
| "grad_norm": 0.2275390625, | |
| "learning_rate": 1.0498458856606972e-05, | |
| "loss": 0.8259180188179016, | |
| "mean_token_accuracy": 0.7704022750258446, | |
| "num_tokens": 3559280.0, | |
| "step": 123 | |
| }, | |
| { | |
| "entropy": 1.1052347421646118, | |
| "epoch": 1.4761904761904763, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 1.037391194276326e-05, | |
| "loss": 0.8490574359893799, | |
| "mean_token_accuracy": 0.7630502879619598, | |
| "num_tokens": 3588223.0, | |
| "step": 124 | |
| }, | |
| { | |
| "entropy": 1.1114005744457245, | |
| "epoch": 1.4880952380952381, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 1.0249306917380731e-05, | |
| "loss": 0.8460506796836853, | |
| "mean_token_accuracy": 0.766345664858818, | |
| "num_tokens": 3617347.0, | |
| "step": 125 | |
| }, | |
| { | |
| "entropy": 1.115243524312973, | |
| "epoch": 1.5, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 1.0124663145954152e-05, | |
| "loss": 0.8421509265899658, | |
| "mean_token_accuracy": 0.7646084725856781, | |
| "num_tokens": 3646452.0, | |
| "step": 126 | |
| }, | |
| { | |
| "entropy": 1.1102195531129837, | |
| "epoch": 1.5119047619047619, | |
| "grad_norm": 0.68359375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.8465963006019592, | |
| "mean_token_accuracy": 0.7651297971606255, | |
| "num_tokens": 3674684.0, | |
| "step": 127 | |
| }, | |
| { | |
| "entropy": 1.1138557642698288, | |
| "epoch": 1.5238095238095237, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 9.87533685404585e-06, | |
| "loss": 0.8462578058242798, | |
| "mean_token_accuracy": 0.7656397670507431, | |
| "num_tokens": 3701972.0, | |
| "step": 128 | |
| }, | |
| { | |
| "entropy": 1.0684314519166946, | |
| "epoch": 1.5357142857142856, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 9.750693082619274e-06, | |
| "loss": 0.7849152684211731, | |
| "mean_token_accuracy": 0.7857328802347183, | |
| "num_tokens": 3731223.0, | |
| "step": 129 | |
| }, | |
| { | |
| "entropy": 1.0986532717943192, | |
| "epoch": 1.5476190476190477, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 9.626088057236745e-06, | |
| "loss": 0.8162216544151306, | |
| "mean_token_accuracy": 0.7728657871484756, | |
| "num_tokens": 3759466.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 1.1011187136173248, | |
| "epoch": 1.5595238095238095, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 9.501541143393028e-06, | |
| "loss": 0.8209044933319092, | |
| "mean_token_accuracy": 0.7711444199085236, | |
| "num_tokens": 3788276.0, | |
| "step": 131 | |
| }, | |
| { | |
| "entropy": 1.0799630433321, | |
| "epoch": 1.5714285714285714, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 9.377071697551479e-06, | |
| "loss": 0.7802744507789612, | |
| "mean_token_accuracy": 0.7825465202331543, | |
| "num_tokens": 3817834.0, | |
| "step": 132 | |
| }, | |
| { | |
| "entropy": 1.1000354290008545, | |
| "epoch": 1.5833333333333335, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 9.252699064135759e-06, | |
| "loss": 0.8035217523574829, | |
| "mean_token_accuracy": 0.7786530405282974, | |
| "num_tokens": 3846803.0, | |
| "step": 133 | |
| }, | |
| { | |
| "entropy": 1.106198564171791, | |
| "epoch": 1.5952380952380953, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 9.128442572523418e-06, | |
| "loss": 0.8161381483078003, | |
| "mean_token_accuracy": 0.7741018161177635, | |
| "num_tokens": 3875363.0, | |
| "step": 134 | |
| }, | |
| { | |
| "entropy": 1.094715103507042, | |
| "epoch": 1.6071428571428572, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 9.004321534041836e-06, | |
| "loss": 0.797153115272522, | |
| "mean_token_accuracy": 0.7743495553731918, | |
| "num_tokens": 3904020.0, | |
| "step": 135 | |
| }, | |
| { | |
| "entropy": 1.0934260189533234, | |
| "epoch": 1.619047619047619, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 8.880355238966923e-06, | |
| "loss": 0.7957767248153687, | |
| "mean_token_accuracy": 0.7722097188234329, | |
| "num_tokens": 3932554.0, | |
| "step": 136 | |
| }, | |
| { | |
| "entropy": 1.0932885110378265, | |
| "epoch": 1.630952380952381, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 8.756562953525151e-06, | |
| "loss": 0.8285123109817505, | |
| "mean_token_accuracy": 0.7748213410377502, | |
| "num_tokens": 3963124.0, | |
| "step": 137 | |
| }, | |
| { | |
| "entropy": 1.0973111540079117, | |
| "epoch": 1.6428571428571428, | |
| "grad_norm": 0.220703125, | |
| "learning_rate": 8.632963916899268e-06, | |
| "loss": 0.8037251234054565, | |
| "mean_token_accuracy": 0.7732022255659103, | |
| "num_tokens": 3991485.0, | |
| "step": 138 | |
| }, | |
| { | |
| "entropy": 1.1007558554410934, | |
| "epoch": 1.6547619047619047, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 8.509577338238255e-06, | |
| "loss": 0.8211590051651001, | |
| "mean_token_accuracy": 0.770406000316143, | |
| "num_tokens": 4020583.0, | |
| "step": 139 | |
| }, | |
| { | |
| "entropy": 1.0826598927378654, | |
| "epoch": 1.6666666666666665, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 8.386422393671934e-06, | |
| "loss": 0.7706205248832703, | |
| "mean_token_accuracy": 0.7830442562699318, | |
| "num_tokens": 4049853.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 1.1024836301803589, | |
| "epoch": 1.6785714285714286, | |
| "grad_norm": 0.2099609375, | |
| "learning_rate": 8.263518223330698e-06, | |
| "loss": 0.8168199062347412, | |
| "mean_token_accuracy": 0.7706331759691238, | |
| "num_tokens": 4079030.0, | |
| "step": 141 | |
| }, | |
| { | |
| "entropy": 1.1091957688331604, | |
| "epoch": 1.6904761904761905, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 8.140883928370855e-06, | |
| "loss": 0.8526521325111389, | |
| "mean_token_accuracy": 0.7632784247398376, | |
| "num_tokens": 4108702.0, | |
| "step": 142 | |
| }, | |
| { | |
| "entropy": 1.0926142483949661, | |
| "epoch": 1.7023809523809523, | |
| "grad_norm": 0.251953125, | |
| "learning_rate": 8.018538568006027e-06, | |
| "loss": 0.800937294960022, | |
| "mean_token_accuracy": 0.7739113718271255, | |
| "num_tokens": 4138456.0, | |
| "step": 143 | |
| }, | |
| { | |
| "entropy": 1.085595116019249, | |
| "epoch": 1.7142857142857144, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 7.896501156545044e-06, | |
| "loss": 0.7860180735588074, | |
| "mean_token_accuracy": 0.7786939144134521, | |
| "num_tokens": 4168706.0, | |
| "step": 144 | |
| }, | |
| { | |
| "entropy": 1.072287455201149, | |
| "epoch": 1.7261904761904763, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 7.774790660436857e-06, | |
| "loss": 0.7674008011817932, | |
| "mean_token_accuracy": 0.7878812775015831, | |
| "num_tokens": 4197229.0, | |
| "step": 145 | |
| }, | |
| { | |
| "entropy": 1.1147316098213196, | |
| "epoch": 1.7380952380952381, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 7.653425995322852e-06, | |
| "loss": 0.8494656682014465, | |
| "mean_token_accuracy": 0.7613426074385643, | |
| "num_tokens": 4226241.0, | |
| "step": 146 | |
| }, | |
| { | |
| "entropy": 1.0884745866060257, | |
| "epoch": 1.75, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 7.532426023097063e-06, | |
| "loss": 0.7670794129371643, | |
| "mean_token_accuracy": 0.7854177579283714, | |
| "num_tokens": 4254275.0, | |
| "step": 147 | |
| }, | |
| { | |
| "entropy": 1.0930557996034622, | |
| "epoch": 1.7619047619047619, | |
| "grad_norm": 0.208984375, | |
| "learning_rate": 7.411809548974792e-06, | |
| "loss": 0.8160566091537476, | |
| "mean_token_accuracy": 0.7741377875208855, | |
| "num_tokens": 4283150.0, | |
| "step": 148 | |
| }, | |
| { | |
| "entropy": 1.0953784435987473, | |
| "epoch": 1.7738095238095237, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 7.291595318569951e-06, | |
| "loss": 0.8185824751853943, | |
| "mean_token_accuracy": 0.7722392901778221, | |
| "num_tokens": 4312410.0, | |
| "step": 149 | |
| }, | |
| { | |
| "entropy": 1.0579064786434174, | |
| "epoch": 1.7857142857142856, | |
| "grad_norm": 0.2099609375, | |
| "learning_rate": 7.171802014981726e-06, | |
| "loss": 0.748650848865509, | |
| "mean_token_accuracy": 0.79205472022295, | |
| "num_tokens": 4341254.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 1.0844353437423706, | |
| "epoch": 1.7976190476190477, | |
| "grad_norm": 0.2197265625, | |
| "learning_rate": 7.052448255890958e-06, | |
| "loss": 0.7923084497451782, | |
| "mean_token_accuracy": 0.7777365446090698, | |
| "num_tokens": 4369573.0, | |
| "step": 151 | |
| }, | |
| { | |
| "entropy": 1.098150685429573, | |
| "epoch": 1.8095238095238095, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 6.933552590666659e-06, | |
| "loss": 0.8330479860305786, | |
| "mean_token_accuracy": 0.7675687223672867, | |
| "num_tokens": 4397753.0, | |
| "step": 152 | |
| }, | |
| { | |
| "entropy": 1.0822398364543915, | |
| "epoch": 1.8214285714285714, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 6.815133497483157e-06, | |
| "loss": 0.7889379262924194, | |
| "mean_token_accuracy": 0.7777184247970581, | |
| "num_tokens": 4427257.0, | |
| "step": 153 | |
| }, | |
| { | |
| "entropy": 1.0754519402980804, | |
| "epoch": 1.8333333333333335, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 6.697209380448333e-06, | |
| "loss": 0.784767746925354, | |
| "mean_token_accuracy": 0.7786002233624458, | |
| "num_tokens": 4456709.0, | |
| "step": 154 | |
| }, | |
| { | |
| "entropy": 1.084225744009018, | |
| "epoch": 1.8452380952380953, | |
| "grad_norm": 0.2177734375, | |
| "learning_rate": 6.579798566743314e-06, | |
| "loss": 0.8074082732200623, | |
| "mean_token_accuracy": 0.7725819870829582, | |
| "num_tokens": 4485653.0, | |
| "step": 155 | |
| }, | |
| { | |
| "entropy": 1.0734328627586365, | |
| "epoch": 1.8571428571428572, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 6.462919303774186e-06, | |
| "loss": 0.7693166136741638, | |
| "mean_token_accuracy": 0.7844012156128883, | |
| "num_tokens": 4515131.0, | |
| "step": 156 | |
| }, | |
| { | |
| "entropy": 1.0944669842720032, | |
| "epoch": 1.869047619047619, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 6.34658975633605e-06, | |
| "loss": 0.8283172249794006, | |
| "mean_token_accuracy": 0.7699304968118668, | |
| "num_tokens": 4544554.0, | |
| "step": 157 | |
| }, | |
| { | |
| "entropy": 1.0710095912218094, | |
| "epoch": 1.880952380952381, | |
| "grad_norm": 0.20703125, | |
| "learning_rate": 6.230828003789949e-06, | |
| "loss": 0.7723422050476074, | |
| "mean_token_accuracy": 0.785270169377327, | |
| "num_tokens": 4574526.0, | |
| "step": 158 | |
| }, | |
| { | |
| "entropy": 1.1019357591867447, | |
| "epoch": 1.8928571428571428, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 6.115652037253054e-06, | |
| "loss": 0.842171847820282, | |
| "mean_token_accuracy": 0.7660646587610245, | |
| "num_tokens": 4603221.0, | |
| "step": 159 | |
| }, | |
| { | |
| "entropy": 1.0798636227846146, | |
| "epoch": 1.9047619047619047, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 6.001079756802592e-06, | |
| "loss": 0.7799994945526123, | |
| "mean_token_accuracy": 0.7830873727798462, | |
| "num_tokens": 4632086.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 1.0717933773994446, | |
| "epoch": 1.9166666666666665, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 5.887128968693887e-06, | |
| "loss": 0.7654195427894592, | |
| "mean_token_accuracy": 0.7824560701847076, | |
| "num_tokens": 4660316.0, | |
| "step": 161 | |
| }, | |
| { | |
| "entropy": 1.069181576371193, | |
| "epoch": 1.9285714285714286, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 5.773817382593008e-06, | |
| "loss": 0.7898048162460327, | |
| "mean_token_accuracy": 0.7769228145480156, | |
| "num_tokens": 4689587.0, | |
| "step": 162 | |
| }, | |
| { | |
| "entropy": 1.0788903683423996, | |
| "epoch": 1.9404761904761905, | |
| "grad_norm": 0.212890625, | |
| "learning_rate": 5.66116260882442e-06, | |
| "loss": 0.7974780797958374, | |
| "mean_token_accuracy": 0.7752274572849274, | |
| "num_tokens": 4719335.0, | |
| "step": 163 | |
| }, | |
| { | |
| "entropy": 1.1007077991962433, | |
| "epoch": 1.9523809523809523, | |
| "grad_norm": 0.23046875, | |
| "learning_rate": 5.549182155634076e-06, | |
| "loss": 0.7892836332321167, | |
| "mean_token_accuracy": 0.7779370620846748, | |
| "num_tokens": 4746463.0, | |
| "step": 164 | |
| }, | |
| { | |
| "entropy": 1.0850374549627304, | |
| "epoch": 1.9642857142857144, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 5.43789342646837e-06, | |
| "loss": 0.7919931411743164, | |
| "mean_token_accuracy": 0.7770635932683945, | |
| "num_tokens": 4775141.0, | |
| "step": 165 | |
| }, | |
| { | |
| "entropy": 1.081614837050438, | |
| "epoch": 1.9761904761904763, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 5.32731371726938e-06, | |
| "loss": 0.7762281894683838, | |
| "mean_token_accuracy": 0.7814824879169464, | |
| "num_tokens": 4803229.0, | |
| "step": 166 | |
| }, | |
| { | |
| "entropy": 1.086833968758583, | |
| "epoch": 1.9880952380952381, | |
| "grad_norm": 0.216796875, | |
| "learning_rate": 5.217460213786822e-06, | |
| "loss": 0.8244621157646179, | |
| "mean_token_accuracy": 0.7730955481529236, | |
| "num_tokens": 4832506.0, | |
| "step": 167 | |
| }, | |
| { | |
| "entropy": 1.0700944513082504, | |
| "epoch": 2.0, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 5.108349988907111e-06, | |
| "loss": 0.7783507704734802, | |
| "mean_token_accuracy": 0.7822717130184174, | |
| "num_tokens": 4860038.0, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_entropy": 1.0883367625872293, | |
| "eval_loss": 0.8387430906295776, | |
| "eval_mean_token_accuracy": 0.7681301248073578, | |
| "eval_model_preparation_time": 0.0051, | |
| "eval_num_tokens": 4860038.0, | |
| "eval_runtime": 19.5881, | |
| "eval_samples_per_second": 7.658, | |
| "eval_steps_per_second": 7.658, | |
| "step": 168 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 252, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.1131690390237286e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |