Ennon committed on
Commit
9ca9e8b
·
verified ·
1 Parent(s): 2751b76

Gemma 2 9B DevOps - Polish finetuned model

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - pl
4
+ - en
5
+ license: mit
6
+ tags:
7
+ - devops
8
+ - kubernetes
9
+ - ansible
10
+ - terraform
11
+ - yaml
12
+ base_model: google/gemma-2-9b-it
13
+ ---
14
+
15
+ # Gemma-2-9B-PL-DevOps-Instruct-v2
16
+
17
+ Polish DevOps assistant fine-tuned on Infrastructure as Code tasks.
18
+
19
+ ## ⚠️ Fixes in v2
20
+ - **Fixed YAML indentation** - consistent 2-space indentation
21
+ - **High Quality Training** - Native BF16 training (no quantization errors)
22
+ - Trained WITHOUT Unsloth (no padding-free mode)
23
+ - `packing=False` to preserve whitespace
24
+
25
+ ## Evaluation / Inference
26
+ This model is saved in **BFLOAT16**.
27
+ - For 4-bit inference: Load with `load_in_4bit=True` (bitsandbytes)
28
+ - For vLLM: Compatible with standard loading or FP8/AWQ quantization
29
+
30
+ ## Training
31
+ | Param | Value |
32
+ |-------|-------|
33
+ | Base | google/gemma-2-9b-it |
34
+ | Method | Full BF16 Finetuning + LoRA |
35
+ | Batch | 96 effective |
36
+ | Train samples | 170,305 |
37
+ | Train loss | 0.6174 |
38
+ | Time | 667.0 min |
39
+ | GPU | H100 80GB |
chat_template.jinja ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '
2
+ ' + message['content'] | trim + '<end_of_turn>
3
+ ' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model
4
+ '}}{% endif %}
config.json ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Gemma2ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "attn_logit_softcapping": 50.0,
8
+ "bos_token_id": 2,
9
+ "cache_implementation": "hybrid",
10
+ "dtype": "bfloat16",
11
+ "eos_token_id": 1,
12
+ "final_logit_softcapping": 30.0,
13
+ "head_dim": 256,
14
+ "hidden_act": "gelu_pytorch_tanh",
15
+ "hidden_activation": "gelu_pytorch_tanh",
16
+ "hidden_size": 3584,
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 14336,
19
+ "layer_types": [
20
+ "sliding_attention",
21
+ "full_attention",
22
+ "sliding_attention",
23
+ "full_attention",
24
+ "sliding_attention",
25
+ "full_attention",
26
+ "sliding_attention",
27
+ "full_attention",
28
+ "sliding_attention",
29
+ "full_attention",
30
+ "sliding_attention",
31
+ "full_attention",
32
+ "sliding_attention",
33
+ "full_attention",
34
+ "sliding_attention",
35
+ "full_attention",
36
+ "sliding_attention",
37
+ "full_attention",
38
+ "sliding_attention",
39
+ "full_attention",
40
+ "sliding_attention",
41
+ "full_attention",
42
+ "sliding_attention",
43
+ "full_attention",
44
+ "sliding_attention",
45
+ "full_attention",
46
+ "sliding_attention",
47
+ "full_attention",
48
+ "sliding_attention",
49
+ "full_attention",
50
+ "sliding_attention",
51
+ "full_attention",
52
+ "sliding_attention",
53
+ "full_attention",
54
+ "sliding_attention",
55
+ "full_attention",
56
+ "sliding_attention",
57
+ "full_attention",
58
+ "sliding_attention",
59
+ "full_attention",
60
+ "sliding_attention",
61
+ "full_attention"
62
+ ],
63
+ "max_position_embeddings": 8192,
64
+ "model_type": "gemma2",
65
+ "num_attention_heads": 16,
66
+ "num_hidden_layers": 42,
67
+ "num_key_value_heads": 8,
68
+ "pad_token_id": 0,
69
+ "query_pre_attn_scalar": 256,
70
+ "rms_norm_eps": 1e-06,
71
+ "rope_parameters": {
72
+ "rope_theta": 10000.0,
73
+ "rope_type": "default"
74
+ },
75
+ "sliding_window": 4096,
76
+ "sliding_window_size": 4096,
77
+ "tie_word_embeddings": true,
78
+ "transformers_version": "5.0.0",
79
+ "use_bidirectional_attention": null,
80
+ "use_cache": false,
81
+ "vocab_size": 256000
82
+ }
generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 2,
4
+ "cache_implementation": "hybrid",
5
+ "eos_token_id": [
6
+ 1
7
+ ],
8
+ "pad_token_id": 0,
9
+ "transformers_version": "5.0.0"
10
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e79c3ec83bb4df5182eb6057df9c8df02d95dbc5d9ccf69e8cfb81dfce671589
3
+ size 18483467000
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:394ace002a144ac6ad5486387502f2d36f70c087310c3d907857240c76fcb36e
3
+ size 34362748
tokenizer_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<bos>",
4
+ "clean_up_tokenization_spaces": false,
5
+ "eos_token": "<eos>",
6
+ "extra_special_tokens": [
7
+ "<start_of_turn>",
8
+ "<end_of_turn>"
9
+ ],
10
+ "is_local": false,
11
+ "mask_token": "<mask>",
12
+ "model_max_length": 2048,
13
+ "pad_token": "<pad>",
14
+ "sp_model_kwargs": {},
15
+ "spaces_between_special_tokens": false,
16
+ "tokenizer_class": "GemmaTokenizer",
17
+ "unk_token": "<unk>",
18
+ "use_default_system_prompt": false
19
+ }
trainer_log_history.json ADDED
@@ -0,0 +1,908 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "loss": 1.9148719787597657,
4
+ "grad_norm": 0.984495222568512,
5
+ "learning_rate": 2.2641509433962265e-05,
6
+ "entropy": 0.9785909144083659,
7
+ "num_tokens": 909468.0,
8
+ "mean_token_accuracy": 0.6837772730986277,
9
+ "epoch": 0.014091122592766557,
10
+ "step": 25
11
+ },
12
+ {
13
+ "loss": 1.0547267150878907,
14
+ "grad_norm": 0.2687967121601105,
15
+ "learning_rate": 4.6226415094339625e-05,
16
+ "entropy": 1.0105916921297708,
17
+ "num_tokens": 1841827.0,
18
+ "mean_token_accuracy": 0.7993326298395793,
19
+ "epoch": 0.028182245185533115,
20
+ "step": 50
21
+ },
22
+ {
23
+ "loss": 0.909715805053711,
24
+ "grad_norm": 0.30847716331481934,
25
+ "learning_rate": 4.998165452627025e-05,
26
+ "entropy": 0.8024396904309591,
27
+ "num_tokens": 2783417.0,
28
+ "mean_token_accuracy": 0.8195314351717631,
29
+ "epoch": 0.042273367778299674,
30
+ "step": 75
31
+ },
32
+ {
33
+ "loss": 0.8271210479736328,
34
+ "grad_norm": 0.2949310839176178,
35
+ "learning_rate": 4.991201589453377e-05,
36
+ "entropy": 0.7632828823725383,
37
+ "num_tokens": 3703024.0,
38
+ "mean_token_accuracy": 0.8274987975756327,
39
+ "epoch": 0.05636449037106623,
40
+ "step": 100
41
+ },
42
+ {
43
+ "eval_loss": 0.7959097623825073,
44
+ "eval_runtime": 42.9182,
45
+ "eval_samples_per_second": 11.65,
46
+ "eval_steps_per_second": 0.746,
47
+ "eval_entropy": 0.7644990533590317,
48
+ "eval_num_tokens": 3703024.0,
49
+ "eval_mean_token_accuracy": 0.8274666927754879,
50
+ "epoch": 0.05636449037106623,
51
+ "step": 100
52
+ },
53
+ {
54
+ "loss": 0.7740676879882813,
55
+ "grad_norm": 0.29744288325309753,
56
+ "learning_rate": 4.97905632708703e-05,
57
+ "entropy": 0.7656655506292979,
58
+ "num_tokens": 4667355.0,
59
+ "mean_token_accuracy": 0.8285713505744934,
60
+ "epoch": 0.07045561296383279,
61
+ "step": 125
62
+ },
63
+ {
64
+ "loss": 0.7345146942138672,
65
+ "grad_norm": 0.3392024040222168,
66
+ "learning_rate": 4.9617549262105724e-05,
67
+ "entropy": 0.7322683656215667,
68
+ "num_tokens": 5580909.0,
69
+ "mean_token_accuracy": 0.8329473527272543,
70
+ "epoch": 0.08454673555659935,
71
+ "step": 150
72
+ },
73
+ {
74
+ "loss": 0.7085108184814453,
75
+ "grad_norm": 0.337108314037323,
76
+ "learning_rate": 4.939333371653541e-05,
77
+ "entropy": 0.7126858182748159,
78
+ "num_tokens": 6525675.0,
79
+ "mean_token_accuracy": 0.8382143716017405,
80
+ "epoch": 0.0986378581493659,
81
+ "step": 175
82
+ },
83
+ {
84
+ "loss": 0.7087242889404297,
85
+ "grad_norm": 0.39108389616012573,
86
+ "learning_rate": 4.911838297548306e-05,
87
+ "entropy": 0.7103402439753215,
88
+ "num_tokens": 7460420.0,
89
+ "mean_token_accuracy": 0.8385978392759958,
90
+ "epoch": 0.11272898074213246,
91
+ "step": 200
92
+ },
93
+ {
94
+ "eval_loss": 0.691197395324707,
95
+ "eval_runtime": 34.7324,
96
+ "eval_samples_per_second": 14.396,
97
+ "eval_steps_per_second": 0.921,
98
+ "eval_entropy": 0.7021831637248397,
99
+ "eval_num_tokens": 7460420.0,
100
+ "eval_mean_token_accuracy": 0.8397158589214087,
101
+ "epoch": 0.11272898074213246,
102
+ "step": 200
103
+ },
104
+ {
105
+ "loss": 0.6759407043457031,
106
+ "grad_norm": 0.41262030601501465,
107
+ "learning_rate": 4.8793268903366905e-05,
108
+ "entropy": 0.6836405583222707,
109
+ "num_tokens": 8367608.0,
110
+ "mean_token_accuracy": 0.8437444992860158,
111
+ "epoch": 0.12682010333489901,
112
+ "step": 225
113
+ },
114
+ {
115
+ "loss": 0.6790435028076172,
116
+ "grad_norm": 0.4088114798069,
117
+ "learning_rate": 4.8418667698290696e-05,
118
+ "entropy": 0.684131217400233,
119
+ "num_tokens": 9284184.0,
120
+ "mean_token_accuracy": 0.8435306719938914,
121
+ "epoch": 0.14091122592766558,
122
+ "step": 250
123
+ },
124
+ {
125
+ "loss": 0.6590489959716797,
126
+ "grad_norm": 0.4000810384750366,
127
+ "learning_rate": 4.7995358485633035e-05,
128
+ "entropy": 0.6666705779234569,
129
+ "num_tokens": 10228116.0,
130
+ "mean_token_accuracy": 0.8483462047576904,
131
+ "epoch": 0.15500234852043213,
132
+ "step": 275
133
+ },
134
+ {
135
+ "loss": 0.6522020721435546,
136
+ "grad_norm": 0.4356841742992401,
137
+ "learning_rate": 4.752422169756048e-05,
138
+ "entropy": 0.6561659761269887,
139
+ "num_tokens": 11141987.0,
140
+ "mean_token_accuracy": 0.8489574348926544,
141
+ "epoch": 0.1690934711131987,
142
+ "step": 300
143
+ },
144
+ {
145
+ "eval_loss": 0.6456555724143982,
146
+ "eval_runtime": 34.7464,
147
+ "eval_samples_per_second": 14.39,
148
+ "eval_steps_per_second": 0.921,
149
+ "eval_entropy": 0.6593516366556287,
150
+ "eval_num_tokens": 11141987.0,
151
+ "eval_mean_token_accuracy": 0.8488058932125568,
152
+ "epoch": 0.1690934711131987,
153
+ "step": 300
154
+ },
155
+ {
156
+ "loss": 0.6289921569824218,
157
+ "grad_norm": 0.44314464926719666,
158
+ "learning_rate": 4.700623724183468e-05,
159
+ "entropy": 0.6275538243850072,
160
+ "num_tokens": 12066391.0,
161
+ "mean_token_accuracy": 0.8543656957149506,
162
+ "epoch": 0.18318459370596524,
163
+ "step": 325
164
+ },
165
+ {
166
+ "loss": 0.6266510009765625,
167
+ "grad_norm": 0.4457905888557434,
168
+ "learning_rate": 4.644248246372233e-05,
169
+ "entropy": 0.6246062052249909,
170
+ "num_tokens": 13002518.0,
171
+ "mean_token_accuracy": 0.8540003776550293,
172
+ "epoch": 0.1972757162987318,
173
+ "step": 350
174
+ },
175
+ {
176
+ "loss": 0.6241617965698242,
177
+ "grad_norm": 0.45250752568244934,
178
+ "learning_rate": 4.5834129905246725e-05,
179
+ "entropy": 0.6225514455636343,
180
+ "num_tokens": 13915051.0,
181
+ "mean_token_accuracy": 0.8545078063011169,
182
+ "epoch": 0.21136683889149835,
183
+ "step": 375
184
+ },
185
+ {
186
+ "loss": 0.6225375366210938,
187
+ "grad_norm": 0.4502236545085907,
188
+ "learning_rate": 4.5182444866441694e-05,
189
+ "entropy": 0.6170252589384715,
190
+ "num_tokens": 14840689.0,
191
+ "mean_token_accuracy": 0.8547838560740153,
192
+ "epoch": 0.22545796148426492,
193
+ "step": 400
194
+ },
195
+ {
196
+ "eval_loss": 0.624234139919281,
197
+ "eval_runtime": 34.7767,
198
+ "eval_samples_per_second": 14.377,
199
+ "eval_steps_per_second": 0.92,
200
+ "eval_entropy": 0.6146921720355749,
201
+ "eval_num_tokens": 14840689.0,
202
+ "eval_mean_token_accuracy": 0.8521482553333044,
203
+ "epoch": 0.22545796148426492,
204
+ "step": 400
205
+ },
206
+ {
207
+ "loss": 0.6120803451538086,
208
+ "grad_norm": 0.40877044200897217,
209
+ "learning_rate": 4.4488782773679885e-05,
210
+ "entropy": 0.6126995925108591,
211
+ "num_tokens": 15781641.0,
212
+ "mean_token_accuracy": 0.8559599355856577,
213
+ "epoch": 0.23954908407703146,
214
+ "step": 425
215
+ },
216
+ {
217
+ "loss": 0.6238847732543945,
218
+ "grad_norm": 0.43226659297943115,
219
+ "learning_rate": 4.375458636054924e-05,
220
+ "entropy": 0.621622064312299,
221
+ "num_tokens": 16727723.0,
222
+ "mean_token_accuracy": 0.8534450817108155,
223
+ "epoch": 0.25364020666979803,
224
+ "step": 450
225
+ },
226
+ {
227
+ "loss": 0.5980339431762696,
228
+ "grad_norm": 0.4249129295349121,
229
+ "learning_rate": 4.298138266714094e-05,
230
+ "entropy": 0.5939697621266047,
231
+ "num_tokens": 17644465.0,
232
+ "mean_token_accuracy": 0.8589934686819712,
233
+ "epoch": 0.2677313292625646,
234
+ "step": 475
235
+ },
236
+ {
237
+ "loss": 0.6099626541137695,
238
+ "grad_norm": 0.4274967908859253,
239
+ "learning_rate": 4.2170779863989946e-05,
240
+ "entropy": 0.6078906120856603,
241
+ "num_tokens": 18563256.0,
242
+ "mean_token_accuracy": 0.8569075318177541,
243
+ "epoch": 0.28182245185533117,
244
+ "step": 500
245
+ },
246
+ {
247
+ "eval_loss": 0.6081598997116089,
248
+ "eval_runtime": 34.7944,
249
+ "eval_samples_per_second": 14.37,
250
+ "eval_steps_per_second": 0.92,
251
+ "eval_entropy": 0.6181821776553988,
252
+ "eval_num_tokens": 18563256.0,
253
+ "eval_mean_token_accuracy": 0.855755690485239,
254
+ "epoch": 0.28182245185533117,
255
+ "step": 500
256
+ },
257
+ {
258
+ "loss": 0.5912541961669922,
259
+ "grad_norm": 0.4574773907661438,
260
+ "learning_rate": 4.132446390727404e-05,
261
+ "entropy": 0.5872368462880453,
262
+ "num_tokens": 19469800.0,
263
+ "mean_token_accuracy": 0.8607503294944763,
264
+ "epoch": 0.2959135744480977,
265
+ "step": 525
266
+ },
267
+ {
268
+ "loss": 0.5932905197143554,
269
+ "grad_norm": 0.43131619691848755,
270
+ "learning_rate": 4.044419503222808e-05,
271
+ "entropy": 0.592293497522672,
272
+ "num_tokens": 20385241.0,
273
+ "mean_token_accuracy": 0.8604243552684784,
274
+ "epoch": 0.31000469704086425,
275
+ "step": 550
276
+ },
277
+ {
278
+ "loss": 0.5899901580810547,
279
+ "grad_norm": 0.4174489974975586,
280
+ "learning_rate": 3.953180409206677e-05,
281
+ "entropy": 0.5856852753957112,
282
+ "num_tokens": 21331318.0,
283
+ "mean_token_accuracy": 0.8611550823847453,
284
+ "epoch": 0.3240958196336308,
285
+ "step": 575
286
+ },
287
+ {
288
+ "loss": 0.600746955871582,
289
+ "grad_norm": 0.4607154428958893,
290
+ "learning_rate": 3.858918875003053e-05,
291
+ "entropy": 0.5992459511756897,
292
+ "num_tokens": 22288698.0,
293
+ "mean_token_accuracy": 0.8587616598606109,
294
+ "epoch": 0.3381869422263974,
295
+ "step": 600
296
+ },
297
+ {
298
+ "eval_loss": 0.5959565043449402,
299
+ "eval_runtime": 34.7871,
300
+ "eval_samples_per_second": 14.373,
301
+ "eval_steps_per_second": 0.92,
302
+ "eval_entropy": 0.6024799766018987,
303
+ "eval_num_tokens": 22288698.0,
304
+ "eval_mean_token_accuracy": 0.8582040295004845,
305
+ "epoch": 0.3381869422263974,
306
+ "step": 600
307
+ },
308
+ {
309
+ "loss": 0.5945447540283203,
310
+ "grad_norm": 0.4672609269618988,
311
+ "learning_rate": 3.761830953247457e-05,
312
+ "entropy": 0.5911998764673869,
313
+ "num_tokens": 23239625.0,
314
+ "mean_token_accuracy": 0.8611347631613413,
315
+ "epoch": 0.3522780648191639,
316
+ "step": 625
317
+ },
318
+ {
319
+ "loss": 0.5985645294189453,
320
+ "grad_norm": 0.4281597137451172,
321
+ "learning_rate": 3.662118575121024e-05,
322
+ "entropy": 0.5953911445538203,
323
+ "num_tokens": 24156885.0,
324
+ "mean_token_accuracy": 0.859769054253896,
325
+ "epoch": 0.3663691874119305,
326
+ "step": 650
327
+ },
328
+ {
329
+ "loss": 0.5921562957763672,
330
+ "grad_norm": 0.43555110692977905,
331
+ "learning_rate": 3.5599891303579746e-05,
332
+ "entropy": 0.5895072638988494,
333
+ "num_tokens": 25104558.0,
334
+ "mean_token_accuracy": 0.860583526690801,
335
+ "epoch": 0.38046031000469704,
336
+ "step": 675
337
+ },
338
+ {
339
+ "loss": 0.5891357421875,
340
+ "grad_norm": 0.46079888939857483,
341
+ "learning_rate": 3.455655035899951e-05,
342
+ "entropy": 0.5860749536752701,
343
+ "num_tokens": 26027947.0,
344
+ "mean_token_accuracy": 0.8607413911819458,
345
+ "epoch": 0.3945514325974636,
346
+ "step": 700
347
+ },
348
+ {
349
+ "eval_loss": 0.5854880213737488,
350
+ "eval_runtime": 34.8333,
351
+ "eval_samples_per_second": 14.354,
352
+ "eval_steps_per_second": 0.919,
353
+ "eval_entropy": 0.5855442956089973,
354
+ "eval_num_tokens": 26027947.0,
355
+ "eval_mean_token_accuracy": 0.8601280357688665,
356
+ "epoch": 0.3945514325974636,
357
+ "step": 700
358
+ },
359
+ {
360
+ "loss": 0.5821422576904297,
361
+ "grad_norm": 0.42215803265571594,
362
+ "learning_rate": 3.349333294094369e-05,
363
+ "entropy": 0.5828985869884491,
364
+ "num_tokens": 26996995.0,
365
+ "mean_token_accuracy": 0.8623941914240519,
366
+ "epoch": 0.40864255519023013,
367
+ "step": 725
368
+ },
369
+ {
370
+ "loss": 0.5734980392456055,
371
+ "grad_norm": 0.4118139147758484,
372
+ "learning_rate": 3.241245041355675e-05,
373
+ "entropy": 0.5695036280155182,
374
+ "num_tokens": 27948817.0,
375
+ "mean_token_accuracy": 0.8648126033941904,
376
+ "epoch": 0.4227336777829967,
377
+ "step": 750
378
+ },
379
+ {
380
+ "loss": 0.5755558776855468,
381
+ "grad_norm": 0.40968823432922363,
382
+ "learning_rate": 3.131615088228249e-05,
383
+ "entropy": 0.5767549270391464,
384
+ "num_tokens": 28893932.0,
385
+ "mean_token_accuracy": 0.8637475728988647,
386
+ "epoch": 0.43682480037576327,
387
+ "step": 775
388
+ },
389
+ {
390
+ "loss": 0.573729248046875,
391
+ "grad_norm": 0.4324798583984375,
392
+ "learning_rate": 3.0206714518075486e-05,
393
+ "entropy": 0.5696792916456859,
394
+ "num_tokens": 29833216.0,
395
+ "mean_token_accuracy": 0.8643758261203766,
396
+ "epoch": 0.45091592296852984,
397
+ "step": 800
398
+ },
399
+ {
400
+ "eval_loss": 0.5752155780792236,
401
+ "eval_runtime": 34.8957,
402
+ "eval_samples_per_second": 14.328,
403
+ "eval_steps_per_second": 0.917,
404
+ "eval_entropy": 0.5935880783945322,
405
+ "eval_num_tokens": 29833216.0,
406
+ "eval_mean_token_accuracy": 0.8622864987701178,
407
+ "epoch": 0.45091592296852984,
408
+ "step": 800
409
+ },
410
+ {
411
+ "loss": 0.5751077270507813,
412
+ "grad_norm": 0.4815407693386078,
413
+ "learning_rate": 2.9086448814920242e-05,
414
+ "entropy": 0.5717160554726919,
415
+ "num_tokens": 30736838.0,
416
+ "mean_token_accuracy": 0.864310040473938,
417
+ "epoch": 0.4650070455612964,
418
+ "step": 825
419
+ },
420
+ {
421
+ "loss": 0.566772575378418,
422
+ "grad_norm": 0.4774300158023834,
423
+ "learning_rate": 2.7957683790521676e-05,
424
+ "entropy": 0.5650917081038157,
425
+ "num_tokens": 31659300.0,
426
+ "mean_token_accuracy": 0.8658999156951904,
427
+ "epoch": 0.4790981681540629,
428
+ "step": 850
429
+ },
430
+ {
431
+ "loss": 0.5626054382324219,
432
+ "grad_norm": 0.42420145869255066,
433
+ "learning_rate": 2.6822767140148987e-05,
434
+ "entropy": 0.5590727700789769,
435
+ "num_tokens": 32593580.0,
436
+ "mean_token_accuracy": 0.8666303022702535,
437
+ "epoch": 0.4931892907468295,
438
+ "step": 875
439
+ },
440
+ {
441
+ "loss": 0.5539141082763672,
442
+ "grad_norm": 0.47889477014541626,
443
+ "learning_rate": 2.5684059353712307e-05,
444
+ "entropy": 0.5530497090021769,
445
+ "num_tokens": 33494838.0,
446
+ "mean_token_accuracy": 0.8674623111883799,
447
+ "epoch": 0.5072804133395961,
448
+ "step": 900
449
+ },
450
+ {
451
+ "eval_loss": 0.5693426728248596,
452
+ "eval_runtime": 34.8643,
453
+ "eval_samples_per_second": 14.341,
454
+ "eval_steps_per_second": 0.918,
455
+ "eval_entropy": 0.5721144182607532,
456
+ "eval_num_tokens": 33494838.0,
457
+ "eval_mean_token_accuracy": 0.8636170122772455,
458
+ "epoch": 0.5072804133395961,
459
+ "step": 900
460
+ },
461
+ {
462
+ "loss": 0.5634239959716797,
463
+ "grad_norm": 0.47955217957496643,
464
+ "learning_rate": 2.4543928806228074e-05,
465
+ "entropy": 0.562302614847819,
466
+ "num_tokens": 34443337.0,
467
+ "mean_token_accuracy": 0.8664345097541809,
468
+ "epoch": 0.5213715359323626,
469
+ "step": 925
470
+ },
471
+ {
472
+ "loss": 0.5764046096801758,
473
+ "grad_norm": 0.4992325007915497,
474
+ "learning_rate": 2.340474683188429e-05,
475
+ "entropy": 0.570437356432279,
476
+ "num_tokens": 35385705.0,
477
+ "mean_token_accuracy": 0.8647123599052429,
478
+ "epoch": 0.5354626585251292,
479
+ "step": 950
480
+ },
481
+ {
482
+ "loss": 0.5535079956054687,
483
+ "grad_norm": 0.5063010454177856,
484
+ "learning_rate": 2.2268882791951127e-05,
485
+ "entropy": 0.5491390575965246,
486
+ "num_tokens": 36300339.0,
487
+ "mean_token_accuracy": 0.8694652744134267,
488
+ "epoch": 0.5495537811178958,
489
+ "step": 975
490
+ },
491
+ {
492
+ "loss": 0.5499050521850586,
493
+ "grad_norm": 0.45809435844421387,
494
+ "learning_rate": 2.1138699146794867e-05,
495
+ "entropy": 0.5487177085876465,
496
+ "num_tokens": 37231011.0,
497
+ "mean_token_accuracy": 0.8694357828299204,
498
+ "epoch": 0.5636449037106623,
499
+ "step": 1000
500
+ },
501
+ {
502
+ "eval_loss": 0.5629469752311707,
503
+ "eval_runtime": 34.8051,
504
+ "eval_samples_per_second": 14.366,
505
+ "eval_steps_per_second": 0.919,
506
+ "eval_entropy": 0.5682820733636618,
507
+ "eval_num_tokens": 37231011.0,
508
+ "eval_mean_token_accuracy": 0.8649211004376411,
509
+ "epoch": 0.5636449037106623,
510
+ "step": 1000
511
+ },
512
+ {
513
+ "loss": 0.5626242446899414,
514
+ "grad_norm": 0.4490196108818054,
515
+ "learning_rate": 2.001654654224499e-05,
516
+ "entropy": 0.5606711500883103,
517
+ "num_tokens": 38163978.0,
518
+ "mean_token_accuracy": 0.8663252631823222,
519
+ "epoch": 0.5777360263034288,
520
+ "step": 1025
521
+ },
522
+ {
523
+ "loss": 0.5654045867919922,
524
+ "grad_norm": 0.4703851044178009,
525
+ "learning_rate": 1.8904758920533988e-05,
526
+ "entropy": 0.5644157862663269,
527
+ "num_tokens": 39100488.0,
528
+ "mean_token_accuracy": 0.8655120352904002,
529
+ "epoch": 0.5918271488961954,
530
+ "step": 1050
531
+ },
532
+ {
533
+ "loss": 0.5633118057250976,
534
+ "grad_norm": 0.507513165473938,
535
+ "learning_rate": 1.780564866597872e-05,
536
+ "entropy": 0.5595145153999329,
537
+ "num_tokens": 40015513.0,
538
+ "mean_token_accuracy": 0.8673883573214213,
539
+ "epoch": 0.6059182714889619,
540
+ "step": 1075
541
+ },
542
+ {
543
+ "loss": 0.5580905532836914,
544
+ "grad_norm": 0.48125702142715454,
545
+ "learning_rate": 1.67215017954996e-05,
546
+ "entropy": 0.5561687298615774,
547
+ "num_tokens": 40947677.0,
548
+ "mean_token_accuracy": 0.8681903723875681,
549
+ "epoch": 0.6200093940817285,
550
+ "step": 1100
551
+ },
552
+ {
553
+ "eval_loss": 0.5575993657112122,
554
+ "eval_runtime": 34.8521,
555
+ "eval_samples_per_second": 14.346,
556
+ "eval_steps_per_second": 0.918,
557
+ "eval_entropy": 0.5570412985980511,
558
+ "eval_num_tokens": 40947677.0,
559
+ "eval_mean_token_accuracy": 0.8659888282418251,
560
+ "epoch": 0.6200093940817285,
561
+ "step": 1100
562
+ },
563
+ {
564
+ "loss": 0.5523509979248047,
565
+ "grad_norm": 0.4786842167377472,
566
+ "learning_rate": 1.5654573203980784e-05,
567
+ "entropy": 0.5489772335688273,
568
+ "num_tokens": 41870358.0,
569
+ "mean_token_accuracy": 0.8689925694465637,
570
+ "epoch": 0.6341005166744951,
571
+ "step": 1125
572
+ },
573
+ {
574
+ "loss": 0.5597280883789062,
575
+ "grad_norm": 0.4649102985858917,
576
+ "learning_rate": 1.4607081974360465e-05,
577
+ "entropy": 0.5547034672896067,
578
+ "num_tokens": 42797849.0,
579
+ "mean_token_accuracy": 0.8681511521339417,
580
+ "epoch": 0.6481916392672616,
581
+ "step": 1150
582
+ },
583
+ {
584
+ "loss": 0.5571650695800782,
585
+ "grad_norm": 0.5057896971702576,
586
+ "learning_rate": 1.3581206762205706e-05,
587
+ "entropy": 0.5546683881680171,
588
+ "num_tokens": 43733188.0,
589
+ "mean_token_accuracy": 0.8681851788361867,
590
+ "epoch": 0.6622827618600282,
591
+ "step": 1175
592
+ },
593
+ {
594
+ "loss": 0.5405771255493164,
595
+ "grad_norm": 0.45025017857551575,
596
+ "learning_rate": 1.257908126437129e-05,
597
+ "entropy": 0.5370355778932572,
598
+ "num_tokens": 44647535.0,
599
+ "mean_token_accuracy": 0.8716498986879985,
600
+ "epoch": 0.6763738844527948,
601
+ "step": 1200
602
+ },
603
+ {
604
+ "eval_loss": 0.5538516640663147,
605
+ "eval_runtime": 34.8956,
606
+ "eval_samples_per_second": 14.328,
607
+ "eval_steps_per_second": 0.917,
608
+ "eval_entropy": 0.561651473864913,
609
+ "eval_num_tokens": 44647535.0,
610
+ "eval_mean_token_accuracy": 0.8666044622659683,
611
+ "epoch": 0.6763738844527948,
612
+ "step": 1200
613
+ },
614
+ {
615
+ "loss": 0.5540570831298828,
616
+ "grad_norm": 0.5011326670646667,
617
+ "learning_rate": 1.1602789781167347e-05,
618
+ "entropy": 0.5510254645347595,
619
+ "num_tokens": 45550031.0,
620
+ "mean_token_accuracy": 0.8685724465052287,
621
+ "epoch": 0.6904650070455612,
622
+ "step": 1225
623
+ },
624
+ {
625
+ "loss": 0.5520057296752929,
626
+ "grad_norm": 0.4687948226928711,
627
+ "learning_rate": 1.0654362881265754e-05,
628
+ "entropy": 0.549973030090332,
629
+ "num_tokens": 46479776.0,
630
+ "mean_token_accuracy": 0.8689675887425741,
631
+ "epoch": 0.7045561296383278,
632
+ "step": 1250
633
+ },
634
+ {
635
+ "loss": 0.5608898544311524,
636
+ "grad_norm": 0.5059524178504944,
637
+ "learning_rate": 9.735773178361964e-06,
638
+ "entropy": 0.5597832387685776,
639
+ "num_tokens": 47436308.0,
640
+ "mean_token_accuracy": 0.867308827638626,
641
+ "epoch": 0.7186472522310944,
642
+ "step": 1275
643
+ },
644
+ {
645
+ "loss": 0.5428831100463867,
646
+ "grad_norm": 0.5085554718971252,
647
+ "learning_rate": 8.848931228376136e-06,
648
+ "entropy": 0.5423163912693659,
649
+ "num_tokens": 48366020.0,
650
+ "mean_token_accuracy": 0.8694934193293253,
651
+ "epoch": 0.732738374823861,
652
+ "step": 1300
653
+ },
654
+ {
655
+ "eval_loss": 0.5501593947410583,
656
+ "eval_runtime": 34.8825,
657
+ "eval_samples_per_second": 14.334,
658
+ "eval_steps_per_second": 0.917,
659
+ "eval_entropy": 0.5527484444901347,
660
+ "eval_num_tokens": 48366020.0,
661
+ "eval_mean_token_accuracy": 0.8676421549171209,
662
+ "epoch": 0.732738374823861,
663
+ "step": 1300
664
+ },
665
+ {
666
+ "loss": 0.5539481353759765,
667
+ "grad_norm": 0.5734500288963318,
668
+ "learning_rate": 7.99568155572701e-06,
669
+ "entropy": 0.5485140432914098,
670
+ "num_tokens": 49280534.0,
671
+ "mean_token_accuracy": 0.8694952615102132,
672
+ "epoch": 0.7468294974166275,
673
+ "step": 1325
674
+ },
675
+ {
676
+ "loss": 0.5532180404663086,
677
+ "grad_norm": 0.4714227020740509,
678
+ "learning_rate": 7.177798816943287e-06,
679
+ "entropy": 0.5533179378509522,
680
+ "num_tokens": 50216029.0,
681
+ "mean_token_accuracy": 0.8688394419352213,
682
+ "epoch": 0.7609206200093941,
683
+ "step": 1350
684
+ },
685
+ {
686
+ "loss": 0.5513345336914063,
687
+ "grad_norm": 0.553312361240387,
688
+ "learning_rate": 6.3969841095918445e-06,
689
+ "entropy": 0.5489596172173818,
690
+ "num_tokens": 51157602.0,
691
+ "mean_token_accuracy": 0.8692836586634318,
692
+ "epoch": 0.7750117426021607,
693
+ "step": 1375
694
+ },
695
+ {
696
+ "loss": 0.5497291564941407,
697
+ "grad_norm": 0.4988526999950409,
698
+ "learning_rate": 5.654861434199757e-06,
699
+ "entropy": 0.5469332609574,
700
+ "num_tokens": 52117292.0,
701
+ "mean_token_accuracy": 0.8688764305909474,
702
+ "epoch": 0.7891028651949272,
703
+ "step": 1400
704
+ },
705
+ {
706
+ "eval_loss": 0.5481391549110413,
707
+ "eval_runtime": 34.9193,
708
+ "eval_samples_per_second": 14.319,
709
+ "eval_steps_per_second": 0.916,
710
+ "eval_entropy": 0.5496508749201894,
711
+ "eval_num_tokens": 52117292.0,
712
+ "eval_mean_token_accuracy": 0.8679818995296955,
713
+ "epoch": 0.7891028651949272,
714
+ "step": 1400
715
+ },
716
+ {
717
+ "loss": 0.541988639831543,
718
+ "grad_norm": 0.49807438254356384,
719
+ "learning_rate": 4.952974316528833e-06,
720
+ "entropy": 0.5369369254509608,
721
+ "num_tokens": 53073200.0,
722
+ "mean_token_accuracy": 0.8717625530560812,
723
+ "epoch": 0.8031939877876938,
724
+ "step": 1425
725
+ },
726
+ {
727
+ "loss": 0.5437938308715821,
728
+ "grad_norm": 0.5071395635604858,
729
+ "learning_rate": 4.292782597227962e-06,
730
+ "entropy": 0.5442028508583705,
731
+ "num_tokens": 54003611.0,
732
+ "mean_token_accuracy": 0.8705387047926585,
733
+ "epoch": 0.8172851103804603,
734
+ "step": 1450
735
+ },
736
+ {
737
+ "loss": 0.5272453689575195,
738
+ "grad_norm": 0.4754573404788971,
739
+ "learning_rate": 3.67565939554044e-06,
740
+ "entropy": 0.5260281827052434,
741
+ "num_tokens": 54934829.0,
742
+ "mean_token_accuracy": 0.873881352742513,
743
+ "epoch": 0.8313762329732268,
744
+ "step": 1475
745
+ },
746
+ {
747
+ "loss": 0.5533076095581054,
748
+ "grad_norm": 0.4666975140571594,
749
+ "learning_rate": 3.1028882533813643e-06,
750
+ "entropy": 0.5506138996283213,
751
+ "num_tokens": 55878834.0,
752
+ "mean_token_accuracy": 0.8685371776421865,
753
+ "epoch": 0.8454673555659934,
754
+ "step": 1500
755
+ },
756
+ {
757
+ "eval_loss": 0.5465222001075745,
758
+ "eval_runtime": 34.9157,
759
+ "eval_samples_per_second": 14.32,
760
+ "eval_steps_per_second": 0.916,
761
+ "eval_entropy": 0.5545311672613025,
762
+ "eval_num_tokens": 55878834.0,
763
+ "eval_mean_token_accuracy": 0.8680466562509537,
764
+ "epoch": 0.8454673555659934,
765
+ "step": 1500
766
+ },
767
+ {
768
+ "loss": 0.5680919265747071,
769
+ "grad_norm": 0.4815196990966797,
770
+ "learning_rate": 2.57566046572508e-06,
771
+ "entropy": 0.5671820533275604,
772
+ "num_tokens": 56792774.0,
773
+ "mean_token_accuracy": 0.8665999062856038,
774
+ "epoch": 0.85955847815876,
775
+ "step": 1525
776
+ },
777
+ {
778
+ "loss": 0.5526847839355469,
779
+ "grad_norm": 0.5315864086151123,
780
+ "learning_rate": 2.0950726028551306e-06,
781
+ "entropy": 0.5491122953097025,
782
+ "num_tokens": 57715569.0,
783
+ "mean_token_accuracy": 0.8698907673358918,
784
+ "epoch": 0.8736496007515265,
785
+ "step": 1550
786
+ },
787
+ {
788
+ "loss": 0.5509255599975585,
789
+ "grad_norm": 0.4776453375816345,
790
+ "learning_rate": 1.6621242296301964e-06,
791
+ "entropy": 0.5463390636444092,
792
+ "num_tokens": 58638742.0,
793
+ "mean_token_accuracy": 0.8696528116861979,
794
+ "epoch": 0.8877407233442931,
795
+ "step": 1575
796
+ },
797
+ {
798
+ "loss": 0.5376947021484375,
799
+ "grad_norm": 0.5089407563209534,
800
+ "learning_rate": 1.2777158265095901e-06,
801
+ "entropy": 0.5351915061473846,
802
+ "num_tokens": 59557570.0,
803
+ "mean_token_accuracy": 0.8726084315776825,
804
+ "epoch": 0.9018318459370597,
805
+ "step": 1600
806
+ },
807
+ {
808
+ "eval_loss": 0.5455822944641113,
809
+ "eval_runtime": 34.9436,
810
+ "eval_samples_per_second": 14.309,
811
+ "eval_steps_per_second": 0.916,
812
+ "eval_entropy": 0.5497067291289568,
813
+ "eval_num_tokens": 59557570.0,
814
+ "eval_mean_token_accuracy": 0.8682057596743107,
815
+ "epoch": 0.9018318459370597,
816
+ "step": 1600
817
+ },
818
+ {
819
+ "loss": 0.5281303787231445,
820
+ "grad_norm": 0.49538421630859375,
821
+ "learning_rate": 9.426469166623764e-07,
822
+ "entropy": 0.5247322716315588,
823
+ "num_tokens": 60483216.0,
824
+ "mean_token_accuracy": 0.874461769660314,
825
+ "epoch": 0.9159229685298262,
826
+ "step": 1625
827
+ },
828
+ {
829
+ "loss": 0.5454143524169922,
830
+ "grad_norm": 0.49807706475257874,
831
+ "learning_rate": 6.576144030555259e-07,
832
+ "entropy": 0.5443872211376826,
833
+ "num_tokens": 61433543.0,
834
+ "mean_token_accuracy": 0.8711319859822592,
835
+ "epoch": 0.9300140911225928,
836
+ "step": 1650
837
+ },
838
+ {
839
+ "loss": 0.5425717926025391,
840
+ "grad_norm": 0.4815911650657654,
841
+ "learning_rate": 4.2321111897965784e-07,
842
+ "entropy": 0.5405582892894745,
843
+ "num_tokens": 62383514.0,
844
+ "mean_token_accuracy": 0.8713100798924764,
845
+ "epoch": 0.9441052137153593,
846
+ "step": 1675
847
+ },
848
+ {
849
+ "loss": 0.5433485412597656,
850
+ "grad_norm": 0.7678675055503845,
851
+ "learning_rate": 2.399245950272466e-07,
852
+ "entropy": 0.5402486324310303,
853
+ "num_tokens": 63316188.0,
854
+ "mean_token_accuracy": 0.8717403117815653,
855
+ "epoch": 0.9581963363081258,
856
+ "step": 1700
857
+ },
858
+ {
859
+ "eval_loss": 0.5451184511184692,
860
+ "eval_runtime": 34.9697,
861
+ "eval_samples_per_second": 14.298,
862
+ "eval_steps_per_second": 0.915,
863
+ "eval_entropy": 0.5487884283065796,
864
+ "eval_num_tokens": 63316188.0,
865
+ "eval_mean_token_accuracy": 0.8685821667313576,
866
+ "epoch": 0.9581963363081258,
867
+ "step": 1700
868
+ },
869
+ {
870
+ "loss": 0.5411444473266601,
871
+ "grad_norm": 0.6061132550239563,
872
+ "learning_rate": 1.0813604508771169e-07,
873
+ "entropy": 0.5386804081996281,
874
+ "num_tokens": 64243151.0,
875
+ "mean_token_accuracy": 0.8711121753851573,
876
+ "epoch": 0.9722874589008924,
877
+ "step": 1725
878
+ },
879
+ {
880
+ "loss": 0.5380656433105468,
881
+ "grad_norm": 0.5095033645629883,
882
+ "learning_rate": 2.811957346845473e-08,
883
+ "entropy": 0.5319451389710108,
884
+ "num_tokens": 65152283.0,
885
+ "mean_token_accuracy": 0.8721891554196676,
886
+ "epoch": 0.986378581493659,
887
+ "step": 1750
888
+ },
889
+ {
890
+ "loss": 0.5395606231689453,
891
+ "grad_norm": 5.098243713378906,
892
+ "learning_rate": 4.160479090409286e-11,
893
+ "entropy": 0.5309042213291958,
894
+ "num_tokens": 66051107.0,
895
+ "mean_token_accuracy": 0.8716224238790314,
896
+ "epoch": 1.0,
897
+ "step": 1775
898
+ },
899
+ {
900
+ "train_runtime": 40018.5607,
901
+ "train_samples_per_second": 4.256,
902
+ "train_steps_per_second": 0.044,
903
+ "total_flos": 8.291891439265674e+18,
904
+ "train_loss": 0.6173567452229245,
905
+ "epoch": 1.0,
906
+ "step": 1775
907
+ }
908
+ ]
training_metadata.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "google/gemma-2-9b-it",
3
+ "display_name": "Gemma 2 9B (BF16, Batch16 MaxSafe)",
4
+ "timestamp": "2026-02-04T13:14:38.929340",
5
+ "training_config": {
6
+ "num_train_epochs": 1,
7
+ "per_device_train_batch_size": 16,
8
+ "gradient_accumulation_steps": 6,
9
+ "learning_rate": 5e-05,
10
+ "warmup_ratio": 0.03,
11
+ "lr_scheduler_type": "cosine",
12
+ "weight_decay": 0.01,
13
+ "max_seq_length": 2048,
14
+ "logging_steps": 25,
15
+ "eval_steps": 100,
16
+ "save_steps": 200,
17
+ "seed": 42,
18
+ "bf16": true,
19
+ "optim": "adamw_torch_fused",
20
+ "dataloader_num_workers": 8,
21
+ "torch_compile": false
22
+ },
23
+ "lora_config": {
24
+ "r": 16,
25
+ "lora_alpha": 32,
26
+ "lora_dropout": 0.05,
27
+ "target_modules": [
28
+ "q_proj",
29
+ "k_proj",
30
+ "v_proj",
31
+ "o_proj",
32
+ "gate_proj",
33
+ "up_proj",
34
+ "down_proj"
35
+ ],
36
+ "bias": "none",
37
+ "task_type": "CAUSAL_LM"
38
+ },
39
+ "train_loss": 0.6173567452229245,
40
+ "train_samples": 170305,
41
+ "val_samples": 8965,
42
+ "train_time_minutes": 666.9830995202065,
43
+ "max_memory_gb": 77.72561597824097,
44
+ "fix_applied": "YAML normalization via PyYAML (2 spaces), packing=False, Native BF16 Training, Batch 96"
45
+ }