finalform committed on Nov 9, 2025

Commit

d0b9c22

verified ·

1 Parent(s): 90431f1

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +4 -0
README.md +4 -3
adapter_config.json +5 -5
adapter_model.safetensors +1 -1
checkpoint-104/adapter_config.json +6 -6
checkpoint-104/adapter_model.safetensors +1 -1
checkpoint-104/optimizer.pt +1 -1
checkpoint-104/rng_state_0.pth +1 -1
checkpoint-104/rng_state_1.pth +1 -1
checkpoint-104/rng_state_2.pth +1 -1
checkpoint-104/rng_state_3.pth +1 -1
checkpoint-104/scheduler.pt +2 -2
checkpoint-104/special_tokens_map.json +7 -1
checkpoint-104/tokenizer_config.json +1 -1
checkpoint-104/trainer_state.json +37 -101
checkpoint-104/training_args.bin +1 -1
checkpoint-208/adapter_config.json +6 -6
checkpoint-208/adapter_model.safetensors +1 -1
checkpoint-208/optimizer.pt +1 -1
checkpoint-208/rng_state_0.pth +1 -1
checkpoint-208/rng_state_1.pth +1 -1
checkpoint-208/rng_state_2.pth +1 -1
checkpoint-208/rng_state_3.pth +1 -1
checkpoint-208/scheduler.pt +2 -2
checkpoint-208/special_tokens_map.json +7 -1
checkpoint-208/tokenizer_config.json +1 -1
checkpoint-208/trainer_state.json +72 -200
checkpoint-208/training_args.bin +1 -1
checkpoint-312/adapter_config.json +6 -6
checkpoint-312/adapter_model.safetensors +1 -1
checkpoint-312/optimizer.pt +1 -1
checkpoint-312/rng_state_0.pth +1 -1
checkpoint-312/rng_state_1.pth +1 -1
checkpoint-312/rng_state_2.pth +1 -1
checkpoint-312/rng_state_3.pth +1 -1
checkpoint-312/scheduler.pt +1 -1
checkpoint-312/special_tokens_map.json +7 -1
checkpoint-312/tokenizer_config.json +1 -1
checkpoint-312/trainer_state.json +105 -306
checkpoint-312/training_args.bin +1 -1
checkpoint-416/README.md +209 -0
checkpoint-416/adapter_config.json +42 -0
checkpoint-416/adapter_model.safetensors +3 -0
checkpoint-416/added_tokens.json +24 -0
checkpoint-416/chat_template.jinja +54 -0
checkpoint-416/merges.txt +0 -0
checkpoint-416/optimizer.pt +3 -0
checkpoint-416/rng_state_0.pth +3 -0
checkpoint-416/rng_state_1.pth +3 -0
checkpoint-416/rng_state_2.pth +3 -0

.gitattributes CHANGED Viewed

@@ -41,3 +41,7 @@ checkpoint-312/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoint-52/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoint-364/tokenizer.json filter=lfs diff=lfs merge=lfs -text

 checkpoint-52/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoint-364/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-416/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-520/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-624/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-728/tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,16 +1,17 @@
 ---
 base_model: Qwen/Qwen2.5-7B-Instruct
 library_name: peft
-model_name: foamqwen
 tags:
 - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
 - lora
 - sft
 - transformers
 - trl
-licence: license
-pipeline_tag: text-generation
 ---
 # Model Card for foamqwen

 ---
 base_model: Qwen/Qwen2.5-7B-Instruct
 library_name: peft
+pipeline_tag: text-generation
 tags:
 - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
 - lora
 - sft
 - transformers
 - trl
 ---
+### Framework versions
+- PEFT 0.17.0
 # Model Card for foamqwen

adapter_config.json CHANGED Viewed

@@ -15,7 +15,7 @@
   "loftq_config": {},
   "lora_alpha": 16,
   "lora_bias": false,
-  "lora_dropout": 0.05,
   "megatron_config": null,
   "megatron_core": "megatron.core",
   "modules_to_save": null,
@@ -25,13 +25,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "gate_proj",
-    "o_proj",
-    "up_proj",
     "v_proj",
     "k_proj",
     "down_proj",
-    "q_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",

   "loftq_config": {},
   "lora_alpha": 16,
   "lora_bias": false,
+  "lora_dropout": 0.1,
   "megatron_config": null,
   "megatron_core": "megatron.core",
   "modules_to_save": null,
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
     "v_proj",
     "k_proj",
+    "up_proj",
     "down_proj",
+    "q_proj",
+    "o_proj",
+    "gate_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2c931ff65e7868a951e93334044f8b054bc3d954b4b2b9bb6db80e00db07c4c8
 size 645975704

 version https://git-lfs.github.com/spec/v1
+oid sha256:c5711c28d3d33aa09d94c62c9a136b8bf0f0cdbd49f90528dd206ab969f2ec3e
 size 645975704

checkpoint-104/adapter_config.json CHANGED Viewed

@@ -15,7 +15,7 @@
   "loftq_config": {},
   "lora_alpha": 16,
   "lora_bias": false,
-  "lora_dropout": 0.05,
   "megatron_config": null,
   "megatron_core": "megatron.core",
   "modules_to_save": null,
@@ -25,13 +25,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "gate_proj",
     "o_proj",
-    "up_proj",
     "v_proj",
-    "k_proj",
-    "down_proj",
-    "q_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",

   "loftq_config": {},
   "lora_alpha": 16,
   "lora_bias": false,
+  "lora_dropout": 0.1,
   "megatron_config": null,
   "megatron_core": "megatron.core",
   "modules_to_save": null,
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
+    "down_proj",
+    "k_proj",
     "o_proj",
+    "q_proj",
     "v_proj",
+    "up_proj",
+    "gate_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",

checkpoint-104/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bde6f6c34ecfc667be607f66c0be3608b265b13ddd5bec606209d49f6a9ad377
 size 645975704

 version https://git-lfs.github.com/spec/v1
+oid sha256:4fc6cd409c955d35b5a6a620baedd9530a4c5f73f68bbe3082ddb660de6919d8
 size 645975704

checkpoint-104/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:58cc59bc26368fda46e4328622054e26e2a4e77ddf312428c353676f9e983a21
 size 1292087115

 version https://git-lfs.github.com/spec/v1
+oid sha256:5478a87659e3fd833f6e0be022f6cade6754457ac9844e658a304f95edb93418
 size 1292087115

checkpoint-104/rng_state_0.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8514828f9d8aab559ae0f4b6b1989545764b4095d99270ac55d236f3a4ce2751
 size 15429

 version https://git-lfs.github.com/spec/v1
+oid sha256:7b46d4a17bc33ca1534ecbb381b92bb917feb262b6cd6ea1e0aeef66ab3378a5
 size 15429

checkpoint-104/rng_state_1.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c9baf0418fd508a533260a12df08c5a04eb2c254426d0e852be3e63d32fd9aa7
 size 15429

 version https://git-lfs.github.com/spec/v1
+oid sha256:f8ea02c0f84cb7b79a7d01dc1ad12d59feb06a7206324175c7723c9e9e70ef38
 size 15429

checkpoint-104/rng_state_2.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e074ab2936100c00e26b1c10f116f1537ba6440f80d4ea504962bf27db9f8936
 size 15429

 version https://git-lfs.github.com/spec/v1
+oid sha256:0510c2eb154e655092dfd2b66e653579331ca3559570cdc26dff724493936f08
 size 15429

checkpoint-104/rng_state_3.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:604a1bcdea4bd8ceb2e9b35fa59830f94b6c7359f43a2d33f4f7f6c7e6284710
 size 15429

 version https://git-lfs.github.com/spec/v1
+oid sha256:04ef7e6fbd41972ea743427f306fa0e581982d06dd2fd9a83bc6f1f6e4371346
 size 15429

checkpoint-104/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:50de828614ef29ce7400a822ce49ac96711a9eb7d83ddde8521d2ecc0b064dfb
-size 1465

 version https://git-lfs.github.com/spec/v1
+oid sha256:a7973bd6977b296bd79b6e5eab7d855b5c9117eea90fc9a6d871d376ce4ddb2d
+size 1401

checkpoint-104/special_tokens_map.json CHANGED Viewed

@@ -21,5 +21,11 @@
     "rstrip": false,
     "single_word": false
   },
-  "pad_token": "<|im_end|>"
 }

     "rstrip": false,
     "single_word": false
   },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
 }

checkpoint-104/tokenizer_config.json CHANGED Viewed

@@ -200,7 +200,7 @@
   "errors": "replace",
   "extra_special_tokens": {},
   "model_max_length": 131072,
-  "pad_token": "<|im_end|>",
   "split_special_tokens": false,
   "tokenizer_class": "Qwen2Tokenizer",
   "unk_token": null

   "errors": "replace",
   "extra_special_tokens": {},
   "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
   "split_special_tokens": false,
   "tokenizer_class": "Qwen2Tokenizer",
   "unk_token": null

checkpoint-104/trainer_state.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 2.0,
   "eval_steps": 500,
   "global_step": 104,
   "is_hyper_param_search": false,
@@ -10,118 +10,54 @@
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.1927710843373494,
-      "grad_norm": 95.26730346679688,
-      "learning_rate": 0.0004180909090909091,
-      "loss": 22.4465,
-      "mean_token_accuracy": 0.08969678990542888,
-      "num_tokens": 1310720.0,
-      "step": 10
     },
     {
-      "epoch": 0.3855421686746988,
-      "grad_norm": 0.0,
-      "learning_rate": 0.0005103526968014265,
-      "loss": 0.8012,
-      "mean_token_accuracy": 0.13804710581898688,
-      "num_tokens": 2621440.0,
-      "step": 20
-    },
-    {
-      "epoch": 0.5783132530120482,
-      "grad_norm": 0.0,
-      "learning_rate": 0.0005077286477833616,
-      "loss": 452.9577,
-      "mean_token_accuracy": 0.05160275483503938,
-      "num_tokens": 3932160.0,
-      "step": 30
-    },
-    {
-      "epoch": 0.7710843373493976,
-      "grad_norm": 0.0,
-      "learning_rate": 0.0005031081504278389,
-      "loss": 470.5136,
-      "mean_token_accuracy": 0.03822226445190609,
-      "num_tokens": 5242880.0,
-      "step": 40
-    },
-    {
-      "epoch": 0.963855421686747,
-      "grad_norm": 26.303752899169922,
-      "learning_rate": 0.0004965277770447238,
-      "loss": 167.1384,
-      "mean_token_accuracy": 0.057517293840646744,
-      "num_tokens": 6553600.0,
       "step": 50
     },
     {
-      "epoch": 1.0,
-      "eval_loss": NaN,
-      "eval_mean_token_accuracy": 0.13677339731378757,
-      "eval_num_tokens": 6713344.0,
-      "eval_runtime": 8.9806,
-      "eval_samples_per_second": 41.089,
-      "eval_steps_per_second": 5.233,
-      "step": 52
-    },
-    {
-      "epoch": 1.1542168674698796,
-      "grad_norm": 28.755094528198242,
-      "learning_rate": 0.00048803961281790017,
-      "loss": 27.9726,
-      "mean_token_accuracy": 0.03075966710531259,
-      "num_tokens": 7761920.0,
-      "step": 60
-    },
-    {
-      "epoch": 1.346987951807229,
-      "grad_norm": 0.9693858027458191,
-      "learning_rate": 0.000477710843538941,
-      "loss": 2.2869,
-      "mean_token_accuracy": 0.10747051909565926,
-      "num_tokens": 9072640.0,
-      "step": 70
-    },
-    {
-      "epoch": 1.5397590361445783,
-      "grad_norm": 0.36548200249671936,
-      "learning_rate": 0.0004656232238159615,
-      "loss": 60.0031,
-      "mean_token_accuracy": 0.10124717205762863,
-      "num_tokens": 10383360.0,
-      "step": 80
     },
     {
-      "epoch": 1.7325301204819277,
-      "grad_norm": 0.8749092817306519,
-      "learning_rate": 0.0004518724299669051,
-      "loss": 0.8994,
-      "mean_token_accuracy": 0.1551567144691944,
-      "num_tokens": 11694080.0,
-      "step": 90
-    },
-    {
-      "epoch": 1.9253012048192772,
-      "grad_norm": 0.0,
-      "learning_rate": 0.0004365673027192623,
-      "loss": 2.2759,
-      "mean_token_accuracy": 0.13096993789076805,
-      "num_tokens": 13004800.0,
       "step": 100
     },
     {
-      "epoch": 2.0,
-      "eval_loss": NaN,
-      "eval_mean_token_accuracy": 0.14378934084100925,
-      "eval_num_tokens": 13426688.0,
-      "eval_runtime": 8.9689,
-      "eval_samples_per_second": 41.142,
-      "eval_steps_per_second": 5.24,
       "step": 104
     }
   ],
-  "logging_steps": 10,
-  "max_steps": 364,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 7,
   "save_steps": 500,
@@ -137,7 +73,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 5.900837915658813e+17,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 1.0,
   "eval_steps": 500,
   "global_step": 104,
   "is_hyper_param_search": false,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.24096385542168675,
+      "grad_norm": 0.2236759215593338,
+      "learning_rate": 0.000511,
+      "loss": 0.4204,
+      "mean_token_accuracy": 0.900120057463646,
+      "num_tokens": 567991.0,
+      "step": 25
     },
     {
+      "epoch": 0.4819277108433735,
+      "grad_norm": 0.1322442889213562,
+      "learning_rate": 0.000511,
+      "loss": 0.2913,
+      "mean_token_accuracy": 0.9270081639289856,
+      "num_tokens": 1135343.0,
       "step": 50
     },
     {
+      "epoch": 0.7228915662650602,
+      "grad_norm": 0.19739408791065216,
+      "learning_rate": 0.000511,
+      "loss": 0.2186,
+      "mean_token_accuracy": 0.9418566429615021,
+      "num_tokens": 1703784.0,
+      "step": 75
     },
     {
+      "epoch": 0.963855421686747,
+      "grad_norm": 0.17215745151042938,
+      "learning_rate": 0.000511,
+      "loss": 0.1963,
+      "mean_token_accuracy": 0.9479192215204238,
+      "num_tokens": 2269891.0,
       "step": 100
     },
     {
+      "epoch": 1.0,
+      "eval_loss": 0.19681453704833984,
+      "eval_mean_token_accuracy": 0.9478744319144715,
+      "eval_num_tokens": 2345494.0,
+      "eval_runtime": 4.3187,
+      "eval_samples_per_second": 85.442,
+      "eval_steps_per_second": 10.883,
       "step": 104
     }
   ],
+  "logging_steps": 25,
+  "max_steps": 728,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 7,
   "save_steps": 500,
       "attributes": {}
     }
   },
+  "total_flos": 1.0256509033879962e+17,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null

checkpoint-104/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:eb24a2e21c60358ea1e7c21423d18ace82a12d90960084fd56c8a7388a71c974
 size 6097

 version https://git-lfs.github.com/spec/v1
+oid sha256:a92f33d5ca39ba292c6b171cffeb00f4d1c361214bebb8604f8ce3482d3b7c8c
 size 6097

checkpoint-208/adapter_config.json CHANGED Viewed

@@ -15,7 +15,7 @@
   "loftq_config": {},
   "lora_alpha": 16,
   "lora_bias": false,
-  "lora_dropout": 0.05,
   "megatron_config": null,
   "megatron_core": "megatron.core",
   "modules_to_save": null,
@@ -25,13 +25,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "gate_proj",
     "o_proj",
-    "up_proj",
     "v_proj",
-    "k_proj",
-    "down_proj",
-    "q_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",

   "loftq_config": {},
   "lora_alpha": 16,
   "lora_bias": false,
+  "lora_dropout": 0.1,
   "megatron_config": null,
   "megatron_core": "megatron.core",
   "modules_to_save": null,
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
+    "down_proj",
+    "k_proj",
     "o_proj",
+    "q_proj",
     "v_proj",
+    "up_proj",
+    "gate_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",

checkpoint-208/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b0ac19c8de28684631474a0ad72407294c096a0f549e773c3d1fa82a19be6276
 size 645975704

 version https://git-lfs.github.com/spec/v1
+oid sha256:0bf478a6f256502ea7fff6dbca497e8460a17d13004420d85853719e2329b272
 size 645975704

checkpoint-208/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cb5932db4edcbb15546d9f3cab8a3cbbcac452bef68d11ef0b0a0ed08a4fa764
 size 1292087115

 version https://git-lfs.github.com/spec/v1
+oid sha256:815bd40667d86d5a3beced54254a4fdff5e07d5682c6a4b1907679b709d56d9d
 size 1292087115

checkpoint-208/rng_state_0.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7839e9b785d9556bc2a3ce31deff96475c6acae542ae5b9b51fbaaafd3b4a372
 size 15429

 version https://git-lfs.github.com/spec/v1
+oid sha256:3adf91ff8bafb6d2e3300a7c332f71e91ce8b3ec728f0e2aab37908de663b1b8
 size 15429

checkpoint-208/rng_state_1.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0976d571e1588d4916ac2d24c7fb1989d36b6419adbca862ff01a6c30d4d6007
 size 15429

 version https://git-lfs.github.com/spec/v1
+oid sha256:8c70dde40156bddc38880631183ca59dc710551eb7a7733ad9d585cb374e86b3
 size 15429

checkpoint-208/rng_state_2.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a5b0b2b4ce9a1d4eee257302169deb7cf87f1a6dd6ab912fd5f7706ebaacde81
 size 15429

 version https://git-lfs.github.com/spec/v1
+oid sha256:04aca530bed491901589d495872e054e18dea79299a5f18f260913d260faa876
 size 15429

checkpoint-208/rng_state_3.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a6c367e71d78a211df461b18e7fd6a7dc0cbbb7f2f9a71f69b92b578bbf8b510
 size 15429

 version https://git-lfs.github.com/spec/v1
+oid sha256:46b68311a8f36f1ffecea2c67c06bb30acb6b2d0c53572628d4d32cf4d54e271
 size 15429

checkpoint-208/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:408b45ea3dc39af31cb89d47706eff24014968d56252734cb0fd6c20d70c755a
-size 1465

 version https://git-lfs.github.com/spec/v1
+oid sha256:b24bf8a41c3e3c688a38ba117e3127352bafa556de7e01cc189f2855569e6d7a
+size 1401

checkpoint-208/special_tokens_map.json CHANGED Viewed

@@ -21,5 +21,11 @@
     "rstrip": false,
     "single_word": false
   },
-  "pad_token": "<|im_end|>"
 }

     "rstrip": false,
     "single_word": false
   },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
 }

checkpoint-208/tokenizer_config.json CHANGED Viewed

@@ -200,7 +200,7 @@
   "errors": "replace",
   "extra_special_tokens": {},
   "model_max_length": 131072,
-  "pad_token": "<|im_end|>",
   "split_special_tokens": false,
   "tokenizer_class": "Qwen2Tokenizer",
   "unk_token": null

   "errors": "replace",
   "extra_special_tokens": {},
   "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
   "split_special_tokens": false,
   "tokenizer_class": "Qwen2Tokenizer",
   "unk_token": null

checkpoint-208/trainer_state.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 4.0,
   "eval_steps": 500,
   "global_step": 208,
   "is_hyper_param_search": false,
@@ -10,228 +10,100 @@
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.1927710843373494,
-      "grad_norm": 95.26730346679688,
-      "learning_rate": 0.0004180909090909091,
-      "loss": 22.4465,
-      "mean_token_accuracy": 0.08969678990542888,
-      "num_tokens": 1310720.0,
-      "step": 10
-    },
-    {
-      "epoch": 0.3855421686746988,
-      "grad_norm": 0.0,
-      "learning_rate": 0.0005103526968014265,
-      "loss": 0.8012,
-      "mean_token_accuracy": 0.13804710581898688,
-      "num_tokens": 2621440.0,
-      "step": 20
-    },
-    {
-      "epoch": 0.5783132530120482,
-      "grad_norm": 0.0,
-      "learning_rate": 0.0005077286477833616,
-      "loss": 452.9577,
-      "mean_token_accuracy": 0.05160275483503938,
-      "num_tokens": 3932160.0,
-      "step": 30
-    },
-    {
-      "epoch": 0.7710843373493976,
-      "grad_norm": 0.0,
-      "learning_rate": 0.0005031081504278389,
-      "loss": 470.5136,
-      "mean_token_accuracy": 0.03822226445190609,
-      "num_tokens": 5242880.0,
-      "step": 40
-    },
-    {
-      "epoch": 0.963855421686747,
-      "grad_norm": 26.303752899169922,
-      "learning_rate": 0.0004965277770447238,
-      "loss": 167.1384,
-      "mean_token_accuracy": 0.057517293840646744,
-      "num_tokens": 6553600.0,
       "step": 50
     },
     {
-      "epoch": 1.0,
-      "eval_loss": NaN,
-      "eval_mean_token_accuracy": 0.13677339731378757,
-      "eval_num_tokens": 6713344.0,
-      "eval_runtime": 8.9806,
-      "eval_samples_per_second": 41.089,
-      "eval_steps_per_second": 5.233,
-      "step": 52
     },
     {
-      "epoch": 1.1542168674698796,
-      "grad_norm": 28.755094528198242,
-      "learning_rate": 0.00048803961281790017,
-      "loss": 27.9726,
-      "mean_token_accuracy": 0.03075966710531259,
-      "num_tokens": 7761920.0,
-      "step": 60
-    },
-    {
-      "epoch": 1.346987951807229,
-      "grad_norm": 0.9693858027458191,
-      "learning_rate": 0.000477710843538941,
-      "loss": 2.2869,
-      "mean_token_accuracy": 0.10747051909565926,
-      "num_tokens": 9072640.0,
-      "step": 70
-    },
-    {
-      "epoch": 1.5397590361445783,
-      "grad_norm": 0.36548200249671936,
-      "learning_rate": 0.0004656232238159615,
-      "loss": 60.0031,
-      "mean_token_accuracy": 0.10124717205762863,
-      "num_tokens": 10383360.0,
-      "step": 80
-    },
-    {
-      "epoch": 1.7325301204819277,
-      "grad_norm": 0.8749092817306519,
-      "learning_rate": 0.0004518724299669051,
-      "loss": 0.8994,
-      "mean_token_accuracy": 0.1551567144691944,
-      "num_tokens": 11694080.0,
-      "step": 90
-    },
-    {
-      "epoch": 1.9253012048192772,
-      "grad_norm": 0.0,
-      "learning_rate": 0.0004365673027192623,
-      "loss": 2.2759,
-      "mean_token_accuracy": 0.13096993789076805,
-      "num_tokens": 13004800.0,
       "step": 100
     },
     {
-      "epoch": 2.0,
-      "eval_loss": NaN,
-      "eval_mean_token_accuracy": 0.14378934084100925,
-      "eval_num_tokens": 13426688.0,
-      "eval_runtime": 8.9689,
-      "eval_samples_per_second": 41.142,
-      "eval_steps_per_second": 5.24,
       "step": 104
     },
     {
-      "epoch": 2.1156626506024097,
-      "grad_norm": 0.6895984411239624,
-      "learning_rate": 0.0004198289857104298,
-      "loss": 0.2064,
-      "mean_token_accuracy": 0.08719592305678356,
-      "num_tokens": 14213120.0,
-      "step": 110
-    },
-    {
-      "epoch": 2.3084337349397592,
-      "grad_norm": 0.7038294672966003,
-      "learning_rate": 0.0004017899666076801,
-      "loss": 1.3155,
-      "mean_token_accuracy": 0.1053241491317749,
-      "num_tokens": 15523840.0,
-      "step": 120
     },
     {
-      "epoch": 2.5012048192771084,
-      "grad_norm": 27.594745635986328,
-      "learning_rate": 0.0003825930284374996,
-      "loss": 0.0836,
-      "mean_token_accuracy": 0.07201291918754578,
-      "num_tokens": 16834560.0,
-      "step": 130
-    },
-    {
-      "epoch": 2.693975903614458,
-      "grad_norm": 0.0,
-      "learning_rate": 0.00036239011942476655,
-      "loss": 1.364,
-      "mean_token_accuracy": 0.15817394778132438,
-      "num_tokens": 18145280.0,
-      "step": 140
-    },
-    {
-      "epoch": 2.886746987951807,
-      "grad_norm": 0.0,
-      "learning_rate": 0.00034134115028725524,
-      "loss": 3.5977,
-      "mean_token_accuracy": 0.10589548945426941,
-      "num_tokens": 19456000.0,
       "step": 150
     },
     {
-      "epoch": 3.0,
-      "eval_loss": NaN,
-      "eval_mean_token_accuracy": 0.14411297884393245,
-      "eval_num_tokens": 20140032.0,
-      "eval_runtime": 8.9831,
-      "eval_samples_per_second": 41.077,
-      "eval_steps_per_second": 5.232,
-      "step": 156
     },
     {
-      "epoch": 3.07710843373494,
-      "grad_norm": 0.1902359277009964,
-      "learning_rate": 0.0003196127285051592,
-      "loss": 8.9424,
-      "mean_token_accuracy": 0.062061098557484304,
-      "num_tokens": 20664320.0,
-      "step": 160
-    },
-    {
-      "epoch": 3.269879518072289,
-      "grad_norm": 0.3390277326107025,
-      "learning_rate": 0.00029737683958418377,
-      "loss": 12.1371,
-      "mean_token_accuracy": 0.07337962239980697,
-      "num_tokens": 21975040.0,
-      "step": 170
-    },
-    {
-      "epoch": 3.4626506024096386,
-      "grad_norm": 0.0,
-      "learning_rate": 0.00027480948575031854,
-      "loss": 42.6417,
-      "mean_token_accuracy": 0.08556168600916862,
-      "num_tokens": 23285760.0,
-      "step": 180
-    },
-    {
-      "epoch": 3.6554216867469878,
-      "grad_norm": 0.0,
-      "learning_rate": 0.0002520892928513346,
-      "loss": 1.5423,
-      "mean_token_accuracy": 0.13269576877355577,
-      "num_tokens": 24596480.0,
-      "step": 190
-    },
-    {
-      "epoch": 3.8481927710843373,
-      "grad_norm": 0.19443857669830322,
-      "learning_rate": 0.0002293960964917063,
-      "loss": 0.3356,
-      "mean_token_accuracy": 0.17251307517290115,
-      "num_tokens": 25907200.0,
       "step": 200
     },
     {
-      "epoch": 4.0,
-      "eval_loss": NaN,
-      "eval_mean_token_accuracy": 0.14541077867467353,
-      "eval_num_tokens": 26853376.0,
-      "eval_runtime": 8.9721,
-      "eval_samples_per_second": 41.127,
-      "eval_steps_per_second": 5.238,
       "step": 208
     }
   ],
-  "logging_steps": 10,
-  "max_steps": 364,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 7,
   "save_steps": 500,
@@ -247,7 +119,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.1801675852792463e+18,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 2.0,
   "eval_steps": 500,
   "global_step": 208,
   "is_hyper_param_search": false,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.24096385542168675,
+      "grad_norm": 0.2236759215593338,
+      "learning_rate": 0.000511,
+      "loss": 0.4204,
+      "mean_token_accuracy": 0.900120057463646,
+      "num_tokens": 567991.0,
+      "step": 25
+    },
+    {
+      "epoch": 0.4819277108433735,
+      "grad_norm": 0.1322442889213562,
+      "learning_rate": 0.000511,
+      "loss": 0.2913,
+      "mean_token_accuracy": 0.9270081639289856,
+      "num_tokens": 1135343.0,
       "step": 50
     },
     {
+      "epoch": 0.7228915662650602,
+      "grad_norm": 0.19739408791065216,
+      "learning_rate": 0.000511,
+      "loss": 0.2186,
+      "mean_token_accuracy": 0.9418566429615021,
+      "num_tokens": 1703784.0,
+      "step": 75
     },
     {
+      "epoch": 0.963855421686747,
+      "grad_norm": 0.17215745151042938,
+      "learning_rate": 0.000511,
+      "loss": 0.1963,
+      "mean_token_accuracy": 0.9479192215204238,
+      "num_tokens": 2269891.0,
       "step": 100
     },
     {
+      "epoch": 1.0,
+      "eval_loss": 0.19681453704833984,
+      "eval_mean_token_accuracy": 0.9478744319144715,
+      "eval_num_tokens": 2345494.0,
+      "eval_runtime": 4.3187,
+      "eval_samples_per_second": 85.442,
+      "eval_steps_per_second": 10.883,
       "step": 104
     },
     {
+      "epoch": 1.202409638554217,
+      "grad_norm": 0.10346771776676178,
+      "learning_rate": 0.000511,
+      "loss": 0.165,
+      "mean_token_accuracy": 0.9550067053900825,
+      "num_tokens": 2836234.0,
+      "step": 125
     },
     {
+      "epoch": 1.4433734939759035,
+      "grad_norm": 0.0941459909081459,
+      "learning_rate": 0.000511,
+      "loss": 0.1445,
+      "mean_token_accuracy": 0.9606501096487046,
+      "num_tokens": 3403671.0,
       "step": 150
     },
     {
+      "epoch": 1.6843373493975904,
+      "grad_norm": 0.07419874519109726,
+      "learning_rate": 0.000511,
+      "loss": 0.1184,
+      "mean_token_accuracy": 0.9665295648574829,
+      "num_tokens": 3972278.0,
+      "step": 175
     },
     {
+      "epoch": 1.9253012048192772,
+      "grad_norm": 0.08383649587631226,
+      "learning_rate": 0.000511,
+      "loss": 0.1309,
+      "mean_token_accuracy": 0.9640201306343079,
+      "num_tokens": 4538970.0,
       "step": 200
     },
     {
+      "epoch": 2.0,
+      "eval_loss": 0.16037927567958832,
+      "eval_mean_token_accuracy": 0.9557588328706458,
+      "eval_num_tokens": 4690728.0,
+      "eval_runtime": 4.2478,
+      "eval_samples_per_second": 86.868,
+      "eval_steps_per_second": 11.065,
       "step": 208
     }
   ],
+  "logging_steps": 25,
+  "max_steps": 728,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 7,
   "save_steps": 500,
       "attributes": {}
     }
   },
+  "total_flos": 2.0514658423144448e+17,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null

checkpoint-208/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:eb24a2e21c60358ea1e7c21423d18ace82a12d90960084fd56c8a7388a71c974
 size 6097

 version https://git-lfs.github.com/spec/v1
+oid sha256:a92f33d5ca39ba292c6b171cffeb00f4d1c361214bebb8604f8ce3482d3b7c8c
 size 6097

checkpoint-312/adapter_config.json CHANGED Viewed

@@ -15,7 +15,7 @@
   "loftq_config": {},
   "lora_alpha": 16,
   "lora_bias": false,
-  "lora_dropout": 0.05,
   "megatron_config": null,
   "megatron_core": "megatron.core",
   "modules_to_save": null,
@@ -25,13 +25,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "gate_proj",
     "o_proj",
-    "up_proj",
     "v_proj",
-    "k_proj",
-    "down_proj",
-    "q_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",

   "loftq_config": {},
   "lora_alpha": 16,
   "lora_bias": false,
+  "lora_dropout": 0.1,
   "megatron_config": null,
   "megatron_core": "megatron.core",
   "modules_to_save": null,
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
+    "down_proj",
+    "k_proj",
     "o_proj",
+    "q_proj",
     "v_proj",
+    "up_proj",
+    "gate_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",

checkpoint-312/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:553445b0bc853e965e43188ff4f3af2066675d8b2aa535b50f463ce82e72c5ce
 size 645975704

 version https://git-lfs.github.com/spec/v1
+oid sha256:062140058de69da2ff74416b96e510ff3ea8e3630e3cfa2414ae1fa5bed530bd
 size 645975704

checkpoint-312/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d86273ba53204d574c3c13f3700de47e0480acbb2994ae06d3fd7b4d44cc1a05
 size 1292087499

 version https://git-lfs.github.com/spec/v1
+oid sha256:a282343663fb90d99583879419e008ecf5ff31aa87f4664cfb11cd42543b327a
 size 1292087499

checkpoint-312/rng_state_0.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:503880c3e18b0a0a7d070feaf37d865b85b4c8cbe4833f2dc572248b7556301e
 size 15429

 version https://git-lfs.github.com/spec/v1
+oid sha256:60094a06d6d79464dba44020816cd1c2f7e2a5da0bd09c1e533ad3eddb688564
 size 15429

checkpoint-312/rng_state_1.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f75a725dd878216d7b7084e23c639cb7108c692d446c5a9195ffea137d301dbc
 size 15429

 version https://git-lfs.github.com/spec/v1
+oid sha256:785ee1730140ccaba6453ba94a5a713f346a9c29e9b86ce8e7c83f6634525222
 size 15429

checkpoint-312/rng_state_2.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e276e3be8d04824150633c0ebed694dcf7871c57f168fa21f75f97b608dff4ad
 size 15429

 version https://git-lfs.github.com/spec/v1
+oid sha256:8fc115052040f16323733a9ece8dd57daa47ec295a6c498facac0b395731b471
 size 15429

checkpoint-312/rng_state_3.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:acf5a9a218be990f3c3d05fb0163857515aa84ba415c75d204f587b23c31dbc2
 size 15429

 version https://git-lfs.github.com/spec/v1
+oid sha256:49dc588331596d74bb4f1f27781ca80a1dfff453105267c466abff7513f86cff
 size 15429

checkpoint-312/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:88c1f19fbaac09a7b01b826d2a3eb05434d8b50a36c13d838feee781e2642515
 size 1465

 version https://git-lfs.github.com/spec/v1
+oid sha256:7f4e419a1d6b526779af1dd5f4f57538634cc30d6affb1f8eceaed3cbe949aa4
 size 1465

checkpoint-312/special_tokens_map.json CHANGED Viewed

@@ -21,5 +21,11 @@
     "rstrip": false,
     "single_word": false
   },
-  "pad_token": "<|im_end|>"
 }

     "rstrip": false,
     "single_word": false
   },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
 }

checkpoint-312/tokenizer_config.json CHANGED Viewed

@@ -200,7 +200,7 @@
   "errors": "replace",
   "extra_special_tokens": {},
   "model_max_length": 131072,
-  "pad_token": "<|im_end|>",
   "split_special_tokens": false,
   "tokenizer_class": "Qwen2Tokenizer",
   "unk_token": null

   "errors": "replace",
   "extra_special_tokens": {},
   "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
   "split_special_tokens": false,
   "tokenizer_class": "Qwen2Tokenizer",
   "unk_token": null

checkpoint-312/trainer_state.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 6.0,
   "eval_steps": 500,
   "global_step": 312,
   "is_hyper_param_search": false,
@@ -10,347 +10,146 @@
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.1927710843373494,
-      "grad_norm": 95.26730346679688,
-      "learning_rate": 0.0004180909090909091,
-      "loss": 22.4465,
-      "mean_token_accuracy": 0.08969678990542888,
-      "num_tokens": 1310720.0,
-      "step": 10
-    },
-    {
-      "epoch": 0.3855421686746988,
-      "grad_norm": 0.0,
-      "learning_rate": 0.0005103526968014265,
-      "loss": 0.8012,
-      "mean_token_accuracy": 0.13804710581898688,
-      "num_tokens": 2621440.0,
-      "step": 20
-    },
-    {
-      "epoch": 0.5783132530120482,
-      "grad_norm": 0.0,
-      "learning_rate": 0.0005077286477833616,
-      "loss": 452.9577,
-      "mean_token_accuracy": 0.05160275483503938,
-      "num_tokens": 3932160.0,
-      "step": 30
-    },
-    {
-      "epoch": 0.7710843373493976,
-      "grad_norm": 0.0,
-      "learning_rate": 0.0005031081504278389,
-      "loss": 470.5136,
-      "mean_token_accuracy": 0.03822226445190609,
-      "num_tokens": 5242880.0,
-      "step": 40
-    },
-    {
-      "epoch": 0.963855421686747,
-      "grad_norm": 26.303752899169922,
-      "learning_rate": 0.0004965277770447238,
-      "loss": 167.1384,
-      "mean_token_accuracy": 0.057517293840646744,
-      "num_tokens": 6553600.0,
       "step": 50
     },
     {
-      "epoch": 1.0,
-      "eval_loss": NaN,
-      "eval_mean_token_accuracy": 0.13677339731378757,
-      "eval_num_tokens": 6713344.0,
-      "eval_runtime": 8.9806,
-      "eval_samples_per_second": 41.089,
-      "eval_steps_per_second": 5.233,
-      "step": 52
-    },
-    {
-      "epoch": 1.1542168674698796,
-      "grad_norm": 28.755094528198242,
-      "learning_rate": 0.00048803961281790017,
-      "loss": 27.9726,
-      "mean_token_accuracy": 0.03075966710531259,
-      "num_tokens": 7761920.0,
-      "step": 60
     },
     {
-      "epoch": 1.346987951807229,
-      "grad_norm": 0.9693858027458191,
-      "learning_rate": 0.000477710843538941,
-      "loss": 2.2869,
-      "mean_token_accuracy": 0.10747051909565926,
-      "num_tokens": 9072640.0,
-      "step": 70
-    },
-    {
-      "epoch": 1.5397590361445783,
-      "grad_norm": 0.36548200249671936,
-      "learning_rate": 0.0004656232238159615,
-      "loss": 60.0031,
-      "mean_token_accuracy": 0.10124717205762863,
-      "num_tokens": 10383360.0,
-      "step": 80
-    },
-    {
-      "epoch": 1.7325301204819277,
-      "grad_norm": 0.8749092817306519,
-      "learning_rate": 0.0004518724299669051,
-      "loss": 0.8994,
-      "mean_token_accuracy": 0.1551567144691944,
-      "num_tokens": 11694080.0,
-      "step": 90
-    },
-    {
-      "epoch": 1.9253012048192772,
-      "grad_norm": 0.0,
-      "learning_rate": 0.0004365673027192623,
-      "loss": 2.2759,
-      "mean_token_accuracy": 0.13096993789076805,
-      "num_tokens": 13004800.0,
       "step": 100
     },
     {
-      "epoch": 2.0,
-      "eval_loss": NaN,
-      "eval_mean_token_accuracy": 0.14378934084100925,
-      "eval_num_tokens": 13426688.0,
-      "eval_runtime": 8.9689,
-      "eval_samples_per_second": 41.142,
-      "eval_steps_per_second": 5.24,
       "step": 104
     },
     {
-      "epoch": 2.1156626506024097,
-      "grad_norm": 0.6895984411239624,
-      "learning_rate": 0.0004198289857104298,
-      "loss": 0.2064,
-      "mean_token_accuracy": 0.08719592305678356,
-      "num_tokens": 14213120.0,
-      "step": 110
     },
     {
-      "epoch": 2.3084337349397592,
-      "grad_norm": 0.7038294672966003,
-      "learning_rate": 0.0004017899666076801,
-      "loss": 1.3155,
-      "mean_token_accuracy": 0.1053241491317749,
-      "num_tokens": 15523840.0,
-      "step": 120
-    },
-    {
-      "epoch": 2.5012048192771084,
-      "grad_norm": 27.594745635986328,
-      "learning_rate": 0.0003825930284374996,
-      "loss": 0.0836,
-      "mean_token_accuracy": 0.07201291918754578,
-      "num_tokens": 16834560.0,
-      "step": 130
-    },
-    {
-      "epoch": 2.693975903614458,
-      "grad_norm": 0.0,
-      "learning_rate": 0.00036239011942476655,
-      "loss": 1.364,
-      "mean_token_accuracy": 0.15817394778132438,
-      "num_tokens": 18145280.0,
-      "step": 140
-    },
-    {
-      "epoch": 2.886746987951807,
-      "grad_norm": 0.0,
-      "learning_rate": 0.00034134115028725524,
-      "loss": 3.5977,
-      "mean_token_accuracy": 0.10589548945426941,
-      "num_tokens": 19456000.0,
       "step": 150
     },
     {
-      "epoch": 3.0,
-      "eval_loss": NaN,
-      "eval_mean_token_accuracy": 0.14411297884393245,
-      "eval_num_tokens": 20140032.0,
-      "eval_runtime": 8.9831,
-      "eval_samples_per_second": 41.077,
-      "eval_steps_per_second": 5.232,
-      "step": 156
-    },
-    {
-      "epoch": 3.07710843373494,
-      "grad_norm": 0.1902359277009964,
-      "learning_rate": 0.0003196127285051592,
-      "loss": 8.9424,
-      "mean_token_accuracy": 0.062061098557484304,
-      "num_tokens": 20664320.0,
-      "step": 160
-    },
-    {
-      "epoch": 3.269879518072289,
-      "grad_norm": 0.3390277326107025,
-      "learning_rate": 0.00029737683958418377,
-      "loss": 12.1371,
-      "mean_token_accuracy": 0.07337962239980697,
-      "num_tokens": 21975040.0,
-      "step": 170
-    },
-    {
-      "epoch": 3.4626506024096386,
-      "grad_norm": 0.0,
-      "learning_rate": 0.00027480948575031854,
-      "loss": 42.6417,
-      "mean_token_accuracy": 0.08556168600916862,
-      "num_tokens": 23285760.0,
-      "step": 180
     },
     {
-      "epoch": 3.6554216867469878,
-      "grad_norm": 0.0,
-      "learning_rate": 0.0002520892928513346,
-      "loss": 1.5423,
-      "mean_token_accuracy": 0.13269576877355577,
-      "num_tokens": 24596480.0,
-      "step": 190
-    },
-    {
-      "epoch": 3.8481927710843373,
-      "grad_norm": 0.19443857669830322,
-      "learning_rate": 0.0002293960964917063,
-      "loss": 0.3356,
-      "mean_token_accuracy": 0.17251307517290115,
-      "num_tokens": 25907200.0,
       "step": 200
     },
     {
-      "epoch": 4.0,
-      "eval_loss": NaN,
-      "eval_mean_token_accuracy": 0.14541077867467353,
-      "eval_num_tokens": 26853376.0,
-      "eval_runtime": 8.9721,
-      "eval_samples_per_second": 41.127,
-      "eval_steps_per_second": 5.238,
       "step": 208
     },
     {
-      "epoch": 4.03855421686747,
-      "grad_norm": 0.009991697035729885,
-      "learning_rate": 0.00020690951859202796,
-      "loss": 0.4818,
-      "mean_token_accuracy": 0.16193881593173062,
-      "num_tokens": 27115520.0,
-      "step": 210
-    },
-    {
-      "epoch": 4.231325301204819,
-      "grad_norm": 0.0,
-      "learning_rate": 0.0001848075456397883,
-      "loss": 0.3988,
-      "mean_token_accuracy": 0.12297056466341019,
-      "num_tokens": 28426240.0,
-      "step": 220
-    },
-    {
-      "epoch": 4.424096385542168,
-      "grad_norm": 0.007613173220306635,
-      "learning_rate": 0.00016326511988497662,
-      "loss": 0.0274,
-      "mean_token_accuracy": 0.11160993352532386,
-      "num_tokens": 29736960.0,
-      "step": 230
     },
     {
-      "epoch": 4.6168674698795185,
-      "grad_norm": 0.0015245635295286775,
-      "learning_rate": 0.0001424527546315377,
-      "loss": 2.8338,
-      "mean_token_accuracy": 0.06221988797187805,
-      "num_tokens": 31047680.0,
-      "step": 240
-    },
-    {
-      "epoch": 4.809638554216868,
-      "grad_norm": 0.26247891783714294,
-      "learning_rate": 0.00012253518458496144,
-      "loss": 0.2528,
-      "mean_token_accuracy": 0.09861943274736404,
-      "num_tokens": 32358400.0,
       "step": 250
     },
     {
-      "epoch": 5.0,
-      "grad_norm": 0.00404323311522603,
-      "learning_rate": 0.00010367006193882396,
-      "loss": 0.2094,
-      "mean_token_accuracy": 0.13716269520264637,
-      "num_tokens": 33566720.0,
-      "step": 260
-    },
-    {
-      "epoch": 5.0,
-      "eval_loss": NaN,
-      "eval_mean_token_accuracy": 0.14629162118789998,
-      "eval_num_tokens": 33566720.0,
-      "eval_runtime": 9.0068,
-      "eval_samples_per_second": 40.969,
-      "eval_steps_per_second": 5.218,
-      "step": 260
-    },
-    {
-      "epoch": 5.192771084337349,
-      "grad_norm": 0.05429690331220627,
-      "learning_rate": 8.600670852105292e-05,
-      "loss": 1.0623,
-      "mean_token_accuracy": 0.09910124614834785,
-      "num_tokens": 34877440.0,
-      "step": 270
-    },
-    {
-      "epoch": 5.385542168674699,
-      "grad_norm": 0.0,
-      "learning_rate": 6.968493387697466e-05,
-      "loss": 1.6628,
-      "mean_token_accuracy": 0.12442896366119385,
-      "num_tokens": 36188160.0,
-      "step": 280
-    },
-    {
-      "epoch": 5.578313253012048,
-      "grad_norm": 0.01778862252831459,
-      "learning_rate": 5.483392864428595e-05,
-      "loss": 0.027,
-      "mean_token_accuracy": 0.14972642660140992,
-      "num_tokens": 37498880.0,
-      "step": 290
     },
     {
-      "epoch": 5.771084337349397,
-      "grad_norm": 0.045461323112249374,
-      "learning_rate": 4.1571241979147114e-05,
-      "loss": 2.1638,
-      "mean_token_accuracy": 0.07477418482303619,
-      "num_tokens": 38809600.0,
       "step": 300
     },
     {
-      "epoch": 5.9638554216867465,
-      "grad_norm": 0.0,
-      "learning_rate": 3.000185112730528e-05,
-      "loss": 0.4219,
-      "mean_token_accuracy": 0.09914562478661537,
-      "num_tokens": 40120320.0,
-      "step": 310
-    },
-    {
-      "epoch": 6.0,
-      "eval_loss": NaN,
-      "eval_mean_token_accuracy": 0.14657345097115698,
-      "eval_num_tokens": 40280064.0,
-      "eval_runtime": 8.9922,
-      "eval_samples_per_second": 41.036,
-      "eval_steps_per_second": 5.227,
       "step": 312
     }
   ],
-  "logging_steps": 10,
-  "max_steps": 364,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 7,
   "save_steps": 500,
@@ -366,7 +165,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.7702513789926113e+18,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 3.0,
   "eval_steps": 500,
   "global_step": 312,
   "is_hyper_param_search": false,
   "is_world_process_zero": true,
   "log_history": [
     {
+      "epoch": 0.24096385542168675,
+      "grad_norm": 0.2236759215593338,
+      "learning_rate": 0.000511,
+      "loss": 0.4204,
+      "mean_token_accuracy": 0.900120057463646,
+      "num_tokens": 567991.0,
+      "step": 25
+    },
+    {
+      "epoch": 0.4819277108433735,
+      "grad_norm": 0.1322442889213562,
+      "learning_rate": 0.000511,
+      "loss": 0.2913,
+      "mean_token_accuracy": 0.9270081639289856,
+      "num_tokens": 1135343.0,
       "step": 50
     },
     {
+      "epoch": 0.7228915662650602,
+      "grad_norm": 0.19739408791065216,
+      "learning_rate": 0.000511,
+      "loss": 0.2186,
+      "mean_token_accuracy": 0.9418566429615021,
+      "num_tokens": 1703784.0,
+      "step": 75
     },
     {
+      "epoch": 0.963855421686747,
+      "grad_norm": 0.17215745151042938,
+      "learning_rate": 0.000511,
+      "loss": 0.1963,
+      "mean_token_accuracy": 0.9479192215204238,
+      "num_tokens": 2269891.0,
       "step": 100
     },
     {
+      "epoch": 1.0,
+      "eval_loss": 0.19681453704833984,
+      "eval_mean_token_accuracy": 0.9478744319144715,
+      "eval_num_tokens": 2345494.0,
+      "eval_runtime": 4.3187,
+      "eval_samples_per_second": 85.442,
+      "eval_steps_per_second": 10.883,
       "step": 104
     },
     {
+      "epoch": 1.202409638554217,
+      "grad_norm": 0.10346771776676178,
+      "learning_rate": 0.000511,
+      "loss": 0.165,
+      "mean_token_accuracy": 0.9550067053900825,
+      "num_tokens": 2836234.0,
+      "step": 125
     },
     {
+      "epoch": 1.4433734939759035,
+      "grad_norm": 0.0941459909081459,
+      "learning_rate": 0.000511,
+      "loss": 0.1445,
+      "mean_token_accuracy": 0.9606501096487046,
+      "num_tokens": 3403671.0,
       "step": 150
     },
     {
+      "epoch": 1.6843373493975904,
+      "grad_norm": 0.07419874519109726,
+      "learning_rate": 0.000511,
+      "loss": 0.1184,
+      "mean_token_accuracy": 0.9665295648574829,
+      "num_tokens": 3972278.0,
+      "step": 175
     },
     {
+      "epoch": 1.9253012048192772,
+      "grad_norm": 0.08383649587631226,
+      "learning_rate": 0.000511,
+      "loss": 0.1309,
+      "mean_token_accuracy": 0.9640201306343079,
+      "num_tokens": 4538970.0,
       "step": 200
     },
     {
+      "epoch": 2.0,
+      "eval_loss": 0.16037927567958832,
+      "eval_mean_token_accuracy": 0.9557588328706458,
+      "eval_num_tokens": 4690728.0,
+      "eval_runtime": 4.2478,
+      "eval_samples_per_second": 86.868,
+      "eval_steps_per_second": 11.065,
       "step": 208
     },
     {
+      "epoch": 2.163855421686747,
+      "grad_norm": 0.09131479263305664,
+      "learning_rate": 0.000511,
+      "loss": 0.1127,
+      "mean_token_accuracy": 0.9709722676662483,
+      "num_tokens": 5091564.0,
+      "step": 225
     },
     {
+      "epoch": 2.404819277108434,
+      "grad_norm": 0.09491455554962158,
+      "learning_rate": 0.000511,
+      "loss": 0.1007,
+      "mean_token_accuracy": 0.9710033702850341,
+      "num_tokens": 5659070.0,
       "step": 250
     },
     {
+      "epoch": 2.6457831325301204,
+      "grad_norm": 0.07198868691921234,
+      "learning_rate": 0.000511,
+      "loss": 0.0858,
+      "mean_token_accuracy": 0.9747626584768295,
+      "num_tokens": 6228488.0,
+      "step": 275
     },
     {
+      "epoch": 2.886746987951807,
+      "grad_norm": 0.07914356887340546,
+      "learning_rate": 0.000511,
+      "loss": 0.0961,
+      "mean_token_accuracy": 0.9724871903657913,
+      "num_tokens": 6795848.0,
       "step": 300
     },
     {
+      "epoch": 3.0,
+      "eval_loss": 0.16401147842407227,
+      "eval_mean_token_accuracy": 0.958565741143328,
+      "eval_num_tokens": 7036588.0,
+      "eval_runtime": 4.2348,
+      "eval_samples_per_second": 87.135,
+      "eval_steps_per_second": 11.099,
       "step": 312
     }
   ],
+  "logging_steps": 25,
+  "max_steps": 728,
   "num_input_tokens_seen": 0,
   "num_train_epochs": 7,
   "save_steps": 500,
       "attributes": {}
     }
   },
+  "total_flos": 3.077171409898701e+17,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null

checkpoint-312/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:eb24a2e21c60358ea1e7c21423d18ace82a12d90960084fd56c8a7388a71c974
 size 6097

 version https://git-lfs.github.com/spec/v1
+oid sha256:a92f33d5ca39ba292c6b171cffeb00f4d1c361214bebb8604f8ce3482d3b7c8c
 size 6097

checkpoint-416/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.17.0

checkpoint-416/adapter_config.json ADDED Viewed

	@@ -0,0 +1,42 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "k_proj",
+    "o_proj",
+    "q_proj",
+    "v_proj",
+    "up_proj",
+    "gate_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoint-416/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:542d542fbe24ec80310418b793ff4ba1972ae6587fdf5669491fc92c83b08a09
+size 645975704

checkpoint-416/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoint-416/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoint-416/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-416/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7b165c5df29ba238a2a6fe6aee452efc2a9acfdba35ea32f0b467fd2d02c5353
+size 1292087499

checkpoint-416/rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:152c5038eb73ee59d2dde5d5b103ac1fbd66c3a40e654eb5c0300cac7dbc116d
+size 15429

checkpoint-416/rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4bf88dea0c41ff8af9c7036f185396537d2c81ba9abbc7a1f1b60ece0652d2ad
+size 15429

checkpoint-416/rng_state_2.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8efe00d663f3c5a76caedcb1606763b501b981c15ca59077f9933614d1cf693e
+size 15429