Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +4 -0
- README.md +4 -3
- adapter_config.json +5 -5
- adapter_model.safetensors +1 -1
- checkpoint-104/adapter_config.json +6 -6
- checkpoint-104/adapter_model.safetensors +1 -1
- checkpoint-104/optimizer.pt +1 -1
- checkpoint-104/rng_state_0.pth +1 -1
- checkpoint-104/rng_state_1.pth +1 -1
- checkpoint-104/rng_state_2.pth +1 -1
- checkpoint-104/rng_state_3.pth +1 -1
- checkpoint-104/scheduler.pt +2 -2
- checkpoint-104/special_tokens_map.json +7 -1
- checkpoint-104/tokenizer_config.json +1 -1
- checkpoint-104/trainer_state.json +37 -101
- checkpoint-104/training_args.bin +1 -1
- checkpoint-208/adapter_config.json +6 -6
- checkpoint-208/adapter_model.safetensors +1 -1
- checkpoint-208/optimizer.pt +1 -1
- checkpoint-208/rng_state_0.pth +1 -1
- checkpoint-208/rng_state_1.pth +1 -1
- checkpoint-208/rng_state_2.pth +1 -1
- checkpoint-208/rng_state_3.pth +1 -1
- checkpoint-208/scheduler.pt +2 -2
- checkpoint-208/special_tokens_map.json +7 -1
- checkpoint-208/tokenizer_config.json +1 -1
- checkpoint-208/trainer_state.json +72 -200
- checkpoint-208/training_args.bin +1 -1
- checkpoint-312/adapter_config.json +6 -6
- checkpoint-312/adapter_model.safetensors +1 -1
- checkpoint-312/optimizer.pt +1 -1
- checkpoint-312/rng_state_0.pth +1 -1
- checkpoint-312/rng_state_1.pth +1 -1
- checkpoint-312/rng_state_2.pth +1 -1
- checkpoint-312/rng_state_3.pth +1 -1
- checkpoint-312/scheduler.pt +1 -1
- checkpoint-312/special_tokens_map.json +7 -1
- checkpoint-312/tokenizer_config.json +1 -1
- checkpoint-312/trainer_state.json +105 -306
- checkpoint-312/training_args.bin +1 -1
- checkpoint-416/README.md +209 -0
- checkpoint-416/adapter_config.json +42 -0
- checkpoint-416/adapter_model.safetensors +3 -0
- checkpoint-416/added_tokens.json +24 -0
- checkpoint-416/chat_template.jinja +54 -0
- checkpoint-416/merges.txt +0 -0
- checkpoint-416/optimizer.pt +3 -0
- checkpoint-416/rng_state_0.pth +3 -0
- checkpoint-416/rng_state_1.pth +3 -0
- checkpoint-416/rng_state_2.pth +3 -0
.gitattributes
CHANGED
|
@@ -41,3 +41,7 @@ checkpoint-312/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
|
| 41 |
checkpoint-52/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 42 |
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 43 |
checkpoint-364/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
checkpoint-52/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 42 |
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 43 |
checkpoint-364/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
checkpoint-416/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
checkpoint-520/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
checkpoint-624/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
checkpoint-728/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -1,16 +1,17 @@
|
|
| 1 |
---
|
| 2 |
base_model: Qwen/Qwen2.5-7B-Instruct
|
| 3 |
library_name: peft
|
| 4 |
-
|
| 5 |
tags:
|
| 6 |
- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
|
| 7 |
- lora
|
| 8 |
- sft
|
| 9 |
- transformers
|
| 10 |
- trl
|
| 11 |
-
licence: license
|
| 12 |
-
pipeline_tag: text-generation
|
| 13 |
---
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
# Model Card for foamqwen
|
| 16 |
|
|
|
|
| 1 |
---
|
| 2 |
base_model: Qwen/Qwen2.5-7B-Instruct
|
| 3 |
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
tags:
|
| 6 |
- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
|
| 7 |
- lora
|
| 8 |
- sft
|
| 9 |
- transformers
|
| 10 |
- trl
|
|
|
|
|
|
|
| 11 |
---
|
| 12 |
+
### Framework versions
|
| 13 |
+
|
| 14 |
+
- PEFT 0.17.0
|
| 15 |
|
| 16 |
# Model Card for foamqwen
|
| 17 |
|
adapter_config.json
CHANGED
|
@@ -15,7 +15,7 @@
|
|
| 15 |
"loftq_config": {},
|
| 16 |
"lora_alpha": 16,
|
| 17 |
"lora_bias": false,
|
| 18 |
-
"lora_dropout": 0.
|
| 19 |
"megatron_config": null,
|
| 20 |
"megatron_core": "megatron.core",
|
| 21 |
"modules_to_save": null,
|
|
@@ -25,13 +25,13 @@
|
|
| 25 |
"rank_pattern": {},
|
| 26 |
"revision": null,
|
| 27 |
"target_modules": [
|
| 28 |
-
"gate_proj",
|
| 29 |
-
"o_proj",
|
| 30 |
-
"up_proj",
|
| 31 |
"v_proj",
|
| 32 |
"k_proj",
|
|
|
|
| 33 |
"down_proj",
|
| 34 |
-
"q_proj"
|
|
|
|
|
|
|
| 35 |
],
|
| 36 |
"target_parameters": null,
|
| 37 |
"task_type": "CAUSAL_LM",
|
|
|
|
| 15 |
"loftq_config": {},
|
| 16 |
"lora_alpha": 16,
|
| 17 |
"lora_bias": false,
|
| 18 |
+
"lora_dropout": 0.1,
|
| 19 |
"megatron_config": null,
|
| 20 |
"megatron_core": "megatron.core",
|
| 21 |
"modules_to_save": null,
|
|
|
|
| 25 |
"rank_pattern": {},
|
| 26 |
"revision": null,
|
| 27 |
"target_modules": [
|
|
|
|
|
|
|
|
|
|
| 28 |
"v_proj",
|
| 29 |
"k_proj",
|
| 30 |
+
"up_proj",
|
| 31 |
"down_proj",
|
| 32 |
+
"q_proj",
|
| 33 |
+
"o_proj",
|
| 34 |
+
"gate_proj"
|
| 35 |
],
|
| 36 |
"target_parameters": null,
|
| 37 |
"task_type": "CAUSAL_LM",
|
adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 645975704
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c5711c28d3d33aa09d94c62c9a136b8bf0f0cdbd49f90528dd206ab969f2ec3e
|
| 3 |
size 645975704
|
checkpoint-104/adapter_config.json
CHANGED
|
@@ -15,7 +15,7 @@
|
|
| 15 |
"loftq_config": {},
|
| 16 |
"lora_alpha": 16,
|
| 17 |
"lora_bias": false,
|
| 18 |
-
"lora_dropout": 0.
|
| 19 |
"megatron_config": null,
|
| 20 |
"megatron_core": "megatron.core",
|
| 21 |
"modules_to_save": null,
|
|
@@ -25,13 +25,13 @@
|
|
| 25 |
"rank_pattern": {},
|
| 26 |
"revision": null,
|
| 27 |
"target_modules": [
|
| 28 |
-
"
|
|
|
|
| 29 |
"o_proj",
|
| 30 |
-
"
|
| 31 |
"v_proj",
|
| 32 |
-
"
|
| 33 |
-
"
|
| 34 |
-
"q_proj"
|
| 35 |
],
|
| 36 |
"target_parameters": null,
|
| 37 |
"task_type": "CAUSAL_LM",
|
|
|
|
| 15 |
"loftq_config": {},
|
| 16 |
"lora_alpha": 16,
|
| 17 |
"lora_bias": false,
|
| 18 |
+
"lora_dropout": 0.1,
|
| 19 |
"megatron_config": null,
|
| 20 |
"megatron_core": "megatron.core",
|
| 21 |
"modules_to_save": null,
|
|
|
|
| 25 |
"rank_pattern": {},
|
| 26 |
"revision": null,
|
| 27 |
"target_modules": [
|
| 28 |
+
"down_proj",
|
| 29 |
+
"k_proj",
|
| 30 |
"o_proj",
|
| 31 |
+
"q_proj",
|
| 32 |
"v_proj",
|
| 33 |
+
"up_proj",
|
| 34 |
+
"gate_proj"
|
|
|
|
| 35 |
],
|
| 36 |
"target_parameters": null,
|
| 37 |
"task_type": "CAUSAL_LM",
|
checkpoint-104/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 645975704
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4fc6cd409c955d35b5a6a620baedd9530a4c5f73f68bbe3082ddb660de6919d8
|
| 3 |
size 645975704
|
checkpoint-104/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1292087115
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5478a87659e3fd833f6e0be022f6cade6754457ac9844e658a304f95edb93418
|
| 3 |
size 1292087115
|
checkpoint-104/rng_state_0.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 15429
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7b46d4a17bc33ca1534ecbb381b92bb917feb262b6cd6ea1e0aeef66ab3378a5
|
| 3 |
size 15429
|
checkpoint-104/rng_state_1.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 15429
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f8ea02c0f84cb7b79a7d01dc1ad12d59feb06a7206324175c7723c9e9e70ef38
|
| 3 |
size 15429
|
checkpoint-104/rng_state_2.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 15429
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0510c2eb154e655092dfd2b66e653579331ca3559570cdc26dff724493936f08
|
| 3 |
size 15429
|
checkpoint-104/rng_state_3.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 15429
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:04ef7e6fbd41972ea743427f306fa0e581982d06dd2fd9a83bc6f1f6e4371346
|
| 3 |
size 15429
|
checkpoint-104/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a7973bd6977b296bd79b6e5eab7d855b5c9117eea90fc9a6d871d376ce4ddb2d
|
| 3 |
+
size 1401
|
checkpoint-104/special_tokens_map.json
CHANGED
|
@@ -21,5 +21,11 @@
|
|
| 21 |
"rstrip": false,
|
| 22 |
"single_word": false
|
| 23 |
},
|
| 24 |
-
"pad_token":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
}
|
|
|
|
| 21 |
"rstrip": false,
|
| 22 |
"single_word": false
|
| 23 |
},
|
| 24 |
+
"pad_token": {
|
| 25 |
+
"content": "<|endoftext|>",
|
| 26 |
+
"lstrip": false,
|
| 27 |
+
"normalized": false,
|
| 28 |
+
"rstrip": false,
|
| 29 |
+
"single_word": false
|
| 30 |
+
}
|
| 31 |
}
|
checkpoint-104/tokenizer_config.json
CHANGED
|
@@ -200,7 +200,7 @@
|
|
| 200 |
"errors": "replace",
|
| 201 |
"extra_special_tokens": {},
|
| 202 |
"model_max_length": 131072,
|
| 203 |
-
"pad_token": "<|
|
| 204 |
"split_special_tokens": false,
|
| 205 |
"tokenizer_class": "Qwen2Tokenizer",
|
| 206 |
"unk_token": null
|
|
|
|
| 200 |
"errors": "replace",
|
| 201 |
"extra_special_tokens": {},
|
| 202 |
"model_max_length": 131072,
|
| 203 |
+
"pad_token": "<|endoftext|>",
|
| 204 |
"split_special_tokens": false,
|
| 205 |
"tokenizer_class": "Qwen2Tokenizer",
|
| 206 |
"unk_token": null
|
checkpoint-104/trainer_state.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch":
|
| 6 |
"eval_steps": 500,
|
| 7 |
"global_step": 104,
|
| 8 |
"is_hyper_param_search": false,
|
|
@@ -10,118 +10,54 @@
|
|
| 10 |
"is_world_process_zero": true,
|
| 11 |
"log_history": [
|
| 12 |
{
|
| 13 |
-
"epoch": 0.
|
| 14 |
-
"grad_norm":
|
| 15 |
-
"learning_rate": 0.
|
| 16 |
-
"loss":
|
| 17 |
-
"mean_token_accuracy": 0.
|
| 18 |
-
"num_tokens":
|
| 19 |
-
"step":
|
| 20 |
},
|
| 21 |
{
|
| 22 |
-
"epoch": 0.
|
| 23 |
-
"grad_norm": 0.
|
| 24 |
-
"learning_rate": 0.
|
| 25 |
-
"loss": 0.
|
| 26 |
-
"mean_token_accuracy": 0.
|
| 27 |
-
"num_tokens":
|
| 28 |
-
"step": 20
|
| 29 |
-
},
|
| 30 |
-
{
|
| 31 |
-
"epoch": 0.5783132530120482,
|
| 32 |
-
"grad_norm": 0.0,
|
| 33 |
-
"learning_rate": 0.0005077286477833616,
|
| 34 |
-
"loss": 452.9577,
|
| 35 |
-
"mean_token_accuracy": 0.05160275483503938,
|
| 36 |
-
"num_tokens": 3932160.0,
|
| 37 |
-
"step": 30
|
| 38 |
-
},
|
| 39 |
-
{
|
| 40 |
-
"epoch": 0.7710843373493976,
|
| 41 |
-
"grad_norm": 0.0,
|
| 42 |
-
"learning_rate": 0.0005031081504278389,
|
| 43 |
-
"loss": 470.5136,
|
| 44 |
-
"mean_token_accuracy": 0.03822226445190609,
|
| 45 |
-
"num_tokens": 5242880.0,
|
| 46 |
-
"step": 40
|
| 47 |
-
},
|
| 48 |
-
{
|
| 49 |
-
"epoch": 0.963855421686747,
|
| 50 |
-
"grad_norm": 26.303752899169922,
|
| 51 |
-
"learning_rate": 0.0004965277770447238,
|
| 52 |
-
"loss": 167.1384,
|
| 53 |
-
"mean_token_accuracy": 0.057517293840646744,
|
| 54 |
-
"num_tokens": 6553600.0,
|
| 55 |
"step": 50
|
| 56 |
},
|
| 57 |
{
|
| 58 |
-
"epoch":
|
| 59 |
-
"
|
| 60 |
-
"
|
| 61 |
-
"
|
| 62 |
-
"
|
| 63 |
-
"
|
| 64 |
-
"
|
| 65 |
-
"step": 52
|
| 66 |
-
},
|
| 67 |
-
{
|
| 68 |
-
"epoch": 1.1542168674698796,
|
| 69 |
-
"grad_norm": 28.755094528198242,
|
| 70 |
-
"learning_rate": 0.00048803961281790017,
|
| 71 |
-
"loss": 27.9726,
|
| 72 |
-
"mean_token_accuracy": 0.03075966710531259,
|
| 73 |
-
"num_tokens": 7761920.0,
|
| 74 |
-
"step": 60
|
| 75 |
-
},
|
| 76 |
-
{
|
| 77 |
-
"epoch": 1.346987951807229,
|
| 78 |
-
"grad_norm": 0.9693858027458191,
|
| 79 |
-
"learning_rate": 0.000477710843538941,
|
| 80 |
-
"loss": 2.2869,
|
| 81 |
-
"mean_token_accuracy": 0.10747051909565926,
|
| 82 |
-
"num_tokens": 9072640.0,
|
| 83 |
-
"step": 70
|
| 84 |
-
},
|
| 85 |
-
{
|
| 86 |
-
"epoch": 1.5397590361445783,
|
| 87 |
-
"grad_norm": 0.36548200249671936,
|
| 88 |
-
"learning_rate": 0.0004656232238159615,
|
| 89 |
-
"loss": 60.0031,
|
| 90 |
-
"mean_token_accuracy": 0.10124717205762863,
|
| 91 |
-
"num_tokens": 10383360.0,
|
| 92 |
-
"step": 80
|
| 93 |
},
|
| 94 |
{
|
| 95 |
-
"epoch":
|
| 96 |
-
"grad_norm": 0.
|
| 97 |
-
"learning_rate": 0.
|
| 98 |
-
"loss": 0.
|
| 99 |
-
"mean_token_accuracy": 0.
|
| 100 |
-
"num_tokens":
|
| 101 |
-
"step": 90
|
| 102 |
-
},
|
| 103 |
-
{
|
| 104 |
-
"epoch": 1.9253012048192772,
|
| 105 |
-
"grad_norm": 0.0,
|
| 106 |
-
"learning_rate": 0.0004365673027192623,
|
| 107 |
-
"loss": 2.2759,
|
| 108 |
-
"mean_token_accuracy": 0.13096993789076805,
|
| 109 |
-
"num_tokens": 13004800.0,
|
| 110 |
"step": 100
|
| 111 |
},
|
| 112 |
{
|
| 113 |
-
"epoch":
|
| 114 |
-
"eval_loss":
|
| 115 |
-
"eval_mean_token_accuracy": 0.
|
| 116 |
-
"eval_num_tokens":
|
| 117 |
-
"eval_runtime":
|
| 118 |
-
"eval_samples_per_second":
|
| 119 |
-
"eval_steps_per_second":
|
| 120 |
"step": 104
|
| 121 |
}
|
| 122 |
],
|
| 123 |
-
"logging_steps":
|
| 124 |
-
"max_steps":
|
| 125 |
"num_input_tokens_seen": 0,
|
| 126 |
"num_train_epochs": 7,
|
| 127 |
"save_steps": 500,
|
|
@@ -137,7 +73,7 @@
|
|
| 137 |
"attributes": {}
|
| 138 |
}
|
| 139 |
},
|
| 140 |
-
"total_flos":
|
| 141 |
"train_batch_size": 2,
|
| 142 |
"trial_name": null,
|
| 143 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.0,
|
| 6 |
"eval_steps": 500,
|
| 7 |
"global_step": 104,
|
| 8 |
"is_hyper_param_search": false,
|
|
|
|
| 10 |
"is_world_process_zero": true,
|
| 11 |
"log_history": [
|
| 12 |
{
|
| 13 |
+
"epoch": 0.24096385542168675,
|
| 14 |
+
"grad_norm": 0.2236759215593338,
|
| 15 |
+
"learning_rate": 0.000511,
|
| 16 |
+
"loss": 0.4204,
|
| 17 |
+
"mean_token_accuracy": 0.900120057463646,
|
| 18 |
+
"num_tokens": 567991.0,
|
| 19 |
+
"step": 25
|
| 20 |
},
|
| 21 |
{
|
| 22 |
+
"epoch": 0.4819277108433735,
|
| 23 |
+
"grad_norm": 0.1322442889213562,
|
| 24 |
+
"learning_rate": 0.000511,
|
| 25 |
+
"loss": 0.2913,
|
| 26 |
+
"mean_token_accuracy": 0.9270081639289856,
|
| 27 |
+
"num_tokens": 1135343.0,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
"step": 50
|
| 29 |
},
|
| 30 |
{
|
| 31 |
+
"epoch": 0.7228915662650602,
|
| 32 |
+
"grad_norm": 0.19739408791065216,
|
| 33 |
+
"learning_rate": 0.000511,
|
| 34 |
+
"loss": 0.2186,
|
| 35 |
+
"mean_token_accuracy": 0.9418566429615021,
|
| 36 |
+
"num_tokens": 1703784.0,
|
| 37 |
+
"step": 75
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
},
|
| 39 |
{
|
| 40 |
+
"epoch": 0.963855421686747,
|
| 41 |
+
"grad_norm": 0.17215745151042938,
|
| 42 |
+
"learning_rate": 0.000511,
|
| 43 |
+
"loss": 0.1963,
|
| 44 |
+
"mean_token_accuracy": 0.9479192215204238,
|
| 45 |
+
"num_tokens": 2269891.0,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
"step": 100
|
| 47 |
},
|
| 48 |
{
|
| 49 |
+
"epoch": 1.0,
|
| 50 |
+
"eval_loss": 0.19681453704833984,
|
| 51 |
+
"eval_mean_token_accuracy": 0.9478744319144715,
|
| 52 |
+
"eval_num_tokens": 2345494.0,
|
| 53 |
+
"eval_runtime": 4.3187,
|
| 54 |
+
"eval_samples_per_second": 85.442,
|
| 55 |
+
"eval_steps_per_second": 10.883,
|
| 56 |
"step": 104
|
| 57 |
}
|
| 58 |
],
|
| 59 |
+
"logging_steps": 25,
|
| 60 |
+
"max_steps": 728,
|
| 61 |
"num_input_tokens_seen": 0,
|
| 62 |
"num_train_epochs": 7,
|
| 63 |
"save_steps": 500,
|
|
|
|
| 73 |
"attributes": {}
|
| 74 |
}
|
| 75 |
},
|
| 76 |
+
"total_flos": 1.0256509033879962e+17,
|
| 77 |
"train_batch_size": 2,
|
| 78 |
"trial_name": null,
|
| 79 |
"trial_params": null
|
checkpoint-104/training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 6097
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a92f33d5ca39ba292c6b171cffeb00f4d1c361214bebb8604f8ce3482d3b7c8c
|
| 3 |
size 6097
|
checkpoint-208/adapter_config.json
CHANGED
|
@@ -15,7 +15,7 @@
|
|
| 15 |
"loftq_config": {},
|
| 16 |
"lora_alpha": 16,
|
| 17 |
"lora_bias": false,
|
| 18 |
-
"lora_dropout": 0.
|
| 19 |
"megatron_config": null,
|
| 20 |
"megatron_core": "megatron.core",
|
| 21 |
"modules_to_save": null,
|
|
@@ -25,13 +25,13 @@
|
|
| 25 |
"rank_pattern": {},
|
| 26 |
"revision": null,
|
| 27 |
"target_modules": [
|
| 28 |
-
"
|
|
|
|
| 29 |
"o_proj",
|
| 30 |
-
"
|
| 31 |
"v_proj",
|
| 32 |
-
"
|
| 33 |
-
"
|
| 34 |
-
"q_proj"
|
| 35 |
],
|
| 36 |
"target_parameters": null,
|
| 37 |
"task_type": "CAUSAL_LM",
|
|
|
|
| 15 |
"loftq_config": {},
|
| 16 |
"lora_alpha": 16,
|
| 17 |
"lora_bias": false,
|
| 18 |
+
"lora_dropout": 0.1,
|
| 19 |
"megatron_config": null,
|
| 20 |
"megatron_core": "megatron.core",
|
| 21 |
"modules_to_save": null,
|
|
|
|
| 25 |
"rank_pattern": {},
|
| 26 |
"revision": null,
|
| 27 |
"target_modules": [
|
| 28 |
+
"down_proj",
|
| 29 |
+
"k_proj",
|
| 30 |
"o_proj",
|
| 31 |
+
"q_proj",
|
| 32 |
"v_proj",
|
| 33 |
+
"up_proj",
|
| 34 |
+
"gate_proj"
|
|
|
|
| 35 |
],
|
| 36 |
"target_parameters": null,
|
| 37 |
"task_type": "CAUSAL_LM",
|
checkpoint-208/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 645975704
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0bf478a6f256502ea7fff6dbca497e8460a17d13004420d85853719e2329b272
|
| 3 |
size 645975704
|
checkpoint-208/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1292087115
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:815bd40667d86d5a3beced54254a4fdff5e07d5682c6a4b1907679b709d56d9d
|
| 3 |
size 1292087115
|
checkpoint-208/rng_state_0.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 15429
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3adf91ff8bafb6d2e3300a7c332f71e91ce8b3ec728f0e2aab37908de663b1b8
|
| 3 |
size 15429
|
checkpoint-208/rng_state_1.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 15429
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8c70dde40156bddc38880631183ca59dc710551eb7a7733ad9d585cb374e86b3
|
| 3 |
size 15429
|
checkpoint-208/rng_state_2.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 15429
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:04aca530bed491901589d495872e054e18dea79299a5f18f260913d260faa876
|
| 3 |
size 15429
|
checkpoint-208/rng_state_3.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 15429
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:46b68311a8f36f1ffecea2c67c06bb30acb6b2d0c53572628d4d32cf4d54e271
|
| 3 |
size 15429
|
checkpoint-208/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b24bf8a41c3e3c688a38ba117e3127352bafa556de7e01cc189f2855569e6d7a
|
| 3 |
+
size 1401
|
checkpoint-208/special_tokens_map.json
CHANGED
|
@@ -21,5 +21,11 @@
|
|
| 21 |
"rstrip": false,
|
| 22 |
"single_word": false
|
| 23 |
},
|
| 24 |
-
"pad_token":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
}
|
|
|
|
| 21 |
"rstrip": false,
|
| 22 |
"single_word": false
|
| 23 |
},
|
| 24 |
+
"pad_token": {
|
| 25 |
+
"content": "<|endoftext|>",
|
| 26 |
+
"lstrip": false,
|
| 27 |
+
"normalized": false,
|
| 28 |
+
"rstrip": false,
|
| 29 |
+
"single_word": false
|
| 30 |
+
}
|
| 31 |
}
|
checkpoint-208/tokenizer_config.json
CHANGED
|
@@ -200,7 +200,7 @@
|
|
| 200 |
"errors": "replace",
|
| 201 |
"extra_special_tokens": {},
|
| 202 |
"model_max_length": 131072,
|
| 203 |
-
"pad_token": "<|
|
| 204 |
"split_special_tokens": false,
|
| 205 |
"tokenizer_class": "Qwen2Tokenizer",
|
| 206 |
"unk_token": null
|
|
|
|
| 200 |
"errors": "replace",
|
| 201 |
"extra_special_tokens": {},
|
| 202 |
"model_max_length": 131072,
|
| 203 |
+
"pad_token": "<|endoftext|>",
|
| 204 |
"split_special_tokens": false,
|
| 205 |
"tokenizer_class": "Qwen2Tokenizer",
|
| 206 |
"unk_token": null
|
checkpoint-208/trainer_state.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch":
|
| 6 |
"eval_steps": 500,
|
| 7 |
"global_step": 208,
|
| 8 |
"is_hyper_param_search": false,
|
|
@@ -10,228 +10,100 @@
|
|
| 10 |
"is_world_process_zero": true,
|
| 11 |
"log_history": [
|
| 12 |
{
|
| 13 |
-
"epoch": 0.
|
| 14 |
-
"grad_norm":
|
| 15 |
-
"learning_rate": 0.
|
| 16 |
-
"loss":
|
| 17 |
-
"mean_token_accuracy": 0.
|
| 18 |
-
"num_tokens":
|
| 19 |
-
"step":
|
| 20 |
-
},
|
| 21 |
-
{
|
| 22 |
-
"epoch": 0.
|
| 23 |
-
"grad_norm": 0.
|
| 24 |
-
"learning_rate": 0.
|
| 25 |
-
"loss": 0.
|
| 26 |
-
"mean_token_accuracy": 0.
|
| 27 |
-
"num_tokens":
|
| 28 |
-
"step": 20
|
| 29 |
-
},
|
| 30 |
-
{
|
| 31 |
-
"epoch": 0.5783132530120482,
|
| 32 |
-
"grad_norm": 0.0,
|
| 33 |
-
"learning_rate": 0.0005077286477833616,
|
| 34 |
-
"loss": 452.9577,
|
| 35 |
-
"mean_token_accuracy": 0.05160275483503938,
|
| 36 |
-
"num_tokens": 3932160.0,
|
| 37 |
-
"step": 30
|
| 38 |
-
},
|
| 39 |
-
{
|
| 40 |
-
"epoch": 0.7710843373493976,
|
| 41 |
-
"grad_norm": 0.0,
|
| 42 |
-
"learning_rate": 0.0005031081504278389,
|
| 43 |
-
"loss": 470.5136,
|
| 44 |
-
"mean_token_accuracy": 0.03822226445190609,
|
| 45 |
-
"num_tokens": 5242880.0,
|
| 46 |
-
"step": 40
|
| 47 |
-
},
|
| 48 |
-
{
|
| 49 |
-
"epoch": 0.963855421686747,
|
| 50 |
-
"grad_norm": 26.303752899169922,
|
| 51 |
-
"learning_rate": 0.0004965277770447238,
|
| 52 |
-
"loss": 167.1384,
|
| 53 |
-
"mean_token_accuracy": 0.057517293840646744,
|
| 54 |
-
"num_tokens": 6553600.0,
|
| 55 |
"step": 50
|
| 56 |
},
|
| 57 |
{
|
| 58 |
-
"epoch":
|
| 59 |
-
"
|
| 60 |
-
"
|
| 61 |
-
"
|
| 62 |
-
"
|
| 63 |
-
"
|
| 64 |
-
"
|
| 65 |
-
"step": 52
|
| 66 |
},
|
| 67 |
{
|
| 68 |
-
"epoch":
|
| 69 |
-
"grad_norm":
|
| 70 |
-
"learning_rate": 0.
|
| 71 |
-
"loss":
|
| 72 |
-
"mean_token_accuracy": 0.
|
| 73 |
-
"num_tokens":
|
| 74 |
-
"step": 60
|
| 75 |
-
},
|
| 76 |
-
{
|
| 77 |
-
"epoch": 1.346987951807229,
|
| 78 |
-
"grad_norm": 0.9693858027458191,
|
| 79 |
-
"learning_rate": 0.000477710843538941,
|
| 80 |
-
"loss": 2.2869,
|
| 81 |
-
"mean_token_accuracy": 0.10747051909565926,
|
| 82 |
-
"num_tokens": 9072640.0,
|
| 83 |
-
"step": 70
|
| 84 |
-
},
|
| 85 |
-
{
|
| 86 |
-
"epoch": 1.5397590361445783,
|
| 87 |
-
"grad_norm": 0.36548200249671936,
|
| 88 |
-
"learning_rate": 0.0004656232238159615,
|
| 89 |
-
"loss": 60.0031,
|
| 90 |
-
"mean_token_accuracy": 0.10124717205762863,
|
| 91 |
-
"num_tokens": 10383360.0,
|
| 92 |
-
"step": 80
|
| 93 |
-
},
|
| 94 |
-
{
|
| 95 |
-
"epoch": 1.7325301204819277,
|
| 96 |
-
"grad_norm": 0.8749092817306519,
|
| 97 |
-
"learning_rate": 0.0004518724299669051,
|
| 98 |
-
"loss": 0.8994,
|
| 99 |
-
"mean_token_accuracy": 0.1551567144691944,
|
| 100 |
-
"num_tokens": 11694080.0,
|
| 101 |
-
"step": 90
|
| 102 |
-
},
|
| 103 |
-
{
|
| 104 |
-
"epoch": 1.9253012048192772,
|
| 105 |
-
"grad_norm": 0.0,
|
| 106 |
-
"learning_rate": 0.0004365673027192623,
|
| 107 |
-
"loss": 2.2759,
|
| 108 |
-
"mean_token_accuracy": 0.13096993789076805,
|
| 109 |
-
"num_tokens": 13004800.0,
|
| 110 |
"step": 100
|
| 111 |
},
|
| 112 |
{
|
| 113 |
-
"epoch":
|
| 114 |
-
"eval_loss":
|
| 115 |
-
"eval_mean_token_accuracy": 0.
|
| 116 |
-
"eval_num_tokens":
|
| 117 |
-
"eval_runtime":
|
| 118 |
-
"eval_samples_per_second":
|
| 119 |
-
"eval_steps_per_second":
|
| 120 |
"step": 104
|
| 121 |
},
|
| 122 |
{
|
| 123 |
-
"epoch":
|
| 124 |
-
"grad_norm": 0.
|
| 125 |
-
"learning_rate": 0.
|
| 126 |
-
"loss": 0.
|
| 127 |
-
"mean_token_accuracy": 0.
|
| 128 |
-
"num_tokens":
|
| 129 |
-
"step":
|
| 130 |
-
},
|
| 131 |
-
{
|
| 132 |
-
"epoch": 2.3084337349397592,
|
| 133 |
-
"grad_norm": 0.7038294672966003,
|
| 134 |
-
"learning_rate": 0.0004017899666076801,
|
| 135 |
-
"loss": 1.3155,
|
| 136 |
-
"mean_token_accuracy": 0.1053241491317749,
|
| 137 |
-
"num_tokens": 15523840.0,
|
| 138 |
-
"step": 120
|
| 139 |
},
|
| 140 |
{
|
| 141 |
-
"epoch":
|
| 142 |
-
"grad_norm":
|
| 143 |
-
"learning_rate": 0.
|
| 144 |
-
"loss": 0.
|
| 145 |
-
"mean_token_accuracy": 0.
|
| 146 |
-
"num_tokens":
|
| 147 |
-
"step": 130
|
| 148 |
-
},
|
| 149 |
-
{
|
| 150 |
-
"epoch": 2.693975903614458,
|
| 151 |
-
"grad_norm": 0.0,
|
| 152 |
-
"learning_rate": 0.00036239011942476655,
|
| 153 |
-
"loss": 1.364,
|
| 154 |
-
"mean_token_accuracy": 0.15817394778132438,
|
| 155 |
-
"num_tokens": 18145280.0,
|
| 156 |
-
"step": 140
|
| 157 |
-
},
|
| 158 |
-
{
|
| 159 |
-
"epoch": 2.886746987951807,
|
| 160 |
-
"grad_norm": 0.0,
|
| 161 |
-
"learning_rate": 0.00034134115028725524,
|
| 162 |
-
"loss": 3.5977,
|
| 163 |
-
"mean_token_accuracy": 0.10589548945426941,
|
| 164 |
-
"num_tokens": 19456000.0,
|
| 165 |
"step": 150
|
| 166 |
},
|
| 167 |
{
|
| 168 |
-
"epoch":
|
| 169 |
-
"
|
| 170 |
-
"
|
| 171 |
-
"
|
| 172 |
-
"
|
| 173 |
-
"
|
| 174 |
-
"
|
| 175 |
-
"step": 156
|
| 176 |
},
|
| 177 |
{
|
| 178 |
-
"epoch":
|
| 179 |
-
"grad_norm": 0.
|
| 180 |
-
"learning_rate": 0.
|
| 181 |
-
"loss":
|
| 182 |
-
"mean_token_accuracy": 0.
|
| 183 |
-
"num_tokens":
|
| 184 |
-
"step": 160
|
| 185 |
-
},
|
| 186 |
-
{
|
| 187 |
-
"epoch": 3.269879518072289,
|
| 188 |
-
"grad_norm": 0.3390277326107025,
|
| 189 |
-
"learning_rate": 0.00029737683958418377,
|
| 190 |
-
"loss": 12.1371,
|
| 191 |
-
"mean_token_accuracy": 0.07337962239980697,
|
| 192 |
-
"num_tokens": 21975040.0,
|
| 193 |
-
"step": 170
|
| 194 |
-
},
|
| 195 |
-
{
|
| 196 |
-
"epoch": 3.4626506024096386,
|
| 197 |
-
"grad_norm": 0.0,
|
| 198 |
-
"learning_rate": 0.00027480948575031854,
|
| 199 |
-
"loss": 42.6417,
|
| 200 |
-
"mean_token_accuracy": 0.08556168600916862,
|
| 201 |
-
"num_tokens": 23285760.0,
|
| 202 |
-
"step": 180
|
| 203 |
-
},
|
| 204 |
-
{
|
| 205 |
-
"epoch": 3.6554216867469878,
|
| 206 |
-
"grad_norm": 0.0,
|
| 207 |
-
"learning_rate": 0.0002520892928513346,
|
| 208 |
-
"loss": 1.5423,
|
| 209 |
-
"mean_token_accuracy": 0.13269576877355577,
|
| 210 |
-
"num_tokens": 24596480.0,
|
| 211 |
-
"step": 190
|
| 212 |
-
},
|
| 213 |
-
{
|
| 214 |
-
"epoch": 3.8481927710843373,
|
| 215 |
-
"grad_norm": 0.19443857669830322,
|
| 216 |
-
"learning_rate": 0.0002293960964917063,
|
| 217 |
-
"loss": 0.3356,
|
| 218 |
-
"mean_token_accuracy": 0.17251307517290115,
|
| 219 |
-
"num_tokens": 25907200.0,
|
| 220 |
"step": 200
|
| 221 |
},
|
| 222 |
{
|
| 223 |
-
"epoch":
|
| 224 |
-
"eval_loss":
|
| 225 |
-
"eval_mean_token_accuracy": 0.
|
| 226 |
-
"eval_num_tokens":
|
| 227 |
-
"eval_runtime":
|
| 228 |
-
"eval_samples_per_second":
|
| 229 |
-
"eval_steps_per_second":
|
| 230 |
"step": 208
|
| 231 |
}
|
| 232 |
],
|
| 233 |
-
"logging_steps":
|
| 234 |
-
"max_steps":
|
| 235 |
"num_input_tokens_seen": 0,
|
| 236 |
"num_train_epochs": 7,
|
| 237 |
"save_steps": 500,
|
|
@@ -247,7 +119,7 @@
|
|
| 247 |
"attributes": {}
|
| 248 |
}
|
| 249 |
},
|
| 250 |
-
"total_flos":
|
| 251 |
"train_batch_size": 2,
|
| 252 |
"trial_name": null,
|
| 253 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 2.0,
|
| 6 |
"eval_steps": 500,
|
| 7 |
"global_step": 208,
|
| 8 |
"is_hyper_param_search": false,
|
|
|
|
| 10 |
"is_world_process_zero": true,
|
| 11 |
"log_history": [
|
| 12 |
{
|
| 13 |
+
"epoch": 0.24096385542168675,
|
| 14 |
+
"grad_norm": 0.2236759215593338,
|
| 15 |
+
"learning_rate": 0.000511,
|
| 16 |
+
"loss": 0.4204,
|
| 17 |
+
"mean_token_accuracy": 0.900120057463646,
|
| 18 |
+
"num_tokens": 567991.0,
|
| 19 |
+
"step": 25
|
| 20 |
+
},
|
| 21 |
+
{
|
| 22 |
+
"epoch": 0.4819277108433735,
|
| 23 |
+
"grad_norm": 0.1322442889213562,
|
| 24 |
+
"learning_rate": 0.000511,
|
| 25 |
+
"loss": 0.2913,
|
| 26 |
+
"mean_token_accuracy": 0.9270081639289856,
|
| 27 |
+
"num_tokens": 1135343.0,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
"step": 50
|
| 29 |
},
|
| 30 |
{
|
| 31 |
+
"epoch": 0.7228915662650602,
|
| 32 |
+
"grad_norm": 0.19739408791065216,
|
| 33 |
+
"learning_rate": 0.000511,
|
| 34 |
+
"loss": 0.2186,
|
| 35 |
+
"mean_token_accuracy": 0.9418566429615021,
|
| 36 |
+
"num_tokens": 1703784.0,
|
| 37 |
+
"step": 75
|
|
|
|
| 38 |
},
|
| 39 |
{
|
| 40 |
+
"epoch": 0.963855421686747,
|
| 41 |
+
"grad_norm": 0.17215745151042938,
|
| 42 |
+
"learning_rate": 0.000511,
|
| 43 |
+
"loss": 0.1963,
|
| 44 |
+
"mean_token_accuracy": 0.9479192215204238,
|
| 45 |
+
"num_tokens": 2269891.0,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
"step": 100
|
| 47 |
},
|
| 48 |
{
|
| 49 |
+
"epoch": 1.0,
|
| 50 |
+
"eval_loss": 0.19681453704833984,
|
| 51 |
+
"eval_mean_token_accuracy": 0.9478744319144715,
|
| 52 |
+
"eval_num_tokens": 2345494.0,
|
| 53 |
+
"eval_runtime": 4.3187,
|
| 54 |
+
"eval_samples_per_second": 85.442,
|
| 55 |
+
"eval_steps_per_second": 10.883,
|
| 56 |
"step": 104
|
| 57 |
},
|
| 58 |
{
|
| 59 |
+
"epoch": 1.202409638554217,
|
| 60 |
+
"grad_norm": 0.10346771776676178,
|
| 61 |
+
"learning_rate": 0.000511,
|
| 62 |
+
"loss": 0.165,
|
| 63 |
+
"mean_token_accuracy": 0.9550067053900825,
|
| 64 |
+
"num_tokens": 2836234.0,
|
| 65 |
+
"step": 125
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
},
|
| 67 |
{
|
| 68 |
+
"epoch": 1.4433734939759035,
|
| 69 |
+
"grad_norm": 0.0941459909081459,
|
| 70 |
+
"learning_rate": 0.000511,
|
| 71 |
+
"loss": 0.1445,
|
| 72 |
+
"mean_token_accuracy": 0.9606501096487046,
|
| 73 |
+
"num_tokens": 3403671.0,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
"step": 150
|
| 75 |
},
|
| 76 |
{
|
| 77 |
+
"epoch": 1.6843373493975904,
|
| 78 |
+
"grad_norm": 0.07419874519109726,
|
| 79 |
+
"learning_rate": 0.000511,
|
| 80 |
+
"loss": 0.1184,
|
| 81 |
+
"mean_token_accuracy": 0.9665295648574829,
|
| 82 |
+
"num_tokens": 3972278.0,
|
| 83 |
+
"step": 175
|
|
|
|
| 84 |
},
|
| 85 |
{
|
| 86 |
+
"epoch": 1.9253012048192772,
|
| 87 |
+
"grad_norm": 0.08383649587631226,
|
| 88 |
+
"learning_rate": 0.000511,
|
| 89 |
+
"loss": 0.1309,
|
| 90 |
+
"mean_token_accuracy": 0.9640201306343079,
|
| 91 |
+
"num_tokens": 4538970.0,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
"step": 200
|
| 93 |
},
|
| 94 |
{
|
| 95 |
+
"epoch": 2.0,
|
| 96 |
+
"eval_loss": 0.16037927567958832,
|
| 97 |
+
"eval_mean_token_accuracy": 0.9557588328706458,
|
| 98 |
+
"eval_num_tokens": 4690728.0,
|
| 99 |
+
"eval_runtime": 4.2478,
|
| 100 |
+
"eval_samples_per_second": 86.868,
|
| 101 |
+
"eval_steps_per_second": 11.065,
|
| 102 |
"step": 208
|
| 103 |
}
|
| 104 |
],
|
| 105 |
+
"logging_steps": 25,
|
| 106 |
+
"max_steps": 728,
|
| 107 |
"num_input_tokens_seen": 0,
|
| 108 |
"num_train_epochs": 7,
|
| 109 |
"save_steps": 500,
|
|
|
|
| 119 |
"attributes": {}
|
| 120 |
}
|
| 121 |
},
|
| 122 |
+
"total_flos": 2.0514658423144448e+17,
|
| 123 |
"train_batch_size": 2,
|
| 124 |
"trial_name": null,
|
| 125 |
"trial_params": null
|
checkpoint-208/training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 6097
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a92f33d5ca39ba292c6b171cffeb00f4d1c361214bebb8604f8ce3482d3b7c8c
|
| 3 |
size 6097
|
checkpoint-312/adapter_config.json
CHANGED
|
@@ -15,7 +15,7 @@
|
|
| 15 |
"loftq_config": {},
|
| 16 |
"lora_alpha": 16,
|
| 17 |
"lora_bias": false,
|
| 18 |
-
"lora_dropout": 0.
|
| 19 |
"megatron_config": null,
|
| 20 |
"megatron_core": "megatron.core",
|
| 21 |
"modules_to_save": null,
|
|
@@ -25,13 +25,13 @@
|
|
| 25 |
"rank_pattern": {},
|
| 26 |
"revision": null,
|
| 27 |
"target_modules": [
|
| 28 |
-
"
|
|
|
|
| 29 |
"o_proj",
|
| 30 |
-
"
|
| 31 |
"v_proj",
|
| 32 |
-
"
|
| 33 |
-
"
|
| 34 |
-
"q_proj"
|
| 35 |
],
|
| 36 |
"target_parameters": null,
|
| 37 |
"task_type": "CAUSAL_LM",
|
|
|
|
| 15 |
"loftq_config": {},
|
| 16 |
"lora_alpha": 16,
|
| 17 |
"lora_bias": false,
|
| 18 |
+
"lora_dropout": 0.1,
|
| 19 |
"megatron_config": null,
|
| 20 |
"megatron_core": "megatron.core",
|
| 21 |
"modules_to_save": null,
|
|
|
|
| 25 |
"rank_pattern": {},
|
| 26 |
"revision": null,
|
| 27 |
"target_modules": [
|
| 28 |
+
"down_proj",
|
| 29 |
+
"k_proj",
|
| 30 |
"o_proj",
|
| 31 |
+
"q_proj",
|
| 32 |
"v_proj",
|
| 33 |
+
"up_proj",
|
| 34 |
+
"gate_proj"
|
|
|
|
| 35 |
],
|
| 36 |
"target_parameters": null,
|
| 37 |
"task_type": "CAUSAL_LM",
|
checkpoint-312/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 645975704
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:062140058de69da2ff74416b96e510ff3ea8e3630e3cfa2414ae1fa5bed530bd
|
| 3 |
size 645975704
|
checkpoint-312/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1292087499
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a282343663fb90d99583879419e008ecf5ff31aa87f4664cfb11cd42543b327a
|
| 3 |
size 1292087499
|
checkpoint-312/rng_state_0.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 15429
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:60094a06d6d79464dba44020816cd1c2f7e2a5da0bd09c1e533ad3eddb688564
|
| 3 |
size 15429
|
checkpoint-312/rng_state_1.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 15429
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:785ee1730140ccaba6453ba94a5a713f346a9c29e9b86ce8e7c83f6634525222
|
| 3 |
size 15429
|
checkpoint-312/rng_state_2.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 15429
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8fc115052040f16323733a9ece8dd57daa47ec295a6c498facac0b395731b471
|
| 3 |
size 15429
|
checkpoint-312/rng_state_3.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 15429
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:49dc588331596d74bb4f1f27781ca80a1dfff453105267c466abff7513f86cff
|
| 3 |
size 15429
|
checkpoint-312/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7f4e419a1d6b526779af1dd5f4f57538634cc30d6affb1f8eceaed3cbe949aa4
|
| 3 |
size 1465
|
checkpoint-312/special_tokens_map.json
CHANGED
|
@@ -21,5 +21,11 @@
|
|
| 21 |
"rstrip": false,
|
| 22 |
"single_word": false
|
| 23 |
},
|
| 24 |
-
"pad_token":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
}
|
|
|
|
| 21 |
"rstrip": false,
|
| 22 |
"single_word": false
|
| 23 |
},
|
| 24 |
+
"pad_token": {
|
| 25 |
+
"content": "<|endoftext|>",
|
| 26 |
+
"lstrip": false,
|
| 27 |
+
"normalized": false,
|
| 28 |
+
"rstrip": false,
|
| 29 |
+
"single_word": false
|
| 30 |
+
}
|
| 31 |
}
|
checkpoint-312/tokenizer_config.json
CHANGED
|
@@ -200,7 +200,7 @@
|
|
| 200 |
"errors": "replace",
|
| 201 |
"extra_special_tokens": {},
|
| 202 |
"model_max_length": 131072,
|
| 203 |
-
"pad_token": "<|
|
| 204 |
"split_special_tokens": false,
|
| 205 |
"tokenizer_class": "Qwen2Tokenizer",
|
| 206 |
"unk_token": null
|
|
|
|
| 200 |
"errors": "replace",
|
| 201 |
"extra_special_tokens": {},
|
| 202 |
"model_max_length": 131072,
|
| 203 |
+
"pad_token": "<|endoftext|>",
|
| 204 |
"split_special_tokens": false,
|
| 205 |
"tokenizer_class": "Qwen2Tokenizer",
|
| 206 |
"unk_token": null
|
checkpoint-312/trainer_state.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch":
|
| 6 |
"eval_steps": 500,
|
| 7 |
"global_step": 312,
|
| 8 |
"is_hyper_param_search": false,
|
|
@@ -10,347 +10,146 @@
|
|
| 10 |
"is_world_process_zero": true,
|
| 11 |
"log_history": [
|
| 12 |
{
|
| 13 |
-
"epoch": 0.
|
| 14 |
-
"grad_norm":
|
| 15 |
-
"learning_rate": 0.
|
| 16 |
-
"loss":
|
| 17 |
-
"mean_token_accuracy": 0.
|
| 18 |
-
"num_tokens":
|
| 19 |
-
"step":
|
| 20 |
-
},
|
| 21 |
-
{
|
| 22 |
-
"epoch": 0.
|
| 23 |
-
"grad_norm": 0.
|
| 24 |
-
"learning_rate": 0.
|
| 25 |
-
"loss": 0.
|
| 26 |
-
"mean_token_accuracy": 0.
|
| 27 |
-
"num_tokens":
|
| 28 |
-
"step": 20
|
| 29 |
-
},
|
| 30 |
-
{
|
| 31 |
-
"epoch": 0.5783132530120482,
|
| 32 |
-
"grad_norm": 0.0,
|
| 33 |
-
"learning_rate": 0.0005077286477833616,
|
| 34 |
-
"loss": 452.9577,
|
| 35 |
-
"mean_token_accuracy": 0.05160275483503938,
|
| 36 |
-
"num_tokens": 3932160.0,
|
| 37 |
-
"step": 30
|
| 38 |
-
},
|
| 39 |
-
{
|
| 40 |
-
"epoch": 0.7710843373493976,
|
| 41 |
-
"grad_norm": 0.0,
|
| 42 |
-
"learning_rate": 0.0005031081504278389,
|
| 43 |
-
"loss": 470.5136,
|
| 44 |
-
"mean_token_accuracy": 0.03822226445190609,
|
| 45 |
-
"num_tokens": 5242880.0,
|
| 46 |
-
"step": 40
|
| 47 |
-
},
|
| 48 |
-
{
|
| 49 |
-
"epoch": 0.963855421686747,
|
| 50 |
-
"grad_norm": 26.303752899169922,
|
| 51 |
-
"learning_rate": 0.0004965277770447238,
|
| 52 |
-
"loss": 167.1384,
|
| 53 |
-
"mean_token_accuracy": 0.057517293840646744,
|
| 54 |
-
"num_tokens": 6553600.0,
|
| 55 |
"step": 50
|
| 56 |
},
|
| 57 |
{
|
| 58 |
-
"epoch":
|
| 59 |
-
"
|
| 60 |
-
"
|
| 61 |
-
"
|
| 62 |
-
"
|
| 63 |
-
"
|
| 64 |
-
"
|
| 65 |
-
"step": 52
|
| 66 |
-
},
|
| 67 |
-
{
|
| 68 |
-
"epoch": 1.1542168674698796,
|
| 69 |
-
"grad_norm": 28.755094528198242,
|
| 70 |
-
"learning_rate": 0.00048803961281790017,
|
| 71 |
-
"loss": 27.9726,
|
| 72 |
-
"mean_token_accuracy": 0.03075966710531259,
|
| 73 |
-
"num_tokens": 7761920.0,
|
| 74 |
-
"step": 60
|
| 75 |
},
|
| 76 |
{
|
| 77 |
-
"epoch":
|
| 78 |
-
"grad_norm": 0.
|
| 79 |
-
"learning_rate": 0.
|
| 80 |
-
"loss":
|
| 81 |
-
"mean_token_accuracy": 0.
|
| 82 |
-
"num_tokens":
|
| 83 |
-
"step": 70
|
| 84 |
-
},
|
| 85 |
-
{
|
| 86 |
-
"epoch": 1.5397590361445783,
|
| 87 |
-
"grad_norm": 0.36548200249671936,
|
| 88 |
-
"learning_rate": 0.0004656232238159615,
|
| 89 |
-
"loss": 60.0031,
|
| 90 |
-
"mean_token_accuracy": 0.10124717205762863,
|
| 91 |
-
"num_tokens": 10383360.0,
|
| 92 |
-
"step": 80
|
| 93 |
-
},
|
| 94 |
-
{
|
| 95 |
-
"epoch": 1.7325301204819277,
|
| 96 |
-
"grad_norm": 0.8749092817306519,
|
| 97 |
-
"learning_rate": 0.0004518724299669051,
|
| 98 |
-
"loss": 0.8994,
|
| 99 |
-
"mean_token_accuracy": 0.1551567144691944,
|
| 100 |
-
"num_tokens": 11694080.0,
|
| 101 |
-
"step": 90
|
| 102 |
-
},
|
| 103 |
-
{
|
| 104 |
-
"epoch": 1.9253012048192772,
|
| 105 |
-
"grad_norm": 0.0,
|
| 106 |
-
"learning_rate": 0.0004365673027192623,
|
| 107 |
-
"loss": 2.2759,
|
| 108 |
-
"mean_token_accuracy": 0.13096993789076805,
|
| 109 |
-
"num_tokens": 13004800.0,
|
| 110 |
"step": 100
|
| 111 |
},
|
| 112 |
{
|
| 113 |
-
"epoch":
|
| 114 |
-
"eval_loss":
|
| 115 |
-
"eval_mean_token_accuracy": 0.
|
| 116 |
-
"eval_num_tokens":
|
| 117 |
-
"eval_runtime":
|
| 118 |
-
"eval_samples_per_second":
|
| 119 |
-
"eval_steps_per_second":
|
| 120 |
"step": 104
|
| 121 |
},
|
| 122 |
{
|
| 123 |
-
"epoch":
|
| 124 |
-
"grad_norm": 0.
|
| 125 |
-
"learning_rate": 0.
|
| 126 |
-
"loss": 0.
|
| 127 |
-
"mean_token_accuracy": 0.
|
| 128 |
-
"num_tokens":
|
| 129 |
-
"step":
|
| 130 |
},
|
| 131 |
{
|
| 132 |
-
"epoch":
|
| 133 |
-
"grad_norm": 0.
|
| 134 |
-
"learning_rate": 0.
|
| 135 |
-
"loss":
|
| 136 |
-
"mean_token_accuracy": 0.
|
| 137 |
-
"num_tokens":
|
| 138 |
-
"step": 120
|
| 139 |
-
},
|
| 140 |
-
{
|
| 141 |
-
"epoch": 2.5012048192771084,
|
| 142 |
-
"grad_norm": 27.594745635986328,
|
| 143 |
-
"learning_rate": 0.0003825930284374996,
|
| 144 |
-
"loss": 0.0836,
|
| 145 |
-
"mean_token_accuracy": 0.07201291918754578,
|
| 146 |
-
"num_tokens": 16834560.0,
|
| 147 |
-
"step": 130
|
| 148 |
-
},
|
| 149 |
-
{
|
| 150 |
-
"epoch": 2.693975903614458,
|
| 151 |
-
"grad_norm": 0.0,
|
| 152 |
-
"learning_rate": 0.00036239011942476655,
|
| 153 |
-
"loss": 1.364,
|
| 154 |
-
"mean_token_accuracy": 0.15817394778132438,
|
| 155 |
-
"num_tokens": 18145280.0,
|
| 156 |
-
"step": 140
|
| 157 |
-
},
|
| 158 |
-
{
|
| 159 |
-
"epoch": 2.886746987951807,
|
| 160 |
-
"grad_norm": 0.0,
|
| 161 |
-
"learning_rate": 0.00034134115028725524,
|
| 162 |
-
"loss": 3.5977,
|
| 163 |
-
"mean_token_accuracy": 0.10589548945426941,
|
| 164 |
-
"num_tokens": 19456000.0,
|
| 165 |
"step": 150
|
| 166 |
},
|
| 167 |
{
|
| 168 |
-
"epoch":
|
| 169 |
-
"
|
| 170 |
-
"
|
| 171 |
-
"
|
| 172 |
-
"
|
| 173 |
-
"
|
| 174 |
-
"
|
| 175 |
-
"step": 156
|
| 176 |
-
},
|
| 177 |
-
{
|
| 178 |
-
"epoch": 3.07710843373494,
|
| 179 |
-
"grad_norm": 0.1902359277009964,
|
| 180 |
-
"learning_rate": 0.0003196127285051592,
|
| 181 |
-
"loss": 8.9424,
|
| 182 |
-
"mean_token_accuracy": 0.062061098557484304,
|
| 183 |
-
"num_tokens": 20664320.0,
|
| 184 |
-
"step": 160
|
| 185 |
-
},
|
| 186 |
-
{
|
| 187 |
-
"epoch": 3.269879518072289,
|
| 188 |
-
"grad_norm": 0.3390277326107025,
|
| 189 |
-
"learning_rate": 0.00029737683958418377,
|
| 190 |
-
"loss": 12.1371,
|
| 191 |
-
"mean_token_accuracy": 0.07337962239980697,
|
| 192 |
-
"num_tokens": 21975040.0,
|
| 193 |
-
"step": 170
|
| 194 |
-
},
|
| 195 |
-
{
|
| 196 |
-
"epoch": 3.4626506024096386,
|
| 197 |
-
"grad_norm": 0.0,
|
| 198 |
-
"learning_rate": 0.00027480948575031854,
|
| 199 |
-
"loss": 42.6417,
|
| 200 |
-
"mean_token_accuracy": 0.08556168600916862,
|
| 201 |
-
"num_tokens": 23285760.0,
|
| 202 |
-
"step": 180
|
| 203 |
},
|
| 204 |
{
|
| 205 |
-
"epoch":
|
| 206 |
-
"grad_norm": 0.
|
| 207 |
-
"learning_rate": 0.
|
| 208 |
-
"loss":
|
| 209 |
-
"mean_token_accuracy": 0.
|
| 210 |
-
"num_tokens":
|
| 211 |
-
"step": 190
|
| 212 |
-
},
|
| 213 |
-
{
|
| 214 |
-
"epoch": 3.8481927710843373,
|
| 215 |
-
"grad_norm": 0.19443857669830322,
|
| 216 |
-
"learning_rate": 0.0002293960964917063,
|
| 217 |
-
"loss": 0.3356,
|
| 218 |
-
"mean_token_accuracy": 0.17251307517290115,
|
| 219 |
-
"num_tokens": 25907200.0,
|
| 220 |
"step": 200
|
| 221 |
},
|
| 222 |
{
|
| 223 |
-
"epoch":
|
| 224 |
-
"eval_loss":
|
| 225 |
-
"eval_mean_token_accuracy": 0.
|
| 226 |
-
"eval_num_tokens":
|
| 227 |
-
"eval_runtime":
|
| 228 |
-
"eval_samples_per_second":
|
| 229 |
-
"eval_steps_per_second":
|
| 230 |
"step": 208
|
| 231 |
},
|
| 232 |
{
|
| 233 |
-
"epoch":
|
| 234 |
-
"grad_norm": 0.
|
| 235 |
-
"learning_rate": 0.
|
| 236 |
-
"loss": 0.
|
| 237 |
-
"mean_token_accuracy": 0.
|
| 238 |
-
"num_tokens":
|
| 239 |
-
"step":
|
| 240 |
-
},
|
| 241 |
-
{
|
| 242 |
-
"epoch": 4.231325301204819,
|
| 243 |
-
"grad_norm": 0.0,
|
| 244 |
-
"learning_rate": 0.0001848075456397883,
|
| 245 |
-
"loss": 0.3988,
|
| 246 |
-
"mean_token_accuracy": 0.12297056466341019,
|
| 247 |
-
"num_tokens": 28426240.0,
|
| 248 |
-
"step": 220
|
| 249 |
-
},
|
| 250 |
-
{
|
| 251 |
-
"epoch": 4.424096385542168,
|
| 252 |
-
"grad_norm": 0.007613173220306635,
|
| 253 |
-
"learning_rate": 0.00016326511988497662,
|
| 254 |
-
"loss": 0.0274,
|
| 255 |
-
"mean_token_accuracy": 0.11160993352532386,
|
| 256 |
-
"num_tokens": 29736960.0,
|
| 257 |
-
"step": 230
|
| 258 |
},
|
| 259 |
{
|
| 260 |
-
"epoch":
|
| 261 |
-
"grad_norm": 0.
|
| 262 |
-
"learning_rate": 0.
|
| 263 |
-
"loss":
|
| 264 |
-
"mean_token_accuracy": 0.
|
| 265 |
-
"num_tokens":
|
| 266 |
-
"step": 240
|
| 267 |
-
},
|
| 268 |
-
{
|
| 269 |
-
"epoch": 4.809638554216868,
|
| 270 |
-
"grad_norm": 0.26247891783714294,
|
| 271 |
-
"learning_rate": 0.00012253518458496144,
|
| 272 |
-
"loss": 0.2528,
|
| 273 |
-
"mean_token_accuracy": 0.09861943274736404,
|
| 274 |
-
"num_tokens": 32358400.0,
|
| 275 |
"step": 250
|
| 276 |
},
|
| 277 |
{
|
| 278 |
-
"epoch":
|
| 279 |
-
"grad_norm": 0.
|
| 280 |
-
"learning_rate": 0.
|
| 281 |
-
"loss": 0.
|
| 282 |
-
"mean_token_accuracy": 0.
|
| 283 |
-
"num_tokens":
|
| 284 |
-
"step":
|
| 285 |
-
},
|
| 286 |
-
{
|
| 287 |
-
"epoch": 5.0,
|
| 288 |
-
"eval_loss": NaN,
|
| 289 |
-
"eval_mean_token_accuracy": 0.14629162118789998,
|
| 290 |
-
"eval_num_tokens": 33566720.0,
|
| 291 |
-
"eval_runtime": 9.0068,
|
| 292 |
-
"eval_samples_per_second": 40.969,
|
| 293 |
-
"eval_steps_per_second": 5.218,
|
| 294 |
-
"step": 260
|
| 295 |
-
},
|
| 296 |
-
{
|
| 297 |
-
"epoch": 5.192771084337349,
|
| 298 |
-
"grad_norm": 0.05429690331220627,
|
| 299 |
-
"learning_rate": 8.600670852105292e-05,
|
| 300 |
-
"loss": 1.0623,
|
| 301 |
-
"mean_token_accuracy": 0.09910124614834785,
|
| 302 |
-
"num_tokens": 34877440.0,
|
| 303 |
-
"step": 270
|
| 304 |
-
},
|
| 305 |
-
{
|
| 306 |
-
"epoch": 5.385542168674699,
|
| 307 |
-
"grad_norm": 0.0,
|
| 308 |
-
"learning_rate": 6.968493387697466e-05,
|
| 309 |
-
"loss": 1.6628,
|
| 310 |
-
"mean_token_accuracy": 0.12442896366119385,
|
| 311 |
-
"num_tokens": 36188160.0,
|
| 312 |
-
"step": 280
|
| 313 |
-
},
|
| 314 |
-
{
|
| 315 |
-
"epoch": 5.578313253012048,
|
| 316 |
-
"grad_norm": 0.01778862252831459,
|
| 317 |
-
"learning_rate": 5.483392864428595e-05,
|
| 318 |
-
"loss": 0.027,
|
| 319 |
-
"mean_token_accuracy": 0.14972642660140992,
|
| 320 |
-
"num_tokens": 37498880.0,
|
| 321 |
-
"step": 290
|
| 322 |
},
|
| 323 |
{
|
| 324 |
-
"epoch":
|
| 325 |
-
"grad_norm": 0.
|
| 326 |
-
"learning_rate":
|
| 327 |
-
"loss":
|
| 328 |
-
"mean_token_accuracy": 0.
|
| 329 |
-
"num_tokens":
|
| 330 |
"step": 300
|
| 331 |
},
|
| 332 |
{
|
| 333 |
-
"epoch":
|
| 334 |
-
"
|
| 335 |
-
"
|
| 336 |
-
"
|
| 337 |
-
"
|
| 338 |
-
"
|
| 339 |
-
"
|
| 340 |
-
},
|
| 341 |
-
{
|
| 342 |
-
"epoch": 6.0,
|
| 343 |
-
"eval_loss": NaN,
|
| 344 |
-
"eval_mean_token_accuracy": 0.14657345097115698,
|
| 345 |
-
"eval_num_tokens": 40280064.0,
|
| 346 |
-
"eval_runtime": 8.9922,
|
| 347 |
-
"eval_samples_per_second": 41.036,
|
| 348 |
-
"eval_steps_per_second": 5.227,
|
| 349 |
"step": 312
|
| 350 |
}
|
| 351 |
],
|
| 352 |
-
"logging_steps":
|
| 353 |
-
"max_steps":
|
| 354 |
"num_input_tokens_seen": 0,
|
| 355 |
"num_train_epochs": 7,
|
| 356 |
"save_steps": 500,
|
|
@@ -366,7 +165,7 @@
|
|
| 366 |
"attributes": {}
|
| 367 |
}
|
| 368 |
},
|
| 369 |
-
"total_flos":
|
| 370 |
"train_batch_size": 2,
|
| 371 |
"trial_name": null,
|
| 372 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 3.0,
|
| 6 |
"eval_steps": 500,
|
| 7 |
"global_step": 312,
|
| 8 |
"is_hyper_param_search": false,
|
|
|
|
| 10 |
"is_world_process_zero": true,
|
| 11 |
"log_history": [
|
| 12 |
{
|
| 13 |
+
"epoch": 0.24096385542168675,
|
| 14 |
+
"grad_norm": 0.2236759215593338,
|
| 15 |
+
"learning_rate": 0.000511,
|
| 16 |
+
"loss": 0.4204,
|
| 17 |
+
"mean_token_accuracy": 0.900120057463646,
|
| 18 |
+
"num_tokens": 567991.0,
|
| 19 |
+
"step": 25
|
| 20 |
+
},
|
| 21 |
+
{
|
| 22 |
+
"epoch": 0.4819277108433735,
|
| 23 |
+
"grad_norm": 0.1322442889213562,
|
| 24 |
+
"learning_rate": 0.000511,
|
| 25 |
+
"loss": 0.2913,
|
| 26 |
+
"mean_token_accuracy": 0.9270081639289856,
|
| 27 |
+
"num_tokens": 1135343.0,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
"step": 50
|
| 29 |
},
|
| 30 |
{
|
| 31 |
+
"epoch": 0.7228915662650602,
|
| 32 |
+
"grad_norm": 0.19739408791065216,
|
| 33 |
+
"learning_rate": 0.000511,
|
| 34 |
+
"loss": 0.2186,
|
| 35 |
+
"mean_token_accuracy": 0.9418566429615021,
|
| 36 |
+
"num_tokens": 1703784.0,
|
| 37 |
+
"step": 75
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
},
|
| 39 |
{
|
| 40 |
+
"epoch": 0.963855421686747,
|
| 41 |
+
"grad_norm": 0.17215745151042938,
|
| 42 |
+
"learning_rate": 0.000511,
|
| 43 |
+
"loss": 0.1963,
|
| 44 |
+
"mean_token_accuracy": 0.9479192215204238,
|
| 45 |
+
"num_tokens": 2269891.0,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
"step": 100
|
| 47 |
},
|
| 48 |
{
|
| 49 |
+
"epoch": 1.0,
|
| 50 |
+
"eval_loss": 0.19681453704833984,
|
| 51 |
+
"eval_mean_token_accuracy": 0.9478744319144715,
|
| 52 |
+
"eval_num_tokens": 2345494.0,
|
| 53 |
+
"eval_runtime": 4.3187,
|
| 54 |
+
"eval_samples_per_second": 85.442,
|
| 55 |
+
"eval_steps_per_second": 10.883,
|
| 56 |
"step": 104
|
| 57 |
},
|
| 58 |
{
|
| 59 |
+
"epoch": 1.202409638554217,
|
| 60 |
+
"grad_norm": 0.10346771776676178,
|
| 61 |
+
"learning_rate": 0.000511,
|
| 62 |
+
"loss": 0.165,
|
| 63 |
+
"mean_token_accuracy": 0.9550067053900825,
|
| 64 |
+
"num_tokens": 2836234.0,
|
| 65 |
+
"step": 125
|
| 66 |
},
|
| 67 |
{
|
| 68 |
+
"epoch": 1.4433734939759035,
|
| 69 |
+
"grad_norm": 0.0941459909081459,
|
| 70 |
+
"learning_rate": 0.000511,
|
| 71 |
+
"loss": 0.1445,
|
| 72 |
+
"mean_token_accuracy": 0.9606501096487046,
|
| 73 |
+
"num_tokens": 3403671.0,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
"step": 150
|
| 75 |
},
|
| 76 |
{
|
| 77 |
+
"epoch": 1.6843373493975904,
|
| 78 |
+
"grad_norm": 0.07419874519109726,
|
| 79 |
+
"learning_rate": 0.000511,
|
| 80 |
+
"loss": 0.1184,
|
| 81 |
+
"mean_token_accuracy": 0.9665295648574829,
|
| 82 |
+
"num_tokens": 3972278.0,
|
| 83 |
+
"step": 175
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
},
|
| 85 |
{
|
| 86 |
+
"epoch": 1.9253012048192772,
|
| 87 |
+
"grad_norm": 0.08383649587631226,
|
| 88 |
+
"learning_rate": 0.000511,
|
| 89 |
+
"loss": 0.1309,
|
| 90 |
+
"mean_token_accuracy": 0.9640201306343079,
|
| 91 |
+
"num_tokens": 4538970.0,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
"step": 200
|
| 93 |
},
|
| 94 |
{
|
| 95 |
+
"epoch": 2.0,
|
| 96 |
+
"eval_loss": 0.16037927567958832,
|
| 97 |
+
"eval_mean_token_accuracy": 0.9557588328706458,
|
| 98 |
+
"eval_num_tokens": 4690728.0,
|
| 99 |
+
"eval_runtime": 4.2478,
|
| 100 |
+
"eval_samples_per_second": 86.868,
|
| 101 |
+
"eval_steps_per_second": 11.065,
|
| 102 |
"step": 208
|
| 103 |
},
|
| 104 |
{
|
| 105 |
+
"epoch": 2.163855421686747,
|
| 106 |
+
"grad_norm": 0.09131479263305664,
|
| 107 |
+
"learning_rate": 0.000511,
|
| 108 |
+
"loss": 0.1127,
|
| 109 |
+
"mean_token_accuracy": 0.9709722676662483,
|
| 110 |
+
"num_tokens": 5091564.0,
|
| 111 |
+
"step": 225
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
},
|
| 113 |
{
|
| 114 |
+
"epoch": 2.404819277108434,
|
| 115 |
+
"grad_norm": 0.09491455554962158,
|
| 116 |
+
"learning_rate": 0.000511,
|
| 117 |
+
"loss": 0.1007,
|
| 118 |
+
"mean_token_accuracy": 0.9710033702850341,
|
| 119 |
+
"num_tokens": 5659070.0,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
"step": 250
|
| 121 |
},
|
| 122 |
{
|
| 123 |
+
"epoch": 2.6457831325301204,
|
| 124 |
+
"grad_norm": 0.07198868691921234,
|
| 125 |
+
"learning_rate": 0.000511,
|
| 126 |
+
"loss": 0.0858,
|
| 127 |
+
"mean_token_accuracy": 0.9747626584768295,
|
| 128 |
+
"num_tokens": 6228488.0,
|
| 129 |
+
"step": 275
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
},
|
| 131 |
{
|
| 132 |
+
"epoch": 2.886746987951807,
|
| 133 |
+
"grad_norm": 0.07914356887340546,
|
| 134 |
+
"learning_rate": 0.000511,
|
| 135 |
+
"loss": 0.0961,
|
| 136 |
+
"mean_token_accuracy": 0.9724871903657913,
|
| 137 |
+
"num_tokens": 6795848.0,
|
| 138 |
"step": 300
|
| 139 |
},
|
| 140 |
{
|
| 141 |
+
"epoch": 3.0,
|
| 142 |
+
"eval_loss": 0.16401147842407227,
|
| 143 |
+
"eval_mean_token_accuracy": 0.958565741143328,
|
| 144 |
+
"eval_num_tokens": 7036588.0,
|
| 145 |
+
"eval_runtime": 4.2348,
|
| 146 |
+
"eval_samples_per_second": 87.135,
|
| 147 |
+
"eval_steps_per_second": 11.099,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
"step": 312
|
| 149 |
}
|
| 150 |
],
|
| 151 |
+
"logging_steps": 25,
|
| 152 |
+
"max_steps": 728,
|
| 153 |
"num_input_tokens_seen": 0,
|
| 154 |
"num_train_epochs": 7,
|
| 155 |
"save_steps": 500,
|
|
|
|
| 165 |
"attributes": {}
|
| 166 |
}
|
| 167 |
},
|
| 168 |
+
"total_flos": 3.077171409898701e+17,
|
| 169 |
"train_batch_size": 2,
|
| 170 |
"trial_name": null,
|
| 171 |
"trial_params": null
|
checkpoint-312/training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 6097
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a92f33d5ca39ba292c6b171cffeb00f4d1c361214bebb8604f8ce3482d3b7c8c
|
| 3 |
size 6097
|
checkpoint-416/README.md
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: Qwen/Qwen2.5-7B-Instruct
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
|
| 7 |
+
- lora
|
| 8 |
+
- sft
|
| 9 |
+
- transformers
|
| 10 |
+
- trl
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# Model Card for Model ID
|
| 14 |
+
|
| 15 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
## Model Details
|
| 20 |
+
|
| 21 |
+
### Model Description
|
| 22 |
+
|
| 23 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
- **Developed by:** [More Information Needed]
|
| 28 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 29 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 30 |
+
- **Model type:** [More Information Needed]
|
| 31 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 32 |
+
- **License:** [More Information Needed]
|
| 33 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 34 |
+
|
| 35 |
+
### Model Sources [optional]
|
| 36 |
+
|
| 37 |
+
<!-- Provide the basic links for the model. -->
|
| 38 |
+
|
| 39 |
+
- **Repository:** [More Information Needed]
|
| 40 |
+
- **Paper [optional]:** [More Information Needed]
|
| 41 |
+
- **Demo [optional]:** [More Information Needed]
|
| 42 |
+
|
| 43 |
+
## Uses
|
| 44 |
+
|
| 45 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 46 |
+
|
| 47 |
+
### Direct Use
|
| 48 |
+
|
| 49 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 50 |
+
|
| 51 |
+
[More Information Needed]
|
| 52 |
+
|
| 53 |
+
### Downstream Use [optional]
|
| 54 |
+
|
| 55 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 56 |
+
|
| 57 |
+
[More Information Needed]
|
| 58 |
+
|
| 59 |
+
### Out-of-Scope Use
|
| 60 |
+
|
| 61 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 62 |
+
|
| 63 |
+
[More Information Needed]
|
| 64 |
+
|
| 65 |
+
## Bias, Risks, and Limitations
|
| 66 |
+
|
| 67 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 68 |
+
|
| 69 |
+
[More Information Needed]
|
| 70 |
+
|
| 71 |
+
### Recommendations
|
| 72 |
+
|
| 73 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 74 |
+
|
| 75 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 76 |
+
|
| 77 |
+
## How to Get Started with the Model
|
| 78 |
+
|
| 79 |
+
Use the code below to get started with the model.
|
| 80 |
+
|
| 81 |
+
[More Information Needed]
|
| 82 |
+
|
| 83 |
+
## Training Details
|
| 84 |
+
|
| 85 |
+
### Training Data
|
| 86 |
+
|
| 87 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 88 |
+
|
| 89 |
+
[More Information Needed]
|
| 90 |
+
|
| 91 |
+
### Training Procedure
|
| 92 |
+
|
| 93 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 94 |
+
|
| 95 |
+
#### Preprocessing [optional]
|
| 96 |
+
|
| 97 |
+
[More Information Needed]
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
#### Training Hyperparameters
|
| 101 |
+
|
| 102 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 103 |
+
|
| 104 |
+
#### Speeds, Sizes, Times [optional]
|
| 105 |
+
|
| 106 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 107 |
+
|
| 108 |
+
[More Information Needed]
|
| 109 |
+
|
| 110 |
+
## Evaluation
|
| 111 |
+
|
| 112 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 113 |
+
|
| 114 |
+
### Testing Data, Factors & Metrics
|
| 115 |
+
|
| 116 |
+
#### Testing Data
|
| 117 |
+
|
| 118 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 119 |
+
|
| 120 |
+
[More Information Needed]
|
| 121 |
+
|
| 122 |
+
#### Factors
|
| 123 |
+
|
| 124 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 125 |
+
|
| 126 |
+
[More Information Needed]
|
| 127 |
+
|
| 128 |
+
#### Metrics
|
| 129 |
+
|
| 130 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 131 |
+
|
| 132 |
+
[More Information Needed]
|
| 133 |
+
|
| 134 |
+
### Results
|
| 135 |
+
|
| 136 |
+
[More Information Needed]
|
| 137 |
+
|
| 138 |
+
#### Summary
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
## Model Examination [optional]
|
| 143 |
+
|
| 144 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 145 |
+
|
| 146 |
+
[More Information Needed]
|
| 147 |
+
|
| 148 |
+
## Environmental Impact
|
| 149 |
+
|
| 150 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 151 |
+
|
| 152 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 153 |
+
|
| 154 |
+
- **Hardware Type:** [More Information Needed]
|
| 155 |
+
- **Hours used:** [More Information Needed]
|
| 156 |
+
- **Cloud Provider:** [More Information Needed]
|
| 157 |
+
- **Compute Region:** [More Information Needed]
|
| 158 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 159 |
+
|
| 160 |
+
## Technical Specifications [optional]
|
| 161 |
+
|
| 162 |
+
### Model Architecture and Objective
|
| 163 |
+
|
| 164 |
+
[More Information Needed]
|
| 165 |
+
|
| 166 |
+
### Compute Infrastructure
|
| 167 |
+
|
| 168 |
+
[More Information Needed]
|
| 169 |
+
|
| 170 |
+
#### Hardware
|
| 171 |
+
|
| 172 |
+
[More Information Needed]
|
| 173 |
+
|
| 174 |
+
#### Software
|
| 175 |
+
|
| 176 |
+
[More Information Needed]
|
| 177 |
+
|
| 178 |
+
## Citation [optional]
|
| 179 |
+
|
| 180 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 181 |
+
|
| 182 |
+
**BibTeX:**
|
| 183 |
+
|
| 184 |
+
[More Information Needed]
|
| 185 |
+
|
| 186 |
+
**APA:**
|
| 187 |
+
|
| 188 |
+
[More Information Needed]
|
| 189 |
+
|
| 190 |
+
## Glossary [optional]
|
| 191 |
+
|
| 192 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 193 |
+
|
| 194 |
+
[More Information Needed]
|
| 195 |
+
|
| 196 |
+
## More Information [optional]
|
| 197 |
+
|
| 198 |
+
[More Information Needed]
|
| 199 |
+
|
| 200 |
+
## Model Card Authors [optional]
|
| 201 |
+
|
| 202 |
+
[More Information Needed]
|
| 203 |
+
|
| 204 |
+
## Model Card Contact
|
| 205 |
+
|
| 206 |
+
[More Information Needed]
|
| 207 |
+
### Framework versions
|
| 208 |
+
|
| 209 |
+
- PEFT 0.17.0
|
checkpoint-416/adapter_config.json
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alpha_pattern": {},
|
| 3 |
+
"auto_mapping": null,
|
| 4 |
+
"base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
|
| 5 |
+
"bias": "none",
|
| 6 |
+
"corda_config": null,
|
| 7 |
+
"eva_config": null,
|
| 8 |
+
"exclude_modules": null,
|
| 9 |
+
"fan_in_fan_out": false,
|
| 10 |
+
"inference_mode": true,
|
| 11 |
+
"init_lora_weights": true,
|
| 12 |
+
"layer_replication": null,
|
| 13 |
+
"layers_pattern": null,
|
| 14 |
+
"layers_to_transform": null,
|
| 15 |
+
"loftq_config": {},
|
| 16 |
+
"lora_alpha": 16,
|
| 17 |
+
"lora_bias": false,
|
| 18 |
+
"lora_dropout": 0.1,
|
| 19 |
+
"megatron_config": null,
|
| 20 |
+
"megatron_core": "megatron.core",
|
| 21 |
+
"modules_to_save": null,
|
| 22 |
+
"peft_type": "LORA",
|
| 23 |
+
"qalora_group_size": 16,
|
| 24 |
+
"r": 64,
|
| 25 |
+
"rank_pattern": {},
|
| 26 |
+
"revision": null,
|
| 27 |
+
"target_modules": [
|
| 28 |
+
"down_proj",
|
| 29 |
+
"k_proj",
|
| 30 |
+
"o_proj",
|
| 31 |
+
"q_proj",
|
| 32 |
+
"v_proj",
|
| 33 |
+
"up_proj",
|
| 34 |
+
"gate_proj"
|
| 35 |
+
],
|
| 36 |
+
"target_parameters": null,
|
| 37 |
+
"task_type": "CAUSAL_LM",
|
| 38 |
+
"trainable_token_indices": null,
|
| 39 |
+
"use_dora": false,
|
| 40 |
+
"use_qalora": false,
|
| 41 |
+
"use_rslora": false
|
| 42 |
+
}
|
checkpoint-416/adapter_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:542d542fbe24ec80310418b793ff4ba1972ae6587fdf5669491fc92c83b08a09
|
| 3 |
+
size 645975704
|
checkpoint-416/added_tokens.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"</tool_call>": 151658,
|
| 3 |
+
"<tool_call>": 151657,
|
| 4 |
+
"<|box_end|>": 151649,
|
| 5 |
+
"<|box_start|>": 151648,
|
| 6 |
+
"<|endoftext|>": 151643,
|
| 7 |
+
"<|file_sep|>": 151664,
|
| 8 |
+
"<|fim_middle|>": 151660,
|
| 9 |
+
"<|fim_pad|>": 151662,
|
| 10 |
+
"<|fim_prefix|>": 151659,
|
| 11 |
+
"<|fim_suffix|>": 151661,
|
| 12 |
+
"<|im_end|>": 151645,
|
| 13 |
+
"<|im_start|>": 151644,
|
| 14 |
+
"<|image_pad|>": 151655,
|
| 15 |
+
"<|object_ref_end|>": 151647,
|
| 16 |
+
"<|object_ref_start|>": 151646,
|
| 17 |
+
"<|quad_end|>": 151651,
|
| 18 |
+
"<|quad_start|>": 151650,
|
| 19 |
+
"<|repo_name|>": 151663,
|
| 20 |
+
"<|video_pad|>": 151656,
|
| 21 |
+
"<|vision_end|>": 151653,
|
| 22 |
+
"<|vision_pad|>": 151654,
|
| 23 |
+
"<|vision_start|>": 151652
|
| 24 |
+
}
|
checkpoint-416/chat_template.jinja
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{%- if tools %}
|
| 2 |
+
{{- '<|im_start|>system\n' }}
|
| 3 |
+
{%- if messages[0]['role'] == 'system' %}
|
| 4 |
+
{{- messages[0]['content'] }}
|
| 5 |
+
{%- else %}
|
| 6 |
+
{{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
|
| 7 |
+
{%- endif %}
|
| 8 |
+
{{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
| 9 |
+
{%- for tool in tools %}
|
| 10 |
+
{{- "\n" }}
|
| 11 |
+
{{- tool | tojson }}
|
| 12 |
+
{%- endfor %}
|
| 13 |
+
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
| 14 |
+
{%- else %}
|
| 15 |
+
{%- if messages[0]['role'] == 'system' %}
|
| 16 |
+
{{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
|
| 17 |
+
{%- else %}
|
| 18 |
+
{{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
|
| 19 |
+
{%- endif %}
|
| 20 |
+
{%- endif %}
|
| 21 |
+
{%- for message in messages %}
|
| 22 |
+
{%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
|
| 23 |
+
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
| 24 |
+
{%- elif message.role == "assistant" %}
|
| 25 |
+
{{- '<|im_start|>' + message.role }}
|
| 26 |
+
{%- if message.content %}
|
| 27 |
+
{{- '\n' + message.content }}
|
| 28 |
+
{%- endif %}
|
| 29 |
+
{%- for tool_call in message.tool_calls %}
|
| 30 |
+
{%- if tool_call.function is defined %}
|
| 31 |
+
{%- set tool_call = tool_call.function %}
|
| 32 |
+
{%- endif %}
|
| 33 |
+
{{- '\n<tool_call>\n{"name": "' }}
|
| 34 |
+
{{- tool_call.name }}
|
| 35 |
+
{{- '", "arguments": ' }}
|
| 36 |
+
{{- tool_call.arguments | tojson }}
|
| 37 |
+
{{- '}\n</tool_call>' }}
|
| 38 |
+
{%- endfor %}
|
| 39 |
+
{{- '<|im_end|>\n' }}
|
| 40 |
+
{%- elif message.role == "tool" %}
|
| 41 |
+
{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
|
| 42 |
+
{{- '<|im_start|>user' }}
|
| 43 |
+
{%- endif %}
|
| 44 |
+
{{- '\n<tool_response>\n' }}
|
| 45 |
+
{{- message.content }}
|
| 46 |
+
{{- '\n</tool_response>' }}
|
| 47 |
+
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
| 48 |
+
{{- '<|im_end|>\n' }}
|
| 49 |
+
{%- endif %}
|
| 50 |
+
{%- endif %}
|
| 51 |
+
{%- endfor %}
|
| 52 |
+
{%- if add_generation_prompt %}
|
| 53 |
+
{{- '<|im_start|>assistant\n' }}
|
| 54 |
+
{%- endif %}
|
checkpoint-416/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
checkpoint-416/optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7b165c5df29ba238a2a6fe6aee452efc2a9acfdba35ea32f0b467fd2d02c5353
|
| 3 |
+
size 1292087499
|
checkpoint-416/rng_state_0.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:152c5038eb73ee59d2dde5d5b103ac1fbd66c3a40e654eb5c0300cac7dbc116d
|
| 3 |
+
size 15429
|
checkpoint-416/rng_state_1.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4bf88dea0c41ff8af9c7036f185396537d2c81ba9abbc7a1f1b60ece0652d2ad
|
| 3 |
+
size 15429
|
checkpoint-416/rng_state_2.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8efe00d663f3c5a76caedcb1606763b501b981c15ca59077f9933614d1cf693e
|
| 3 |
+
size 15429
|