finalform commited on
Commit
d0b9c22
·
verified ·
1 Parent(s): 90431f1

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +4 -0
  2. README.md +4 -3
  3. adapter_config.json +5 -5
  4. adapter_model.safetensors +1 -1
  5. checkpoint-104/adapter_config.json +6 -6
  6. checkpoint-104/adapter_model.safetensors +1 -1
  7. checkpoint-104/optimizer.pt +1 -1
  8. checkpoint-104/rng_state_0.pth +1 -1
  9. checkpoint-104/rng_state_1.pth +1 -1
  10. checkpoint-104/rng_state_2.pth +1 -1
  11. checkpoint-104/rng_state_3.pth +1 -1
  12. checkpoint-104/scheduler.pt +2 -2
  13. checkpoint-104/special_tokens_map.json +7 -1
  14. checkpoint-104/tokenizer_config.json +1 -1
  15. checkpoint-104/trainer_state.json +37 -101
  16. checkpoint-104/training_args.bin +1 -1
  17. checkpoint-208/adapter_config.json +6 -6
  18. checkpoint-208/adapter_model.safetensors +1 -1
  19. checkpoint-208/optimizer.pt +1 -1
  20. checkpoint-208/rng_state_0.pth +1 -1
  21. checkpoint-208/rng_state_1.pth +1 -1
  22. checkpoint-208/rng_state_2.pth +1 -1
  23. checkpoint-208/rng_state_3.pth +1 -1
  24. checkpoint-208/scheduler.pt +2 -2
  25. checkpoint-208/special_tokens_map.json +7 -1
  26. checkpoint-208/tokenizer_config.json +1 -1
  27. checkpoint-208/trainer_state.json +72 -200
  28. checkpoint-208/training_args.bin +1 -1
  29. checkpoint-312/adapter_config.json +6 -6
  30. checkpoint-312/adapter_model.safetensors +1 -1
  31. checkpoint-312/optimizer.pt +1 -1
  32. checkpoint-312/rng_state_0.pth +1 -1
  33. checkpoint-312/rng_state_1.pth +1 -1
  34. checkpoint-312/rng_state_2.pth +1 -1
  35. checkpoint-312/rng_state_3.pth +1 -1
  36. checkpoint-312/scheduler.pt +1 -1
  37. checkpoint-312/special_tokens_map.json +7 -1
  38. checkpoint-312/tokenizer_config.json +1 -1
  39. checkpoint-312/trainer_state.json +105 -306
  40. checkpoint-312/training_args.bin +1 -1
  41. checkpoint-416/README.md +209 -0
  42. checkpoint-416/adapter_config.json +42 -0
  43. checkpoint-416/adapter_model.safetensors +3 -0
  44. checkpoint-416/added_tokens.json +24 -0
  45. checkpoint-416/chat_template.jinja +54 -0
  46. checkpoint-416/merges.txt +0 -0
  47. checkpoint-416/optimizer.pt +3 -0
  48. checkpoint-416/rng_state_0.pth +3 -0
  49. checkpoint-416/rng_state_1.pth +3 -0
  50. checkpoint-416/rng_state_2.pth +3 -0
.gitattributes CHANGED
@@ -41,3 +41,7 @@ checkpoint-312/tokenizer.json filter=lfs diff=lfs merge=lfs -text
41
  checkpoint-52/tokenizer.json filter=lfs diff=lfs merge=lfs -text
42
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
43
  checkpoint-364/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
41
  checkpoint-52/tokenizer.json filter=lfs diff=lfs merge=lfs -text
42
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
43
  checkpoint-364/tokenizer.json filter=lfs diff=lfs merge=lfs -text
44
+ checkpoint-416/tokenizer.json filter=lfs diff=lfs merge=lfs -text
45
+ checkpoint-520/tokenizer.json filter=lfs diff=lfs merge=lfs -text
46
+ checkpoint-624/tokenizer.json filter=lfs diff=lfs merge=lfs -text
47
+ checkpoint-728/tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,16 +1,17 @@
1
  ---
2
  base_model: Qwen/Qwen2.5-7B-Instruct
3
  library_name: peft
4
- model_name: foamqwen
5
  tags:
6
  - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
  - lora
8
  - sft
9
  - transformers
10
  - trl
11
- licence: license
12
- pipeline_tag: text-generation
13
  ---
 
 
 
14
 
15
  # Model Card for foamqwen
16
 
 
1
  ---
2
  base_model: Qwen/Qwen2.5-7B-Instruct
3
  library_name: peft
4
+ pipeline_tag: text-generation
5
  tags:
6
  - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
  - lora
8
  - sft
9
  - transformers
10
  - trl
 
 
11
  ---
12
+ ### Framework versions
13
+
14
+ - PEFT 0.17.0
15
 
16
  # Model Card for foamqwen
17
 
adapter_config.json CHANGED
@@ -15,7 +15,7 @@
15
  "loftq_config": {},
16
  "lora_alpha": 16,
17
  "lora_bias": false,
18
- "lora_dropout": 0.05,
19
  "megatron_config": null,
20
  "megatron_core": "megatron.core",
21
  "modules_to_save": null,
@@ -25,13 +25,13 @@
25
  "rank_pattern": {},
26
  "revision": null,
27
  "target_modules": [
28
- "gate_proj",
29
- "o_proj",
30
- "up_proj",
31
  "v_proj",
32
  "k_proj",
 
33
  "down_proj",
34
- "q_proj"
 
 
35
  ],
36
  "target_parameters": null,
37
  "task_type": "CAUSAL_LM",
 
15
  "loftq_config": {},
16
  "lora_alpha": 16,
17
  "lora_bias": false,
18
+ "lora_dropout": 0.1,
19
  "megatron_config": null,
20
  "megatron_core": "megatron.core",
21
  "modules_to_save": null,
 
25
  "rank_pattern": {},
26
  "revision": null,
27
  "target_modules": [
 
 
 
28
  "v_proj",
29
  "k_proj",
30
+ "up_proj",
31
  "down_proj",
32
+ "q_proj",
33
+ "o_proj",
34
+ "gate_proj"
35
  ],
36
  "target_parameters": null,
37
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c931ff65e7868a951e93334044f8b054bc3d954b4b2b9bb6db80e00db07c4c8
3
  size 645975704
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5711c28d3d33aa09d94c62c9a136b8bf0f0cdbd49f90528dd206ab969f2ec3e
3
  size 645975704
checkpoint-104/adapter_config.json CHANGED
@@ -15,7 +15,7 @@
15
  "loftq_config": {},
16
  "lora_alpha": 16,
17
  "lora_bias": false,
18
- "lora_dropout": 0.05,
19
  "megatron_config": null,
20
  "megatron_core": "megatron.core",
21
  "modules_to_save": null,
@@ -25,13 +25,13 @@
25
  "rank_pattern": {},
26
  "revision": null,
27
  "target_modules": [
28
- "gate_proj",
 
29
  "o_proj",
30
- "up_proj",
31
  "v_proj",
32
- "k_proj",
33
- "down_proj",
34
- "q_proj"
35
  ],
36
  "target_parameters": null,
37
  "task_type": "CAUSAL_LM",
 
15
  "loftq_config": {},
16
  "lora_alpha": 16,
17
  "lora_bias": false,
18
+ "lora_dropout": 0.1,
19
  "megatron_config": null,
20
  "megatron_core": "megatron.core",
21
  "modules_to_save": null,
 
25
  "rank_pattern": {},
26
  "revision": null,
27
  "target_modules": [
28
+ "down_proj",
29
+ "k_proj",
30
  "o_proj",
31
+ "q_proj",
32
  "v_proj",
33
+ "up_proj",
34
+ "gate_proj"
 
35
  ],
36
  "target_parameters": null,
37
  "task_type": "CAUSAL_LM",
checkpoint-104/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bde6f6c34ecfc667be607f66c0be3608b265b13ddd5bec606209d49f6a9ad377
3
  size 645975704
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fc6cd409c955d35b5a6a620baedd9530a4c5f73f68bbe3082ddb660de6919d8
3
  size 645975704
checkpoint-104/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:58cc59bc26368fda46e4328622054e26e2a4e77ddf312428c353676f9e983a21
3
  size 1292087115
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5478a87659e3fd833f6e0be022f6cade6754457ac9844e658a304f95edb93418
3
  size 1292087115
checkpoint-104/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8514828f9d8aab559ae0f4b6b1989545764b4095d99270ac55d236f3a4ce2751
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b46d4a17bc33ca1534ecbb381b92bb917feb262b6cd6ea1e0aeef66ab3378a5
3
  size 15429
checkpoint-104/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c9baf0418fd508a533260a12df08c5a04eb2c254426d0e852be3e63d32fd9aa7
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8ea02c0f84cb7b79a7d01dc1ad12d59feb06a7206324175c7723c9e9e70ef38
3
  size 15429
checkpoint-104/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e074ab2936100c00e26b1c10f116f1537ba6440f80d4ea504962bf27db9f8936
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0510c2eb154e655092dfd2b66e653579331ca3559570cdc26dff724493936f08
3
  size 15429
checkpoint-104/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:604a1bcdea4bd8ceb2e9b35fa59830f94b6c7359f43a2d33f4f7f6c7e6284710
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04ef7e6fbd41972ea743427f306fa0e581982d06dd2fd9a83bc6f1f6e4371346
3
  size 15429
checkpoint-104/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:50de828614ef29ce7400a822ce49ac96711a9eb7d83ddde8521d2ecc0b064dfb
3
- size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7973bd6977b296bd79b6e5eab7d855b5c9117eea90fc9a6d871d376ce4ddb2d
3
+ size 1401
checkpoint-104/special_tokens_map.json CHANGED
@@ -21,5 +21,11 @@
21
  "rstrip": false,
22
  "single_word": false
23
  },
24
- "pad_token": "<|im_end|>"
 
 
 
 
 
 
25
  }
 
21
  "rstrip": false,
22
  "single_word": false
23
  },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
  }
checkpoint-104/tokenizer_config.json CHANGED
@@ -200,7 +200,7 @@
200
  "errors": "replace",
201
  "extra_special_tokens": {},
202
  "model_max_length": 131072,
203
- "pad_token": "<|im_end|>",
204
  "split_special_tokens": false,
205
  "tokenizer_class": "Qwen2Tokenizer",
206
  "unk_token": null
 
200
  "errors": "replace",
201
  "extra_special_tokens": {},
202
  "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
  "split_special_tokens": false,
205
  "tokenizer_class": "Qwen2Tokenizer",
206
  "unk_token": null
checkpoint-104/trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.0,
6
  "eval_steps": 500,
7
  "global_step": 104,
8
  "is_hyper_param_search": false,
@@ -10,118 +10,54 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.1927710843373494,
14
- "grad_norm": 95.26730346679688,
15
- "learning_rate": 0.0004180909090909091,
16
- "loss": 22.4465,
17
- "mean_token_accuracy": 0.08969678990542888,
18
- "num_tokens": 1310720.0,
19
- "step": 10
20
  },
21
  {
22
- "epoch": 0.3855421686746988,
23
- "grad_norm": 0.0,
24
- "learning_rate": 0.0005103526968014265,
25
- "loss": 0.8012,
26
- "mean_token_accuracy": 0.13804710581898688,
27
- "num_tokens": 2621440.0,
28
- "step": 20
29
- },
30
- {
31
- "epoch": 0.5783132530120482,
32
- "grad_norm": 0.0,
33
- "learning_rate": 0.0005077286477833616,
34
- "loss": 452.9577,
35
- "mean_token_accuracy": 0.05160275483503938,
36
- "num_tokens": 3932160.0,
37
- "step": 30
38
- },
39
- {
40
- "epoch": 0.7710843373493976,
41
- "grad_norm": 0.0,
42
- "learning_rate": 0.0005031081504278389,
43
- "loss": 470.5136,
44
- "mean_token_accuracy": 0.03822226445190609,
45
- "num_tokens": 5242880.0,
46
- "step": 40
47
- },
48
- {
49
- "epoch": 0.963855421686747,
50
- "grad_norm": 26.303752899169922,
51
- "learning_rate": 0.0004965277770447238,
52
- "loss": 167.1384,
53
- "mean_token_accuracy": 0.057517293840646744,
54
- "num_tokens": 6553600.0,
55
  "step": 50
56
  },
57
  {
58
- "epoch": 1.0,
59
- "eval_loss": NaN,
60
- "eval_mean_token_accuracy": 0.13677339731378757,
61
- "eval_num_tokens": 6713344.0,
62
- "eval_runtime": 8.9806,
63
- "eval_samples_per_second": 41.089,
64
- "eval_steps_per_second": 5.233,
65
- "step": 52
66
- },
67
- {
68
- "epoch": 1.1542168674698796,
69
- "grad_norm": 28.755094528198242,
70
- "learning_rate": 0.00048803961281790017,
71
- "loss": 27.9726,
72
- "mean_token_accuracy": 0.03075966710531259,
73
- "num_tokens": 7761920.0,
74
- "step": 60
75
- },
76
- {
77
- "epoch": 1.346987951807229,
78
- "grad_norm": 0.9693858027458191,
79
- "learning_rate": 0.000477710843538941,
80
- "loss": 2.2869,
81
- "mean_token_accuracy": 0.10747051909565926,
82
- "num_tokens": 9072640.0,
83
- "step": 70
84
- },
85
- {
86
- "epoch": 1.5397590361445783,
87
- "grad_norm": 0.36548200249671936,
88
- "learning_rate": 0.0004656232238159615,
89
- "loss": 60.0031,
90
- "mean_token_accuracy": 0.10124717205762863,
91
- "num_tokens": 10383360.0,
92
- "step": 80
93
  },
94
  {
95
- "epoch": 1.7325301204819277,
96
- "grad_norm": 0.8749092817306519,
97
- "learning_rate": 0.0004518724299669051,
98
- "loss": 0.8994,
99
- "mean_token_accuracy": 0.1551567144691944,
100
- "num_tokens": 11694080.0,
101
- "step": 90
102
- },
103
- {
104
- "epoch": 1.9253012048192772,
105
- "grad_norm": 0.0,
106
- "learning_rate": 0.0004365673027192623,
107
- "loss": 2.2759,
108
- "mean_token_accuracy": 0.13096993789076805,
109
- "num_tokens": 13004800.0,
110
  "step": 100
111
  },
112
  {
113
- "epoch": 2.0,
114
- "eval_loss": NaN,
115
- "eval_mean_token_accuracy": 0.14378934084100925,
116
- "eval_num_tokens": 13426688.0,
117
- "eval_runtime": 8.9689,
118
- "eval_samples_per_second": 41.142,
119
- "eval_steps_per_second": 5.24,
120
  "step": 104
121
  }
122
  ],
123
- "logging_steps": 10,
124
- "max_steps": 364,
125
  "num_input_tokens_seen": 0,
126
  "num_train_epochs": 7,
127
  "save_steps": 500,
@@ -137,7 +73,7 @@
137
  "attributes": {}
138
  }
139
  },
140
- "total_flos": 5.900837915658813e+17,
141
  "train_batch_size": 2,
142
  "trial_name": null,
143
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
  "eval_steps": 500,
7
  "global_step": 104,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.24096385542168675,
14
+ "grad_norm": 0.2236759215593338,
15
+ "learning_rate": 0.000511,
16
+ "loss": 0.4204,
17
+ "mean_token_accuracy": 0.900120057463646,
18
+ "num_tokens": 567991.0,
19
+ "step": 25
20
  },
21
  {
22
+ "epoch": 0.4819277108433735,
23
+ "grad_norm": 0.1322442889213562,
24
+ "learning_rate": 0.000511,
25
+ "loss": 0.2913,
26
+ "mean_token_accuracy": 0.9270081639289856,
27
+ "num_tokens": 1135343.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  "step": 50
29
  },
30
  {
31
+ "epoch": 0.7228915662650602,
32
+ "grad_norm": 0.19739408791065216,
33
+ "learning_rate": 0.000511,
34
+ "loss": 0.2186,
35
+ "mean_token_accuracy": 0.9418566429615021,
36
+ "num_tokens": 1703784.0,
37
+ "step": 75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  },
39
  {
40
+ "epoch": 0.963855421686747,
41
+ "grad_norm": 0.17215745151042938,
42
+ "learning_rate": 0.000511,
43
+ "loss": 0.1963,
44
+ "mean_token_accuracy": 0.9479192215204238,
45
+ "num_tokens": 2269891.0,
 
 
 
 
 
 
 
 
 
46
  "step": 100
47
  },
48
  {
49
+ "epoch": 1.0,
50
+ "eval_loss": 0.19681453704833984,
51
+ "eval_mean_token_accuracy": 0.9478744319144715,
52
+ "eval_num_tokens": 2345494.0,
53
+ "eval_runtime": 4.3187,
54
+ "eval_samples_per_second": 85.442,
55
+ "eval_steps_per_second": 10.883,
56
  "step": 104
57
  }
58
  ],
59
+ "logging_steps": 25,
60
+ "max_steps": 728,
61
  "num_input_tokens_seen": 0,
62
  "num_train_epochs": 7,
63
  "save_steps": 500,
 
73
  "attributes": {}
74
  }
75
  },
76
+ "total_flos": 1.0256509033879962e+17,
77
  "train_batch_size": 2,
78
  "trial_name": null,
79
  "trial_params": null
checkpoint-104/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb24a2e21c60358ea1e7c21423d18ace82a12d90960084fd56c8a7388a71c974
3
  size 6097
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a92f33d5ca39ba292c6b171cffeb00f4d1c361214bebb8604f8ce3482d3b7c8c
3
  size 6097
checkpoint-208/adapter_config.json CHANGED
@@ -15,7 +15,7 @@
15
  "loftq_config": {},
16
  "lora_alpha": 16,
17
  "lora_bias": false,
18
- "lora_dropout": 0.05,
19
  "megatron_config": null,
20
  "megatron_core": "megatron.core",
21
  "modules_to_save": null,
@@ -25,13 +25,13 @@
25
  "rank_pattern": {},
26
  "revision": null,
27
  "target_modules": [
28
- "gate_proj",
 
29
  "o_proj",
30
- "up_proj",
31
  "v_proj",
32
- "k_proj",
33
- "down_proj",
34
- "q_proj"
35
  ],
36
  "target_parameters": null,
37
  "task_type": "CAUSAL_LM",
 
15
  "loftq_config": {},
16
  "lora_alpha": 16,
17
  "lora_bias": false,
18
+ "lora_dropout": 0.1,
19
  "megatron_config": null,
20
  "megatron_core": "megatron.core",
21
  "modules_to_save": null,
 
25
  "rank_pattern": {},
26
  "revision": null,
27
  "target_modules": [
28
+ "down_proj",
29
+ "k_proj",
30
  "o_proj",
31
+ "q_proj",
32
  "v_proj",
33
+ "up_proj",
34
+ "gate_proj"
 
35
  ],
36
  "target_parameters": null,
37
  "task_type": "CAUSAL_LM",
checkpoint-208/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b0ac19c8de28684631474a0ad72407294c096a0f549e773c3d1fa82a19be6276
3
  size 645975704
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bf478a6f256502ea7fff6dbca497e8460a17d13004420d85853719e2329b272
3
  size 645975704
checkpoint-208/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cb5932db4edcbb15546d9f3cab8a3cbbcac452bef68d11ef0b0a0ed08a4fa764
3
  size 1292087115
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:815bd40667d86d5a3beced54254a4fdff5e07d5682c6a4b1907679b709d56d9d
3
  size 1292087115
checkpoint-208/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7839e9b785d9556bc2a3ce31deff96475c6acae542ae5b9b51fbaaafd3b4a372
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3adf91ff8bafb6d2e3300a7c332f71e91ce8b3ec728f0e2aab37908de663b1b8
3
  size 15429
checkpoint-208/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0976d571e1588d4916ac2d24c7fb1989d36b6419adbca862ff01a6c30d4d6007
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c70dde40156bddc38880631183ca59dc710551eb7a7733ad9d585cb374e86b3
3
  size 15429
checkpoint-208/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a5b0b2b4ce9a1d4eee257302169deb7cf87f1a6dd6ab912fd5f7706ebaacde81
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04aca530bed491901589d495872e054e18dea79299a5f18f260913d260faa876
3
  size 15429
checkpoint-208/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a6c367e71d78a211df461b18e7fd6a7dc0cbbb7f2f9a71f69b92b578bbf8b510
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46b68311a8f36f1ffecea2c67c06bb30acb6b2d0c53572628d4d32cf4d54e271
3
  size 15429
checkpoint-208/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:408b45ea3dc39af31cb89d47706eff24014968d56252734cb0fd6c20d70c755a
3
- size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b24bf8a41c3e3c688a38ba117e3127352bafa556de7e01cc189f2855569e6d7a
3
+ size 1401
checkpoint-208/special_tokens_map.json CHANGED
@@ -21,5 +21,11 @@
21
  "rstrip": false,
22
  "single_word": false
23
  },
24
- "pad_token": "<|im_end|>"
 
 
 
 
 
 
25
  }
 
21
  "rstrip": false,
22
  "single_word": false
23
  },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
  }
checkpoint-208/tokenizer_config.json CHANGED
@@ -200,7 +200,7 @@
200
  "errors": "replace",
201
  "extra_special_tokens": {},
202
  "model_max_length": 131072,
203
- "pad_token": "<|im_end|>",
204
  "split_special_tokens": false,
205
  "tokenizer_class": "Qwen2Tokenizer",
206
  "unk_token": null
 
200
  "errors": "replace",
201
  "extra_special_tokens": {},
202
  "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
  "split_special_tokens": false,
205
  "tokenizer_class": "Qwen2Tokenizer",
206
  "unk_token": null
checkpoint-208/trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 4.0,
6
  "eval_steps": 500,
7
  "global_step": 208,
8
  "is_hyper_param_search": false,
@@ -10,228 +10,100 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.1927710843373494,
14
- "grad_norm": 95.26730346679688,
15
- "learning_rate": 0.0004180909090909091,
16
- "loss": 22.4465,
17
- "mean_token_accuracy": 0.08969678990542888,
18
- "num_tokens": 1310720.0,
19
- "step": 10
20
- },
21
- {
22
- "epoch": 0.3855421686746988,
23
- "grad_norm": 0.0,
24
- "learning_rate": 0.0005103526968014265,
25
- "loss": 0.8012,
26
- "mean_token_accuracy": 0.13804710581898688,
27
- "num_tokens": 2621440.0,
28
- "step": 20
29
- },
30
- {
31
- "epoch": 0.5783132530120482,
32
- "grad_norm": 0.0,
33
- "learning_rate": 0.0005077286477833616,
34
- "loss": 452.9577,
35
- "mean_token_accuracy": 0.05160275483503938,
36
- "num_tokens": 3932160.0,
37
- "step": 30
38
- },
39
- {
40
- "epoch": 0.7710843373493976,
41
- "grad_norm": 0.0,
42
- "learning_rate": 0.0005031081504278389,
43
- "loss": 470.5136,
44
- "mean_token_accuracy": 0.03822226445190609,
45
- "num_tokens": 5242880.0,
46
- "step": 40
47
- },
48
- {
49
- "epoch": 0.963855421686747,
50
- "grad_norm": 26.303752899169922,
51
- "learning_rate": 0.0004965277770447238,
52
- "loss": 167.1384,
53
- "mean_token_accuracy": 0.057517293840646744,
54
- "num_tokens": 6553600.0,
55
  "step": 50
56
  },
57
  {
58
- "epoch": 1.0,
59
- "eval_loss": NaN,
60
- "eval_mean_token_accuracy": 0.13677339731378757,
61
- "eval_num_tokens": 6713344.0,
62
- "eval_runtime": 8.9806,
63
- "eval_samples_per_second": 41.089,
64
- "eval_steps_per_second": 5.233,
65
- "step": 52
66
  },
67
  {
68
- "epoch": 1.1542168674698796,
69
- "grad_norm": 28.755094528198242,
70
- "learning_rate": 0.00048803961281790017,
71
- "loss": 27.9726,
72
- "mean_token_accuracy": 0.03075966710531259,
73
- "num_tokens": 7761920.0,
74
- "step": 60
75
- },
76
- {
77
- "epoch": 1.346987951807229,
78
- "grad_norm": 0.9693858027458191,
79
- "learning_rate": 0.000477710843538941,
80
- "loss": 2.2869,
81
- "mean_token_accuracy": 0.10747051909565926,
82
- "num_tokens": 9072640.0,
83
- "step": 70
84
- },
85
- {
86
- "epoch": 1.5397590361445783,
87
- "grad_norm": 0.36548200249671936,
88
- "learning_rate": 0.0004656232238159615,
89
- "loss": 60.0031,
90
- "mean_token_accuracy": 0.10124717205762863,
91
- "num_tokens": 10383360.0,
92
- "step": 80
93
- },
94
- {
95
- "epoch": 1.7325301204819277,
96
- "grad_norm": 0.8749092817306519,
97
- "learning_rate": 0.0004518724299669051,
98
- "loss": 0.8994,
99
- "mean_token_accuracy": 0.1551567144691944,
100
- "num_tokens": 11694080.0,
101
- "step": 90
102
- },
103
- {
104
- "epoch": 1.9253012048192772,
105
- "grad_norm": 0.0,
106
- "learning_rate": 0.0004365673027192623,
107
- "loss": 2.2759,
108
- "mean_token_accuracy": 0.13096993789076805,
109
- "num_tokens": 13004800.0,
110
  "step": 100
111
  },
112
  {
113
- "epoch": 2.0,
114
- "eval_loss": NaN,
115
- "eval_mean_token_accuracy": 0.14378934084100925,
116
- "eval_num_tokens": 13426688.0,
117
- "eval_runtime": 8.9689,
118
- "eval_samples_per_second": 41.142,
119
- "eval_steps_per_second": 5.24,
120
  "step": 104
121
  },
122
  {
123
- "epoch": 2.1156626506024097,
124
- "grad_norm": 0.6895984411239624,
125
- "learning_rate": 0.0004198289857104298,
126
- "loss": 0.2064,
127
- "mean_token_accuracy": 0.08719592305678356,
128
- "num_tokens": 14213120.0,
129
- "step": 110
130
- },
131
- {
132
- "epoch": 2.3084337349397592,
133
- "grad_norm": 0.7038294672966003,
134
- "learning_rate": 0.0004017899666076801,
135
- "loss": 1.3155,
136
- "mean_token_accuracy": 0.1053241491317749,
137
- "num_tokens": 15523840.0,
138
- "step": 120
139
  },
140
  {
141
- "epoch": 2.5012048192771084,
142
- "grad_norm": 27.594745635986328,
143
- "learning_rate": 0.0003825930284374996,
144
- "loss": 0.0836,
145
- "mean_token_accuracy": 0.07201291918754578,
146
- "num_tokens": 16834560.0,
147
- "step": 130
148
- },
149
- {
150
- "epoch": 2.693975903614458,
151
- "grad_norm": 0.0,
152
- "learning_rate": 0.00036239011942476655,
153
- "loss": 1.364,
154
- "mean_token_accuracy": 0.15817394778132438,
155
- "num_tokens": 18145280.0,
156
- "step": 140
157
- },
158
- {
159
- "epoch": 2.886746987951807,
160
- "grad_norm": 0.0,
161
- "learning_rate": 0.00034134115028725524,
162
- "loss": 3.5977,
163
- "mean_token_accuracy": 0.10589548945426941,
164
- "num_tokens": 19456000.0,
165
  "step": 150
166
  },
167
  {
168
- "epoch": 3.0,
169
- "eval_loss": NaN,
170
- "eval_mean_token_accuracy": 0.14411297884393245,
171
- "eval_num_tokens": 20140032.0,
172
- "eval_runtime": 8.9831,
173
- "eval_samples_per_second": 41.077,
174
- "eval_steps_per_second": 5.232,
175
- "step": 156
176
  },
177
  {
178
- "epoch": 3.07710843373494,
179
- "grad_norm": 0.1902359277009964,
180
- "learning_rate": 0.0003196127285051592,
181
- "loss": 8.9424,
182
- "mean_token_accuracy": 0.062061098557484304,
183
- "num_tokens": 20664320.0,
184
- "step": 160
185
- },
186
- {
187
- "epoch": 3.269879518072289,
188
- "grad_norm": 0.3390277326107025,
189
- "learning_rate": 0.00029737683958418377,
190
- "loss": 12.1371,
191
- "mean_token_accuracy": 0.07337962239980697,
192
- "num_tokens": 21975040.0,
193
- "step": 170
194
- },
195
- {
196
- "epoch": 3.4626506024096386,
197
- "grad_norm": 0.0,
198
- "learning_rate": 0.00027480948575031854,
199
- "loss": 42.6417,
200
- "mean_token_accuracy": 0.08556168600916862,
201
- "num_tokens": 23285760.0,
202
- "step": 180
203
- },
204
- {
205
- "epoch": 3.6554216867469878,
206
- "grad_norm": 0.0,
207
- "learning_rate": 0.0002520892928513346,
208
- "loss": 1.5423,
209
- "mean_token_accuracy": 0.13269576877355577,
210
- "num_tokens": 24596480.0,
211
- "step": 190
212
- },
213
- {
214
- "epoch": 3.8481927710843373,
215
- "grad_norm": 0.19443857669830322,
216
- "learning_rate": 0.0002293960964917063,
217
- "loss": 0.3356,
218
- "mean_token_accuracy": 0.17251307517290115,
219
- "num_tokens": 25907200.0,
220
  "step": 200
221
  },
222
  {
223
- "epoch": 4.0,
224
- "eval_loss": NaN,
225
- "eval_mean_token_accuracy": 0.14541077867467353,
226
- "eval_num_tokens": 26853376.0,
227
- "eval_runtime": 8.9721,
228
- "eval_samples_per_second": 41.127,
229
- "eval_steps_per_second": 5.238,
230
  "step": 208
231
  }
232
  ],
233
- "logging_steps": 10,
234
- "max_steps": 364,
235
  "num_input_tokens_seen": 0,
236
  "num_train_epochs": 7,
237
  "save_steps": 500,
@@ -247,7 +119,7 @@
247
  "attributes": {}
248
  }
249
  },
250
- "total_flos": 1.1801675852792463e+18,
251
  "train_batch_size": 2,
252
  "trial_name": null,
253
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
  "eval_steps": 500,
7
  "global_step": 208,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.24096385542168675,
14
+ "grad_norm": 0.2236759215593338,
15
+ "learning_rate": 0.000511,
16
+ "loss": 0.4204,
17
+ "mean_token_accuracy": 0.900120057463646,
18
+ "num_tokens": 567991.0,
19
+ "step": 25
20
+ },
21
+ {
22
+ "epoch": 0.4819277108433735,
23
+ "grad_norm": 0.1322442889213562,
24
+ "learning_rate": 0.000511,
25
+ "loss": 0.2913,
26
+ "mean_token_accuracy": 0.9270081639289856,
27
+ "num_tokens": 1135343.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  "step": 50
29
  },
30
  {
31
+ "epoch": 0.7228915662650602,
32
+ "grad_norm": 0.19739408791065216,
33
+ "learning_rate": 0.000511,
34
+ "loss": 0.2186,
35
+ "mean_token_accuracy": 0.9418566429615021,
36
+ "num_tokens": 1703784.0,
37
+ "step": 75
 
38
  },
39
  {
40
+ "epoch": 0.963855421686747,
41
+ "grad_norm": 0.17215745151042938,
42
+ "learning_rate": 0.000511,
43
+ "loss": 0.1963,
44
+ "mean_token_accuracy": 0.9479192215204238,
45
+ "num_tokens": 2269891.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  "step": 100
47
  },
48
  {
49
+ "epoch": 1.0,
50
+ "eval_loss": 0.19681453704833984,
51
+ "eval_mean_token_accuracy": 0.9478744319144715,
52
+ "eval_num_tokens": 2345494.0,
53
+ "eval_runtime": 4.3187,
54
+ "eval_samples_per_second": 85.442,
55
+ "eval_steps_per_second": 10.883,
56
  "step": 104
57
  },
58
  {
59
+ "epoch": 1.202409638554217,
60
+ "grad_norm": 0.10346771776676178,
61
+ "learning_rate": 0.000511,
62
+ "loss": 0.165,
63
+ "mean_token_accuracy": 0.9550067053900825,
64
+ "num_tokens": 2836234.0,
65
+ "step": 125
 
 
 
 
 
 
 
 
 
66
  },
67
  {
68
+ "epoch": 1.4433734939759035,
69
+ "grad_norm": 0.0941459909081459,
70
+ "learning_rate": 0.000511,
71
+ "loss": 0.1445,
72
+ "mean_token_accuracy": 0.9606501096487046,
73
+ "num_tokens": 3403671.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  "step": 150
75
  },
76
  {
77
+ "epoch": 1.6843373493975904,
78
+ "grad_norm": 0.07419874519109726,
79
+ "learning_rate": 0.000511,
80
+ "loss": 0.1184,
81
+ "mean_token_accuracy": 0.9665295648574829,
82
+ "num_tokens": 3972278.0,
83
+ "step": 175
 
84
  },
85
  {
86
+ "epoch": 1.9253012048192772,
87
+ "grad_norm": 0.08383649587631226,
88
+ "learning_rate": 0.000511,
89
+ "loss": 0.1309,
90
+ "mean_token_accuracy": 0.9640201306343079,
91
+ "num_tokens": 4538970.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  "step": 200
93
  },
94
  {
95
+ "epoch": 2.0,
96
+ "eval_loss": 0.16037927567958832,
97
+ "eval_mean_token_accuracy": 0.9557588328706458,
98
+ "eval_num_tokens": 4690728.0,
99
+ "eval_runtime": 4.2478,
100
+ "eval_samples_per_second": 86.868,
101
+ "eval_steps_per_second": 11.065,
102
  "step": 208
103
  }
104
  ],
105
+ "logging_steps": 25,
106
+ "max_steps": 728,
107
  "num_input_tokens_seen": 0,
108
  "num_train_epochs": 7,
109
  "save_steps": 500,
 
119
  "attributes": {}
120
  }
121
  },
122
+ "total_flos": 2.0514658423144448e+17,
123
  "train_batch_size": 2,
124
  "trial_name": null,
125
  "trial_params": null
checkpoint-208/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb24a2e21c60358ea1e7c21423d18ace82a12d90960084fd56c8a7388a71c974
3
  size 6097
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a92f33d5ca39ba292c6b171cffeb00f4d1c361214bebb8604f8ce3482d3b7c8c
3
  size 6097
checkpoint-312/adapter_config.json CHANGED
@@ -15,7 +15,7 @@
15
  "loftq_config": {},
16
  "lora_alpha": 16,
17
  "lora_bias": false,
18
- "lora_dropout": 0.05,
19
  "megatron_config": null,
20
  "megatron_core": "megatron.core",
21
  "modules_to_save": null,
@@ -25,13 +25,13 @@
25
  "rank_pattern": {},
26
  "revision": null,
27
  "target_modules": [
28
- "gate_proj",
 
29
  "o_proj",
30
- "up_proj",
31
  "v_proj",
32
- "k_proj",
33
- "down_proj",
34
- "q_proj"
35
  ],
36
  "target_parameters": null,
37
  "task_type": "CAUSAL_LM",
 
15
  "loftq_config": {},
16
  "lora_alpha": 16,
17
  "lora_bias": false,
18
+ "lora_dropout": 0.1,
19
  "megatron_config": null,
20
  "megatron_core": "megatron.core",
21
  "modules_to_save": null,
 
25
  "rank_pattern": {},
26
  "revision": null,
27
  "target_modules": [
28
+ "down_proj",
29
+ "k_proj",
30
  "o_proj",
31
+ "q_proj",
32
  "v_proj",
33
+ "up_proj",
34
+ "gate_proj"
 
35
  ],
36
  "target_parameters": null,
37
  "task_type": "CAUSAL_LM",
checkpoint-312/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:553445b0bc853e965e43188ff4f3af2066675d8b2aa535b50f463ce82e72c5ce
3
  size 645975704
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:062140058de69da2ff74416b96e510ff3ea8e3630e3cfa2414ae1fa5bed530bd
3
  size 645975704
checkpoint-312/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d86273ba53204d574c3c13f3700de47e0480acbb2994ae06d3fd7b4d44cc1a05
3
  size 1292087499
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a282343663fb90d99583879419e008ecf5ff31aa87f4664cfb11cd42543b327a
3
  size 1292087499
checkpoint-312/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:503880c3e18b0a0a7d070feaf37d865b85b4c8cbe4833f2dc572248b7556301e
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60094a06d6d79464dba44020816cd1c2f7e2a5da0bd09c1e533ad3eddb688564
3
  size 15429
checkpoint-312/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f75a725dd878216d7b7084e23c639cb7108c692d446c5a9195ffea137d301dbc
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:785ee1730140ccaba6453ba94a5a713f346a9c29e9b86ce8e7c83f6634525222
3
  size 15429
checkpoint-312/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e276e3be8d04824150633c0ebed694dcf7871c57f168fa21f75f97b608dff4ad
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8fc115052040f16323733a9ece8dd57daa47ec295a6c498facac0b395731b471
3
  size 15429
checkpoint-312/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:acf5a9a218be990f3c3d05fb0163857515aa84ba415c75d204f587b23c31dbc2
3
  size 15429
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49dc588331596d74bb4f1f27781ca80a1dfff453105267c466abff7513f86cff
3
  size 15429
checkpoint-312/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:88c1f19fbaac09a7b01b826d2a3eb05434d8b50a36c13d838feee781e2642515
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f4e419a1d6b526779af1dd5f4f57538634cc30d6affb1f8eceaed3cbe949aa4
3
  size 1465
checkpoint-312/special_tokens_map.json CHANGED
@@ -21,5 +21,11 @@
21
  "rstrip": false,
22
  "single_word": false
23
  },
24
- "pad_token": "<|im_end|>"
 
 
 
 
 
 
25
  }
 
21
  "rstrip": false,
22
  "single_word": false
23
  },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
  }
checkpoint-312/tokenizer_config.json CHANGED
@@ -200,7 +200,7 @@
200
  "errors": "replace",
201
  "extra_special_tokens": {},
202
  "model_max_length": 131072,
203
- "pad_token": "<|im_end|>",
204
  "split_special_tokens": false,
205
  "tokenizer_class": "Qwen2Tokenizer",
206
  "unk_token": null
 
200
  "errors": "replace",
201
  "extra_special_tokens": {},
202
  "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
  "split_special_tokens": false,
205
  "tokenizer_class": "Qwen2Tokenizer",
206
  "unk_token": null
checkpoint-312/trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 6.0,
6
  "eval_steps": 500,
7
  "global_step": 312,
8
  "is_hyper_param_search": false,
@@ -10,347 +10,146 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.1927710843373494,
14
- "grad_norm": 95.26730346679688,
15
- "learning_rate": 0.0004180909090909091,
16
- "loss": 22.4465,
17
- "mean_token_accuracy": 0.08969678990542888,
18
- "num_tokens": 1310720.0,
19
- "step": 10
20
- },
21
- {
22
- "epoch": 0.3855421686746988,
23
- "grad_norm": 0.0,
24
- "learning_rate": 0.0005103526968014265,
25
- "loss": 0.8012,
26
- "mean_token_accuracy": 0.13804710581898688,
27
- "num_tokens": 2621440.0,
28
- "step": 20
29
- },
30
- {
31
- "epoch": 0.5783132530120482,
32
- "grad_norm": 0.0,
33
- "learning_rate": 0.0005077286477833616,
34
- "loss": 452.9577,
35
- "mean_token_accuracy": 0.05160275483503938,
36
- "num_tokens": 3932160.0,
37
- "step": 30
38
- },
39
- {
40
- "epoch": 0.7710843373493976,
41
- "grad_norm": 0.0,
42
- "learning_rate": 0.0005031081504278389,
43
- "loss": 470.5136,
44
- "mean_token_accuracy": 0.03822226445190609,
45
- "num_tokens": 5242880.0,
46
- "step": 40
47
- },
48
- {
49
- "epoch": 0.963855421686747,
50
- "grad_norm": 26.303752899169922,
51
- "learning_rate": 0.0004965277770447238,
52
- "loss": 167.1384,
53
- "mean_token_accuracy": 0.057517293840646744,
54
- "num_tokens": 6553600.0,
55
  "step": 50
56
  },
57
  {
58
- "epoch": 1.0,
59
- "eval_loss": NaN,
60
- "eval_mean_token_accuracy": 0.13677339731378757,
61
- "eval_num_tokens": 6713344.0,
62
- "eval_runtime": 8.9806,
63
- "eval_samples_per_second": 41.089,
64
- "eval_steps_per_second": 5.233,
65
- "step": 52
66
- },
67
- {
68
- "epoch": 1.1542168674698796,
69
- "grad_norm": 28.755094528198242,
70
- "learning_rate": 0.00048803961281790017,
71
- "loss": 27.9726,
72
- "mean_token_accuracy": 0.03075966710531259,
73
- "num_tokens": 7761920.0,
74
- "step": 60
75
  },
76
  {
77
- "epoch": 1.346987951807229,
78
- "grad_norm": 0.9693858027458191,
79
- "learning_rate": 0.000477710843538941,
80
- "loss": 2.2869,
81
- "mean_token_accuracy": 0.10747051909565926,
82
- "num_tokens": 9072640.0,
83
- "step": 70
84
- },
85
- {
86
- "epoch": 1.5397590361445783,
87
- "grad_norm": 0.36548200249671936,
88
- "learning_rate": 0.0004656232238159615,
89
- "loss": 60.0031,
90
- "mean_token_accuracy": 0.10124717205762863,
91
- "num_tokens": 10383360.0,
92
- "step": 80
93
- },
94
- {
95
- "epoch": 1.7325301204819277,
96
- "grad_norm": 0.8749092817306519,
97
- "learning_rate": 0.0004518724299669051,
98
- "loss": 0.8994,
99
- "mean_token_accuracy": 0.1551567144691944,
100
- "num_tokens": 11694080.0,
101
- "step": 90
102
- },
103
- {
104
- "epoch": 1.9253012048192772,
105
- "grad_norm": 0.0,
106
- "learning_rate": 0.0004365673027192623,
107
- "loss": 2.2759,
108
- "mean_token_accuracy": 0.13096993789076805,
109
- "num_tokens": 13004800.0,
110
  "step": 100
111
  },
112
  {
113
- "epoch": 2.0,
114
- "eval_loss": NaN,
115
- "eval_mean_token_accuracy": 0.14378934084100925,
116
- "eval_num_tokens": 13426688.0,
117
- "eval_runtime": 8.9689,
118
- "eval_samples_per_second": 41.142,
119
- "eval_steps_per_second": 5.24,
120
  "step": 104
121
  },
122
  {
123
- "epoch": 2.1156626506024097,
124
- "grad_norm": 0.6895984411239624,
125
- "learning_rate": 0.0004198289857104298,
126
- "loss": 0.2064,
127
- "mean_token_accuracy": 0.08719592305678356,
128
- "num_tokens": 14213120.0,
129
- "step": 110
130
  },
131
  {
132
- "epoch": 2.3084337349397592,
133
- "grad_norm": 0.7038294672966003,
134
- "learning_rate": 0.0004017899666076801,
135
- "loss": 1.3155,
136
- "mean_token_accuracy": 0.1053241491317749,
137
- "num_tokens": 15523840.0,
138
- "step": 120
139
- },
140
- {
141
- "epoch": 2.5012048192771084,
142
- "grad_norm": 27.594745635986328,
143
- "learning_rate": 0.0003825930284374996,
144
- "loss": 0.0836,
145
- "mean_token_accuracy": 0.07201291918754578,
146
- "num_tokens": 16834560.0,
147
- "step": 130
148
- },
149
- {
150
- "epoch": 2.693975903614458,
151
- "grad_norm": 0.0,
152
- "learning_rate": 0.00036239011942476655,
153
- "loss": 1.364,
154
- "mean_token_accuracy": 0.15817394778132438,
155
- "num_tokens": 18145280.0,
156
- "step": 140
157
- },
158
- {
159
- "epoch": 2.886746987951807,
160
- "grad_norm": 0.0,
161
- "learning_rate": 0.00034134115028725524,
162
- "loss": 3.5977,
163
- "mean_token_accuracy": 0.10589548945426941,
164
- "num_tokens": 19456000.0,
165
  "step": 150
166
  },
167
  {
168
- "epoch": 3.0,
169
- "eval_loss": NaN,
170
- "eval_mean_token_accuracy": 0.14411297884393245,
171
- "eval_num_tokens": 20140032.0,
172
- "eval_runtime": 8.9831,
173
- "eval_samples_per_second": 41.077,
174
- "eval_steps_per_second": 5.232,
175
- "step": 156
176
- },
177
- {
178
- "epoch": 3.07710843373494,
179
- "grad_norm": 0.1902359277009964,
180
- "learning_rate": 0.0003196127285051592,
181
- "loss": 8.9424,
182
- "mean_token_accuracy": 0.062061098557484304,
183
- "num_tokens": 20664320.0,
184
- "step": 160
185
- },
186
- {
187
- "epoch": 3.269879518072289,
188
- "grad_norm": 0.3390277326107025,
189
- "learning_rate": 0.00029737683958418377,
190
- "loss": 12.1371,
191
- "mean_token_accuracy": 0.07337962239980697,
192
- "num_tokens": 21975040.0,
193
- "step": 170
194
- },
195
- {
196
- "epoch": 3.4626506024096386,
197
- "grad_norm": 0.0,
198
- "learning_rate": 0.00027480948575031854,
199
- "loss": 42.6417,
200
- "mean_token_accuracy": 0.08556168600916862,
201
- "num_tokens": 23285760.0,
202
- "step": 180
203
  },
204
  {
205
- "epoch": 3.6554216867469878,
206
- "grad_norm": 0.0,
207
- "learning_rate": 0.0002520892928513346,
208
- "loss": 1.5423,
209
- "mean_token_accuracy": 0.13269576877355577,
210
- "num_tokens": 24596480.0,
211
- "step": 190
212
- },
213
- {
214
- "epoch": 3.8481927710843373,
215
- "grad_norm": 0.19443857669830322,
216
- "learning_rate": 0.0002293960964917063,
217
- "loss": 0.3356,
218
- "mean_token_accuracy": 0.17251307517290115,
219
- "num_tokens": 25907200.0,
220
  "step": 200
221
  },
222
  {
223
- "epoch": 4.0,
224
- "eval_loss": NaN,
225
- "eval_mean_token_accuracy": 0.14541077867467353,
226
- "eval_num_tokens": 26853376.0,
227
- "eval_runtime": 8.9721,
228
- "eval_samples_per_second": 41.127,
229
- "eval_steps_per_second": 5.238,
230
  "step": 208
231
  },
232
  {
233
- "epoch": 4.03855421686747,
234
- "grad_norm": 0.009991697035729885,
235
- "learning_rate": 0.00020690951859202796,
236
- "loss": 0.4818,
237
- "mean_token_accuracy": 0.16193881593173062,
238
- "num_tokens": 27115520.0,
239
- "step": 210
240
- },
241
- {
242
- "epoch": 4.231325301204819,
243
- "grad_norm": 0.0,
244
- "learning_rate": 0.0001848075456397883,
245
- "loss": 0.3988,
246
- "mean_token_accuracy": 0.12297056466341019,
247
- "num_tokens": 28426240.0,
248
- "step": 220
249
- },
250
- {
251
- "epoch": 4.424096385542168,
252
- "grad_norm": 0.007613173220306635,
253
- "learning_rate": 0.00016326511988497662,
254
- "loss": 0.0274,
255
- "mean_token_accuracy": 0.11160993352532386,
256
- "num_tokens": 29736960.0,
257
- "step": 230
258
  },
259
  {
260
- "epoch": 4.6168674698795185,
261
- "grad_norm": 0.0015245635295286775,
262
- "learning_rate": 0.0001424527546315377,
263
- "loss": 2.8338,
264
- "mean_token_accuracy": 0.06221988797187805,
265
- "num_tokens": 31047680.0,
266
- "step": 240
267
- },
268
- {
269
- "epoch": 4.809638554216868,
270
- "grad_norm": 0.26247891783714294,
271
- "learning_rate": 0.00012253518458496144,
272
- "loss": 0.2528,
273
- "mean_token_accuracy": 0.09861943274736404,
274
- "num_tokens": 32358400.0,
275
  "step": 250
276
  },
277
  {
278
- "epoch": 5.0,
279
- "grad_norm": 0.00404323311522603,
280
- "learning_rate": 0.00010367006193882396,
281
- "loss": 0.2094,
282
- "mean_token_accuracy": 0.13716269520264637,
283
- "num_tokens": 33566720.0,
284
- "step": 260
285
- },
286
- {
287
- "epoch": 5.0,
288
- "eval_loss": NaN,
289
- "eval_mean_token_accuracy": 0.14629162118789998,
290
- "eval_num_tokens": 33566720.0,
291
- "eval_runtime": 9.0068,
292
- "eval_samples_per_second": 40.969,
293
- "eval_steps_per_second": 5.218,
294
- "step": 260
295
- },
296
- {
297
- "epoch": 5.192771084337349,
298
- "grad_norm": 0.05429690331220627,
299
- "learning_rate": 8.600670852105292e-05,
300
- "loss": 1.0623,
301
- "mean_token_accuracy": 0.09910124614834785,
302
- "num_tokens": 34877440.0,
303
- "step": 270
304
- },
305
- {
306
- "epoch": 5.385542168674699,
307
- "grad_norm": 0.0,
308
- "learning_rate": 6.968493387697466e-05,
309
- "loss": 1.6628,
310
- "mean_token_accuracy": 0.12442896366119385,
311
- "num_tokens": 36188160.0,
312
- "step": 280
313
- },
314
- {
315
- "epoch": 5.578313253012048,
316
- "grad_norm": 0.01778862252831459,
317
- "learning_rate": 5.483392864428595e-05,
318
- "loss": 0.027,
319
- "mean_token_accuracy": 0.14972642660140992,
320
- "num_tokens": 37498880.0,
321
- "step": 290
322
  },
323
  {
324
- "epoch": 5.771084337349397,
325
- "grad_norm": 0.045461323112249374,
326
- "learning_rate": 4.1571241979147114e-05,
327
- "loss": 2.1638,
328
- "mean_token_accuracy": 0.07477418482303619,
329
- "num_tokens": 38809600.0,
330
  "step": 300
331
  },
332
  {
333
- "epoch": 5.9638554216867465,
334
- "grad_norm": 0.0,
335
- "learning_rate": 3.000185112730528e-05,
336
- "loss": 0.4219,
337
- "mean_token_accuracy": 0.09914562478661537,
338
- "num_tokens": 40120320.0,
339
- "step": 310
340
- },
341
- {
342
- "epoch": 6.0,
343
- "eval_loss": NaN,
344
- "eval_mean_token_accuracy": 0.14657345097115698,
345
- "eval_num_tokens": 40280064.0,
346
- "eval_runtime": 8.9922,
347
- "eval_samples_per_second": 41.036,
348
- "eval_steps_per_second": 5.227,
349
  "step": 312
350
  }
351
  ],
352
- "logging_steps": 10,
353
- "max_steps": 364,
354
  "num_input_tokens_seen": 0,
355
  "num_train_epochs": 7,
356
  "save_steps": 500,
@@ -366,7 +165,7 @@
366
  "attributes": {}
367
  }
368
  },
369
- "total_flos": 1.7702513789926113e+18,
370
  "train_batch_size": 2,
371
  "trial_name": null,
372
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
  "eval_steps": 500,
7
  "global_step": 312,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.24096385542168675,
14
+ "grad_norm": 0.2236759215593338,
15
+ "learning_rate": 0.000511,
16
+ "loss": 0.4204,
17
+ "mean_token_accuracy": 0.900120057463646,
18
+ "num_tokens": 567991.0,
19
+ "step": 25
20
+ },
21
+ {
22
+ "epoch": 0.4819277108433735,
23
+ "grad_norm": 0.1322442889213562,
24
+ "learning_rate": 0.000511,
25
+ "loss": 0.2913,
26
+ "mean_token_accuracy": 0.9270081639289856,
27
+ "num_tokens": 1135343.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  "step": 50
29
  },
30
  {
31
+ "epoch": 0.7228915662650602,
32
+ "grad_norm": 0.19739408791065216,
33
+ "learning_rate": 0.000511,
34
+ "loss": 0.2186,
35
+ "mean_token_accuracy": 0.9418566429615021,
36
+ "num_tokens": 1703784.0,
37
+ "step": 75
 
 
 
 
 
 
 
 
 
 
38
  },
39
  {
40
+ "epoch": 0.963855421686747,
41
+ "grad_norm": 0.17215745151042938,
42
+ "learning_rate": 0.000511,
43
+ "loss": 0.1963,
44
+ "mean_token_accuracy": 0.9479192215204238,
45
+ "num_tokens": 2269891.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  "step": 100
47
  },
48
  {
49
+ "epoch": 1.0,
50
+ "eval_loss": 0.19681453704833984,
51
+ "eval_mean_token_accuracy": 0.9478744319144715,
52
+ "eval_num_tokens": 2345494.0,
53
+ "eval_runtime": 4.3187,
54
+ "eval_samples_per_second": 85.442,
55
+ "eval_steps_per_second": 10.883,
56
  "step": 104
57
  },
58
  {
59
+ "epoch": 1.202409638554217,
60
+ "grad_norm": 0.10346771776676178,
61
+ "learning_rate": 0.000511,
62
+ "loss": 0.165,
63
+ "mean_token_accuracy": 0.9550067053900825,
64
+ "num_tokens": 2836234.0,
65
+ "step": 125
66
  },
67
  {
68
+ "epoch": 1.4433734939759035,
69
+ "grad_norm": 0.0941459909081459,
70
+ "learning_rate": 0.000511,
71
+ "loss": 0.1445,
72
+ "mean_token_accuracy": 0.9606501096487046,
73
+ "num_tokens": 3403671.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  "step": 150
75
  },
76
  {
77
+ "epoch": 1.6843373493975904,
78
+ "grad_norm": 0.07419874519109726,
79
+ "learning_rate": 0.000511,
80
+ "loss": 0.1184,
81
+ "mean_token_accuracy": 0.9665295648574829,
82
+ "num_tokens": 3972278.0,
83
+ "step": 175
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  },
85
  {
86
+ "epoch": 1.9253012048192772,
87
+ "grad_norm": 0.08383649587631226,
88
+ "learning_rate": 0.000511,
89
+ "loss": 0.1309,
90
+ "mean_token_accuracy": 0.9640201306343079,
91
+ "num_tokens": 4538970.0,
 
 
 
 
 
 
 
 
 
92
  "step": 200
93
  },
94
  {
95
+ "epoch": 2.0,
96
+ "eval_loss": 0.16037927567958832,
97
+ "eval_mean_token_accuracy": 0.9557588328706458,
98
+ "eval_num_tokens": 4690728.0,
99
+ "eval_runtime": 4.2478,
100
+ "eval_samples_per_second": 86.868,
101
+ "eval_steps_per_second": 11.065,
102
  "step": 208
103
  },
104
  {
105
+ "epoch": 2.163855421686747,
106
+ "grad_norm": 0.09131479263305664,
107
+ "learning_rate": 0.000511,
108
+ "loss": 0.1127,
109
+ "mean_token_accuracy": 0.9709722676662483,
110
+ "num_tokens": 5091564.0,
111
+ "step": 225
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  },
113
  {
114
+ "epoch": 2.404819277108434,
115
+ "grad_norm": 0.09491455554962158,
116
+ "learning_rate": 0.000511,
117
+ "loss": 0.1007,
118
+ "mean_token_accuracy": 0.9710033702850341,
119
+ "num_tokens": 5659070.0,
 
 
 
 
 
 
 
 
 
120
  "step": 250
121
  },
122
  {
123
+ "epoch": 2.6457831325301204,
124
+ "grad_norm": 0.07198868691921234,
125
+ "learning_rate": 0.000511,
126
+ "loss": 0.0858,
127
+ "mean_token_accuracy": 0.9747626584768295,
128
+ "num_tokens": 6228488.0,
129
+ "step": 275
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  },
131
  {
132
+ "epoch": 2.886746987951807,
133
+ "grad_norm": 0.07914356887340546,
134
+ "learning_rate": 0.000511,
135
+ "loss": 0.0961,
136
+ "mean_token_accuracy": 0.9724871903657913,
137
+ "num_tokens": 6795848.0,
138
  "step": 300
139
  },
140
  {
141
+ "epoch": 3.0,
142
+ "eval_loss": 0.16401147842407227,
143
+ "eval_mean_token_accuracy": 0.958565741143328,
144
+ "eval_num_tokens": 7036588.0,
145
+ "eval_runtime": 4.2348,
146
+ "eval_samples_per_second": 87.135,
147
+ "eval_steps_per_second": 11.099,
 
 
 
 
 
 
 
 
 
148
  "step": 312
149
  }
150
  ],
151
+ "logging_steps": 25,
152
+ "max_steps": 728,
153
  "num_input_tokens_seen": 0,
154
  "num_train_epochs": 7,
155
  "save_steps": 500,
 
165
  "attributes": {}
166
  }
167
  },
168
+ "total_flos": 3.077171409898701e+17,
169
  "train_batch_size": 2,
170
  "trial_name": null,
171
  "trial_params": null
checkpoint-312/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb24a2e21c60358ea1e7c21423d18ace82a12d90960084fd56c8a7388a71c974
3
  size 6097
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a92f33d5ca39ba292c6b171cffeb00f4d1c361214bebb8604f8ce3482d3b7c8c
3
  size 6097
checkpoint-416/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.17.0
checkpoint-416/adapter_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 16,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.1,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "qalora_group_size": 16,
24
+ "r": 64,
25
+ "rank_pattern": {},
26
+ "revision": null,
27
+ "target_modules": [
28
+ "down_proj",
29
+ "k_proj",
30
+ "o_proj",
31
+ "q_proj",
32
+ "v_proj",
33
+ "up_proj",
34
+ "gate_proj"
35
+ ],
36
+ "target_parameters": null,
37
+ "task_type": "CAUSAL_LM",
38
+ "trainable_token_indices": null,
39
+ "use_dora": false,
40
+ "use_qalora": false,
41
+ "use_rslora": false
42
+ }
checkpoint-416/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:542d542fbe24ec80310418b793ff4ba1972ae6587fdf5669491fc92c83b08a09
3
+ size 645975704
checkpoint-416/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
checkpoint-416/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoint-416/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-416/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b165c5df29ba238a2a6fe6aee452efc2a9acfdba35ea32f0b467fd2d02c5353
3
+ size 1292087499
checkpoint-416/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:152c5038eb73ee59d2dde5d5b103ac1fbd66c3a40e654eb5c0300cac7dbc116d
3
+ size 15429
checkpoint-416/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bf88dea0c41ff8af9c7036f185396537d2c81ba9abbc7a1f1b60ece0652d2ad
3
+ size 15429
checkpoint-416/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8efe00d663f3c5a76caedcb1606763b501b981c15ca59077f9933614d1cf693e
3
+ size 15429