ThomasTheMaker committed on
Commit
e6a8f10
·
verified ·
1 Parent(s): db1e772

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. README.md +58 -0
  2. checkpoint-1000/config.json +31 -0
  3. checkpoint-1000/generation_config.json +9 -0
  4. checkpoint-1000/merges.txt +0 -0
  5. checkpoint-1000/model.safetensors +3 -0
  6. checkpoint-1000/optimizer.pt +3 -0
  7. checkpoint-1000/rng_state.pth +3 -0
  8. checkpoint-1000/scheduler.pt +3 -0
  9. checkpoint-1000/special_tokens_map.json +42 -0
  10. checkpoint-1000/tokenizer.json +0 -0
  11. checkpoint-1000/tokenizer_config.json +168 -0
  12. checkpoint-1000/trainer_state.json +134 -0
  13. checkpoint-1000/training_args.bin +3 -0
  14. checkpoint-1000/vocab.json +0 -0
  15. checkpoint-1500/config.json +31 -0
  16. checkpoint-1500/generation_config.json +9 -0
  17. checkpoint-1500/merges.txt +0 -0
  18. checkpoint-1500/model.safetensors +3 -0
  19. checkpoint-1500/optimizer.pt +3 -0
  20. checkpoint-1500/rng_state.pth +3 -0
  21. checkpoint-1500/scheduler.pt +3 -0
  22. checkpoint-1500/special_tokens_map.json +42 -0
  23. checkpoint-1500/tokenizer.json +0 -0
  24. checkpoint-1500/tokenizer_config.json +168 -0
  25. checkpoint-1500/trainer_state.json +184 -0
  26. checkpoint-1500/training_args.bin +3 -0
  27. checkpoint-1500/vocab.json +0 -0
  28. checkpoint-2000/config.json +31 -0
  29. checkpoint-2000/generation_config.json +9 -0
  30. checkpoint-2000/merges.txt +0 -0
  31. checkpoint-2000/model.safetensors +3 -0
  32. checkpoint-2000/optimizer.pt +3 -0
  33. checkpoint-2000/rng_state.pth +3 -0
  34. checkpoint-2000/scheduler.pt +3 -0
  35. checkpoint-2000/special_tokens_map.json +42 -0
  36. checkpoint-2000/tokenizer.json +0 -0
  37. checkpoint-2000/tokenizer_config.json +168 -0
  38. checkpoint-2000/trainer_state.json +234 -0
  39. checkpoint-2000/training_args.bin +3 -0
  40. checkpoint-2000/vocab.json +0 -0
  41. checkpoint-2067/config.json +31 -0
  42. checkpoint-2067/generation_config.json +9 -0
  43. checkpoint-2067/merges.txt +0 -0
  44. checkpoint-2067/model.safetensors +3 -0
  45. checkpoint-2067/optimizer.pt +3 -0
  46. checkpoint-2067/rng_state.pth +3 -0
  47. checkpoint-2067/scheduler.pt +3 -0
  48. checkpoint-2067/special_tokens_map.json +42 -0
  49. checkpoint-2067/tokenizer.json +0 -0
  50. checkpoint-2067/tokenizer_config.json +168 -0
README.md ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: HuggingFaceTB/SmolLM2-135M
3
+ library_name: transformers
4
+ model_name: SmolLM2-360M-synthetic-concise-reasoning
5
+ tags:
6
+ - generated_from_trainer
7
+ - sft
8
+ - trl
9
+ licence: license
10
+ ---
11
+
12
+ # Model Card for SmolLM2-360M-synthetic-concise-reasoning
13
+
14
+ This model is a fine-tuned version of [HuggingFaceTB/SmolLM2-135M](https://huggingface.co/HuggingFaceTB/SmolLM2-135M).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
+ generator = pipeline("text-generation", model="argilla/SmolLM2-360M-synthetic-concise-reasoning", device="cuda")
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+
31
+
32
+
33
+ This model was trained with SFT.
34
+
35
+ ### Framework versions
36
+
37
+ - TRL: 0.22.2
38
+ - Transformers: 4.56.1
39
+ - PyTorch: 2.6.0+cu118
40
+ - Datasets: 4.0.0
41
+ - Tokenizers: 0.22.0
42
+
43
+ ## Citations
44
+
45
+
46
+
47
+ Cite TRL as:
48
+
49
+ ```bibtex
50
+ @misc{vonwerra2022trl,
51
+ title = {{TRL: Transformer Reinforcement Learning}},
52
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
53
+ year = 2020,
54
+ journal = {GitHub repository},
55
+ publisher = {GitHub},
56
+ howpublished = {\url{https://github.com/huggingface/trl}}
57
+ }
58
+ ```
checkpoint-1000/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 0,
8
+ "dtype": "float32",
9
+ "eos_token_id": 0,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 576,
13
+ "initializer_range": 0.041666666666666664,
14
+ "intermediate_size": 1536,
15
+ "is_llama_config": true,
16
+ "max_position_embeddings": 8192,
17
+ "mlp_bias": false,
18
+ "model_type": "llama",
19
+ "num_attention_heads": 9,
20
+ "num_hidden_layers": 30,
21
+ "num_key_value_heads": 3,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-05,
24
+ "rope_interleaved": false,
25
+ "rope_scaling": null,
26
+ "rope_theta": 100000,
27
+ "tie_word_embeddings": true,
28
+ "transformers_version": "4.56.1",
29
+ "use_cache": true,
30
+ "vocab_size": 49152
31
+ }
checkpoint-1000/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": [
5
+ 0,
6
+ 2
7
+ ],
8
+ "transformers_version": "4.56.1"
9
+ }
checkpoint-1000/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a50959752f862381ff7f39dfccb85402123aa683c228a2be568318ccad2d4d86
3
+ size 538090408
checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5aa72712c84d57764e814f5703010be6b080fc1e84b74eaaf4dfc24bc6cba25
3
+ size 1076349050
checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405
3
+ size 14244
checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82923676bac30b43d960a8f638e325f0748321ea701521152093eaaf7a41a687
3
+ size 1064
checkpoint-1000/special_tokens_map.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|endoftext|>",
4
+ "<|im_start|>",
5
+ "<|im_end|>",
6
+ "<repo_name>",
7
+ "<reponame>",
8
+ "<file_sep>",
9
+ "<filename>",
10
+ "<gh_stars>",
11
+ "<issue_start>",
12
+ "<issue_comment>",
13
+ "<issue_closed>",
14
+ "<jupyter_start>",
15
+ "<jupyter_text>",
16
+ "<jupyter_code>",
17
+ "<jupyter_output>",
18
+ "<jupyter_script>",
19
+ "<empty_output>"
20
+ ],
21
+ "bos_token": {
22
+ "content": "<|endoftext|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false
27
+ },
28
+ "eos_token": {
29
+ "content": "<|endoftext|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false
34
+ },
35
+ "unk_token": {
36
+ "content": "<|endoftext|>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false
41
+ }
42
+ }
checkpoint-1000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1000/tokenizer_config.json ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<repo_name>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "<reponame>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "5": {
45
+ "content": "<file_sep>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "6": {
53
+ "content": "<filename>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "7": {
61
+ "content": "<gh_stars>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "8": {
69
+ "content": "<issue_start>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "9": {
77
+ "content": "<issue_comment>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "10": {
85
+ "content": "<issue_closed>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "11": {
93
+ "content": "<jupyter_start>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "12": {
101
+ "content": "<jupyter_text>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "13": {
109
+ "content": "<jupyter_code>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "14": {
117
+ "content": "<jupyter_output>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "15": {
125
+ "content": "<jupyter_script>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "16": {
133
+ "content": "<empty_output>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ }
140
+ },
141
+ "additional_special_tokens": [
142
+ "<|endoftext|>",
143
+ "<|im_start|>",
144
+ "<|im_end|>",
145
+ "<repo_name>",
146
+ "<reponame>",
147
+ "<file_sep>",
148
+ "<filename>",
149
+ "<gh_stars>",
150
+ "<issue_start>",
151
+ "<issue_comment>",
152
+ "<issue_closed>",
153
+ "<jupyter_start>",
154
+ "<jupyter_text>",
155
+ "<jupyter_code>",
156
+ "<jupyter_output>",
157
+ "<jupyter_script>",
158
+ "<empty_output>"
159
+ ],
160
+ "bos_token": "<|endoftext|>",
161
+ "clean_up_tokenization_spaces": false,
162
+ "eos_token": "<|endoftext|>",
163
+ "extra_special_tokens": {},
164
+ "model_max_length": 8192,
165
+ "tokenizer_class": "GPT2Tokenizer",
166
+ "unk_token": "<|endoftext|>",
167
+ "vocab_size": 49152
168
+ }
checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.4837929366231253,
6
+ "eval_steps": 500,
7
+ "global_step": 1000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.9665795528888703,
14
+ "epoch": 0.04837929366231253,
15
+ "grad_norm": 4.359325885772705,
16
+ "learning_rate": 4.760522496371553e-05,
17
+ "loss": 1.7769,
18
+ "mean_token_accuracy": 0.5657356014847755,
19
+ "num_tokens": 50640.0,
20
+ "step": 100
21
+ },
22
+ {
23
+ "entropy": 1.9170209395885467,
24
+ "epoch": 0.09675858732462506,
25
+ "grad_norm": 2.7793142795562744,
26
+ "learning_rate": 4.5186260280599906e-05,
27
+ "loss": 1.7405,
28
+ "mean_token_accuracy": 0.567837278842926,
29
+ "num_tokens": 99726.0,
30
+ "step": 200
31
+ },
32
+ {
33
+ "entropy": 1.8398987126350403,
34
+ "epoch": 0.14513788098693758,
35
+ "grad_norm": 2.6684820652008057,
36
+ "learning_rate": 4.276729559748428e-05,
37
+ "loss": 1.6837,
38
+ "mean_token_accuracy": 0.5797487896680832,
39
+ "num_tokens": 150973.0,
40
+ "step": 300
41
+ },
42
+ {
43
+ "entropy": 1.854783646464348,
44
+ "epoch": 0.1935171746492501,
45
+ "grad_norm": 2.2839877605438232,
46
+ "learning_rate": 4.0348330914368655e-05,
47
+ "loss": 1.6963,
48
+ "mean_token_accuracy": 0.5789705204963684,
49
+ "num_tokens": 203850.0,
50
+ "step": 400
51
+ },
52
+ {
53
+ "entropy": 1.8420159757137298,
54
+ "epoch": 0.24189646831156264,
55
+ "grad_norm": 5.425283908843994,
56
+ "learning_rate": 3.792936623125303e-05,
57
+ "loss": 1.6699,
58
+ "mean_token_accuracy": 0.5854617989063263,
59
+ "num_tokens": 251974.0,
60
+ "step": 500
61
+ },
62
+ {
63
+ "entropy": 1.8752372413873672,
64
+ "epoch": 0.29027576197387517,
65
+ "grad_norm": 3.1188437938690186,
66
+ "learning_rate": 3.55104015481374e-05,
67
+ "loss": 1.7033,
68
+ "mean_token_accuracy": 0.5757111895084381,
69
+ "num_tokens": 301563.0,
70
+ "step": 600
71
+ },
72
+ {
73
+ "entropy": 1.8328426551818848,
74
+ "epoch": 0.3386550556361877,
75
+ "grad_norm": 3.9171409606933594,
76
+ "learning_rate": 3.309143686502178e-05,
77
+ "loss": 1.6552,
78
+ "mean_token_accuracy": 0.5872596988081932,
79
+ "num_tokens": 352401.0,
80
+ "step": 700
81
+ },
82
+ {
83
+ "entropy": 1.7965506362915038,
84
+ "epoch": 0.3870343492985002,
85
+ "grad_norm": 3.305162191390991,
86
+ "learning_rate": 3.0672472181906144e-05,
87
+ "loss": 1.6148,
88
+ "mean_token_accuracy": 0.591159172654152,
89
+ "num_tokens": 401032.0,
90
+ "step": 800
91
+ },
92
+ {
93
+ "entropy": 1.828446706533432,
94
+ "epoch": 0.43541364296081275,
95
+ "grad_norm": 3.2749598026275635,
96
+ "learning_rate": 2.8253507498790522e-05,
97
+ "loss": 1.6748,
98
+ "mean_token_accuracy": 0.5816510277986526,
99
+ "num_tokens": 453704.0,
100
+ "step": 900
101
+ },
102
+ {
103
+ "entropy": 1.8458160006999969,
104
+ "epoch": 0.4837929366231253,
105
+ "grad_norm": 4.897568702697754,
106
+ "learning_rate": 2.5834542815674896e-05,
107
+ "loss": 1.6887,
108
+ "mean_token_accuracy": 0.5766134199500084,
109
+ "num_tokens": 503598.0,
110
+ "step": 1000
111
+ }
112
+ ],
113
+ "logging_steps": 100,
114
+ "max_steps": 2067,
115
+ "num_input_tokens_seen": 0,
116
+ "num_train_epochs": 1,
117
+ "save_steps": 500,
118
+ "stateful_callbacks": {
119
+ "TrainerControl": {
120
+ "args": {
121
+ "should_epoch_stop": false,
122
+ "should_evaluate": false,
123
+ "should_log": false,
124
+ "should_save": true,
125
+ "should_training_stop": false
126
+ },
127
+ "attributes": {}
128
+ }
129
+ },
130
+ "total_flos": 427061513942784.0,
131
+ "train_batch_size": 2,
132
+ "trial_name": null,
133
+ "trial_params": null
134
+ }
checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0243dd8856240fbf0fd7008bf18425c39b8b15885c5015aadc422caea0037a44
3
+ size 5752
checkpoint-1000/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1500/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 0,
8
+ "dtype": "float32",
9
+ "eos_token_id": 0,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 576,
13
+ "initializer_range": 0.041666666666666664,
14
+ "intermediate_size": 1536,
15
+ "is_llama_config": true,
16
+ "max_position_embeddings": 8192,
17
+ "mlp_bias": false,
18
+ "model_type": "llama",
19
+ "num_attention_heads": 9,
20
+ "num_hidden_layers": 30,
21
+ "num_key_value_heads": 3,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-05,
24
+ "rope_interleaved": false,
25
+ "rope_scaling": null,
26
+ "rope_theta": 100000,
27
+ "tie_word_embeddings": true,
28
+ "transformers_version": "4.56.1",
29
+ "use_cache": true,
30
+ "vocab_size": 49152
31
+ }
checkpoint-1500/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": [
5
+ 0,
6
+ 2
7
+ ],
8
+ "transformers_version": "4.56.1"
9
+ }
checkpoint-1500/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1500/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91f8300ae0214997e3ac0b2c45d763ac14b0b94bc62a09a4ba6028845abdc6d0
3
+ size 538090408
checkpoint-1500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab2984dd7b044339b6b5b36845e08c04249dad9ab18d87ff0a6305d118b41031
3
+ size 1076349050
checkpoint-1500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405
3
+ size 14244
checkpoint-1500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05c5a8254ba3982d5a11403ecd64e131e7bc842516095781dbccca9a89adde29
3
+ size 1064
checkpoint-1500/special_tokens_map.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|endoftext|>",
4
+ "<|im_start|>",
5
+ "<|im_end|>",
6
+ "<repo_name>",
7
+ "<reponame>",
8
+ "<file_sep>",
9
+ "<filename>",
10
+ "<gh_stars>",
11
+ "<issue_start>",
12
+ "<issue_comment>",
13
+ "<issue_closed>",
14
+ "<jupyter_start>",
15
+ "<jupyter_text>",
16
+ "<jupyter_code>",
17
+ "<jupyter_output>",
18
+ "<jupyter_script>",
19
+ "<empty_output>"
20
+ ],
21
+ "bos_token": {
22
+ "content": "<|endoftext|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false
27
+ },
28
+ "eos_token": {
29
+ "content": "<|endoftext|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false
34
+ },
35
+ "unk_token": {
36
+ "content": "<|endoftext|>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false
41
+ }
42
+ }
checkpoint-1500/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-1500/tokenizer_config.json ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<repo_name>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "<reponame>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "5": {
45
+ "content": "<file_sep>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "6": {
53
+ "content": "<filename>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "7": {
61
+ "content": "<gh_stars>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "8": {
69
+ "content": "<issue_start>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "9": {
77
+ "content": "<issue_comment>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "10": {
85
+ "content": "<issue_closed>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "11": {
93
+ "content": "<jupyter_start>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "12": {
101
+ "content": "<jupyter_text>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "13": {
109
+ "content": "<jupyter_code>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "14": {
117
+ "content": "<jupyter_output>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "15": {
125
+ "content": "<jupyter_script>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "16": {
133
+ "content": "<empty_output>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ }
140
+ },
141
+ "additional_special_tokens": [
142
+ "<|endoftext|>",
143
+ "<|im_start|>",
144
+ "<|im_end|>",
145
+ "<repo_name>",
146
+ "<reponame>",
147
+ "<file_sep>",
148
+ "<filename>",
149
+ "<gh_stars>",
150
+ "<issue_start>",
151
+ "<issue_comment>",
152
+ "<issue_closed>",
153
+ "<jupyter_start>",
154
+ "<jupyter_text>",
155
+ "<jupyter_code>",
156
+ "<jupyter_output>",
157
+ "<jupyter_script>",
158
+ "<empty_output>"
159
+ ],
160
+ "bos_token": "<|endoftext|>",
161
+ "clean_up_tokenization_spaces": false,
162
+ "eos_token": "<|endoftext|>",
163
+ "extra_special_tokens": {},
164
+ "model_max_length": 8192,
165
+ "tokenizer_class": "GPT2Tokenizer",
166
+ "unk_token": "<|endoftext|>",
167
+ "vocab_size": 49152
168
+ }
checkpoint-1500/trainer_state.json ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.7256894049346879,
6
+ "eval_steps": 500,
7
+ "global_step": 1500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.9665795528888703,
14
+ "epoch": 0.04837929366231253,
15
+ "grad_norm": 4.359325885772705,
16
+ "learning_rate": 4.760522496371553e-05,
17
+ "loss": 1.7769,
18
+ "mean_token_accuracy": 0.5657356014847755,
19
+ "num_tokens": 50640.0,
20
+ "step": 100
21
+ },
22
+ {
23
+ "entropy": 1.9170209395885467,
24
+ "epoch": 0.09675858732462506,
25
+ "grad_norm": 2.7793142795562744,
26
+ "learning_rate": 4.5186260280599906e-05,
27
+ "loss": 1.7405,
28
+ "mean_token_accuracy": 0.567837278842926,
29
+ "num_tokens": 99726.0,
30
+ "step": 200
31
+ },
32
+ {
33
+ "entropy": 1.8398987126350403,
34
+ "epoch": 0.14513788098693758,
35
+ "grad_norm": 2.6684820652008057,
36
+ "learning_rate": 4.276729559748428e-05,
37
+ "loss": 1.6837,
38
+ "mean_token_accuracy": 0.5797487896680832,
39
+ "num_tokens": 150973.0,
40
+ "step": 300
41
+ },
42
+ {
43
+ "entropy": 1.854783646464348,
44
+ "epoch": 0.1935171746492501,
45
+ "grad_norm": 2.2839877605438232,
46
+ "learning_rate": 4.0348330914368655e-05,
47
+ "loss": 1.6963,
48
+ "mean_token_accuracy": 0.5789705204963684,
49
+ "num_tokens": 203850.0,
50
+ "step": 400
51
+ },
52
+ {
53
+ "entropy": 1.8420159757137298,
54
+ "epoch": 0.24189646831156264,
55
+ "grad_norm": 5.425283908843994,
56
+ "learning_rate": 3.792936623125303e-05,
57
+ "loss": 1.6699,
58
+ "mean_token_accuracy": 0.5854617989063263,
59
+ "num_tokens": 251974.0,
60
+ "step": 500
61
+ },
62
+ {
63
+ "entropy": 1.8752372413873672,
64
+ "epoch": 0.29027576197387517,
65
+ "grad_norm": 3.1188437938690186,
66
+ "learning_rate": 3.55104015481374e-05,
67
+ "loss": 1.7033,
68
+ "mean_token_accuracy": 0.5757111895084381,
69
+ "num_tokens": 301563.0,
70
+ "step": 600
71
+ },
72
+ {
73
+ "entropy": 1.8328426551818848,
74
+ "epoch": 0.3386550556361877,
75
+ "grad_norm": 3.9171409606933594,
76
+ "learning_rate": 3.309143686502178e-05,
77
+ "loss": 1.6552,
78
+ "mean_token_accuracy": 0.5872596988081932,
79
+ "num_tokens": 352401.0,
80
+ "step": 700
81
+ },
82
+ {
83
+ "entropy": 1.7965506362915038,
84
+ "epoch": 0.3870343492985002,
85
+ "grad_norm": 3.305162191390991,
86
+ "learning_rate": 3.0672472181906144e-05,
87
+ "loss": 1.6148,
88
+ "mean_token_accuracy": 0.591159172654152,
89
+ "num_tokens": 401032.0,
90
+ "step": 800
91
+ },
92
+ {
93
+ "entropy": 1.828446706533432,
94
+ "epoch": 0.43541364296081275,
95
+ "grad_norm": 3.2749598026275635,
96
+ "learning_rate": 2.8253507498790522e-05,
97
+ "loss": 1.6748,
98
+ "mean_token_accuracy": 0.5816510277986526,
99
+ "num_tokens": 453704.0,
100
+ "step": 900
101
+ },
102
+ {
103
+ "entropy": 1.8458160006999969,
104
+ "epoch": 0.4837929366231253,
105
+ "grad_norm": 4.897568702697754,
106
+ "learning_rate": 2.5834542815674896e-05,
107
+ "loss": 1.6887,
108
+ "mean_token_accuracy": 0.5766134199500084,
109
+ "num_tokens": 503598.0,
110
+ "step": 1000
111
+ },
112
+ {
113
+ "entropy": 1.7846248948574066,
114
+ "epoch": 0.5321722302854378,
115
+ "grad_norm": 2.791334629058838,
116
+ "learning_rate": 2.3415578132559267e-05,
117
+ "loss": 1.6216,
118
+ "mean_token_accuracy": 0.5865279313921928,
119
+ "num_tokens": 556410.0,
120
+ "step": 1100
121
+ },
122
+ {
123
+ "entropy": 1.8374267256259917,
124
+ "epoch": 0.5805515239477503,
125
+ "grad_norm": 3.3878092765808105,
126
+ "learning_rate": 2.0996613449443638e-05,
127
+ "loss": 1.6959,
128
+ "mean_token_accuracy": 0.5801611566543579,
129
+ "num_tokens": 609825.0,
130
+ "step": 1200
131
+ },
132
+ {
133
+ "entropy": 1.8186497938632966,
134
+ "epoch": 0.6289308176100629,
135
+ "grad_norm": 2.651001214981079,
136
+ "learning_rate": 1.8577648766328012e-05,
137
+ "loss": 1.6508,
138
+ "mean_token_accuracy": 0.5890785497426987,
139
+ "num_tokens": 661304.0,
140
+ "step": 1300
141
+ },
142
+ {
143
+ "entropy": 1.83908866584301,
144
+ "epoch": 0.6773101112723754,
145
+ "grad_norm": 3.2351417541503906,
146
+ "learning_rate": 1.6158684083212386e-05,
147
+ "loss": 1.6724,
148
+ "mean_token_accuracy": 0.5820642611384392,
149
+ "num_tokens": 711260.0,
150
+ "step": 1400
151
+ },
152
+ {
153
+ "entropy": 1.801530545949936,
154
+ "epoch": 0.7256894049346879,
155
+ "grad_norm": 2.7686285972595215,
156
+ "learning_rate": 1.373971940009676e-05,
157
+ "loss": 1.6177,
158
+ "mean_token_accuracy": 0.5914771935343742,
159
+ "num_tokens": 761107.0,
160
+ "step": 1500
161
+ }
162
+ ],
163
+ "logging_steps": 100,
164
+ "max_steps": 2067,
165
+ "num_input_tokens_seen": 0,
166
+ "num_train_epochs": 1,
167
+ "save_steps": 500,
168
+ "stateful_callbacks": {
169
+ "TrainerControl": {
170
+ "args": {
171
+ "should_epoch_stop": false,
172
+ "should_evaluate": false,
173
+ "should_log": false,
174
+ "should_save": true,
175
+ "should_training_stop": false
176
+ },
177
+ "attributes": {}
178
+ }
179
+ },
180
+ "total_flos": 646531805394432.0,
181
+ "train_batch_size": 2,
182
+ "trial_name": null,
183
+ "trial_params": null
184
+ }
checkpoint-1500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0243dd8856240fbf0fd7008bf18425c39b8b15885c5015aadc422caea0037a44
3
+ size 5752
checkpoint-1500/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2000/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 0,
8
+ "dtype": "float32",
9
+ "eos_token_id": 0,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 576,
13
+ "initializer_range": 0.041666666666666664,
14
+ "intermediate_size": 1536,
15
+ "is_llama_config": true,
16
+ "max_position_embeddings": 8192,
17
+ "mlp_bias": false,
18
+ "model_type": "llama",
19
+ "num_attention_heads": 9,
20
+ "num_hidden_layers": 30,
21
+ "num_key_value_heads": 3,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-05,
24
+ "rope_interleaved": false,
25
+ "rope_scaling": null,
26
+ "rope_theta": 100000,
27
+ "tie_word_embeddings": true,
28
+ "transformers_version": "4.56.1",
29
+ "use_cache": true,
30
+ "vocab_size": 49152
31
+ }
checkpoint-2000/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": [
5
+ 0,
6
+ 2
7
+ ],
8
+ "transformers_version": "4.56.1"
9
+ }
checkpoint-2000/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:584e31ce045b77e2d8caa1ae7e8e7ee76fdba17301428089ffca350c7fb7337e
3
+ size 538090408
checkpoint-2000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a58cdd2b12225dd40c4962180667e3bd9db0c8c8239fc00a63ff0a79863ae6f8
3
+ size 1076349050
checkpoint-2000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405
3
+ size 14244
checkpoint-2000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42928213bfcfe4bec3101791e0c7e3efce3a6ae55386c28d08b0a8b5952cf77d
3
+ size 1064
checkpoint-2000/special_tokens_map.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|endoftext|>",
4
+ "<|im_start|>",
5
+ "<|im_end|>",
6
+ "<repo_name>",
7
+ "<reponame>",
8
+ "<file_sep>",
9
+ "<filename>",
10
+ "<gh_stars>",
11
+ "<issue_start>",
12
+ "<issue_comment>",
13
+ "<issue_closed>",
14
+ "<jupyter_start>",
15
+ "<jupyter_text>",
16
+ "<jupyter_code>",
17
+ "<jupyter_output>",
18
+ "<jupyter_script>",
19
+ "<empty_output>"
20
+ ],
21
+ "bos_token": {
22
+ "content": "<|endoftext|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false
27
+ },
28
+ "eos_token": {
29
+ "content": "<|endoftext|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false
34
+ },
35
+ "unk_token": {
36
+ "content": "<|endoftext|>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false
41
+ }
42
+ }
checkpoint-2000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2000/tokenizer_config.json ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<repo_name>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "<reponame>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "5": {
45
+ "content": "<file_sep>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "6": {
53
+ "content": "<filename>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "7": {
61
+ "content": "<gh_stars>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "8": {
69
+ "content": "<issue_start>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "9": {
77
+ "content": "<issue_comment>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "10": {
85
+ "content": "<issue_closed>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "11": {
93
+ "content": "<jupyter_start>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "12": {
101
+ "content": "<jupyter_text>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "13": {
109
+ "content": "<jupyter_code>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "14": {
117
+ "content": "<jupyter_output>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "15": {
125
+ "content": "<jupyter_script>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "16": {
133
+ "content": "<empty_output>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ }
140
+ },
141
+ "additional_special_tokens": [
142
+ "<|endoftext|>",
143
+ "<|im_start|>",
144
+ "<|im_end|>",
145
+ "<repo_name>",
146
+ "<reponame>",
147
+ "<file_sep>",
148
+ "<filename>",
149
+ "<gh_stars>",
150
+ "<issue_start>",
151
+ "<issue_comment>",
152
+ "<issue_closed>",
153
+ "<jupyter_start>",
154
+ "<jupyter_text>",
155
+ "<jupyter_code>",
156
+ "<jupyter_output>",
157
+ "<jupyter_script>",
158
+ "<empty_output>"
159
+ ],
160
+ "bos_token": "<|endoftext|>",
161
+ "clean_up_tokenization_spaces": false,
162
+ "eos_token": "<|endoftext|>",
163
+ "extra_special_tokens": {},
164
+ "model_max_length": 8192,
165
+ "tokenizer_class": "GPT2Tokenizer",
166
+ "unk_token": "<|endoftext|>",
167
+ "vocab_size": 49152
168
+ }
checkpoint-2000/trainer_state.json ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.9675858732462506,
6
+ "eval_steps": 500,
7
+ "global_step": 2000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.9665795528888703,
14
+ "epoch": 0.04837929366231253,
15
+ "grad_norm": 4.359325885772705,
16
+ "learning_rate": 4.760522496371553e-05,
17
+ "loss": 1.7769,
18
+ "mean_token_accuracy": 0.5657356014847755,
19
+ "num_tokens": 50640.0,
20
+ "step": 100
21
+ },
22
+ {
23
+ "entropy": 1.9170209395885467,
24
+ "epoch": 0.09675858732462506,
25
+ "grad_norm": 2.7793142795562744,
26
+ "learning_rate": 4.5186260280599906e-05,
27
+ "loss": 1.7405,
28
+ "mean_token_accuracy": 0.567837278842926,
29
+ "num_tokens": 99726.0,
30
+ "step": 200
31
+ },
32
+ {
33
+ "entropy": 1.8398987126350403,
34
+ "epoch": 0.14513788098693758,
35
+ "grad_norm": 2.6684820652008057,
36
+ "learning_rate": 4.276729559748428e-05,
37
+ "loss": 1.6837,
38
+ "mean_token_accuracy": 0.5797487896680832,
39
+ "num_tokens": 150973.0,
40
+ "step": 300
41
+ },
42
+ {
43
+ "entropy": 1.854783646464348,
44
+ "epoch": 0.1935171746492501,
45
+ "grad_norm": 2.2839877605438232,
46
+ "learning_rate": 4.0348330914368655e-05,
47
+ "loss": 1.6963,
48
+ "mean_token_accuracy": 0.5789705204963684,
49
+ "num_tokens": 203850.0,
50
+ "step": 400
51
+ },
52
+ {
53
+ "entropy": 1.8420159757137298,
54
+ "epoch": 0.24189646831156264,
55
+ "grad_norm": 5.425283908843994,
56
+ "learning_rate": 3.792936623125303e-05,
57
+ "loss": 1.6699,
58
+ "mean_token_accuracy": 0.5854617989063263,
59
+ "num_tokens": 251974.0,
60
+ "step": 500
61
+ },
62
+ {
63
+ "entropy": 1.8752372413873672,
64
+ "epoch": 0.29027576197387517,
65
+ "grad_norm": 3.1188437938690186,
66
+ "learning_rate": 3.55104015481374e-05,
67
+ "loss": 1.7033,
68
+ "mean_token_accuracy": 0.5757111895084381,
69
+ "num_tokens": 301563.0,
70
+ "step": 600
71
+ },
72
+ {
73
+ "entropy": 1.8328426551818848,
74
+ "epoch": 0.3386550556361877,
75
+ "grad_norm": 3.9171409606933594,
76
+ "learning_rate": 3.309143686502178e-05,
77
+ "loss": 1.6552,
78
+ "mean_token_accuracy": 0.5872596988081932,
79
+ "num_tokens": 352401.0,
80
+ "step": 700
81
+ },
82
+ {
83
+ "entropy": 1.7965506362915038,
84
+ "epoch": 0.3870343492985002,
85
+ "grad_norm": 3.305162191390991,
86
+ "learning_rate": 3.0672472181906144e-05,
87
+ "loss": 1.6148,
88
+ "mean_token_accuracy": 0.591159172654152,
89
+ "num_tokens": 401032.0,
90
+ "step": 800
91
+ },
92
+ {
93
+ "entropy": 1.828446706533432,
94
+ "epoch": 0.43541364296081275,
95
+ "grad_norm": 3.2749598026275635,
96
+ "learning_rate": 2.8253507498790522e-05,
97
+ "loss": 1.6748,
98
+ "mean_token_accuracy": 0.5816510277986526,
99
+ "num_tokens": 453704.0,
100
+ "step": 900
101
+ },
102
+ {
103
+ "entropy": 1.8458160006999969,
104
+ "epoch": 0.4837929366231253,
105
+ "grad_norm": 4.897568702697754,
106
+ "learning_rate": 2.5834542815674896e-05,
107
+ "loss": 1.6887,
108
+ "mean_token_accuracy": 0.5766134199500084,
109
+ "num_tokens": 503598.0,
110
+ "step": 1000
111
+ },
112
+ {
113
+ "entropy": 1.7846248948574066,
114
+ "epoch": 0.5321722302854378,
115
+ "grad_norm": 2.791334629058838,
116
+ "learning_rate": 2.3415578132559267e-05,
117
+ "loss": 1.6216,
118
+ "mean_token_accuracy": 0.5865279313921928,
119
+ "num_tokens": 556410.0,
120
+ "step": 1100
121
+ },
122
+ {
123
+ "entropy": 1.8374267256259917,
124
+ "epoch": 0.5805515239477503,
125
+ "grad_norm": 3.3878092765808105,
126
+ "learning_rate": 2.0996613449443638e-05,
127
+ "loss": 1.6959,
128
+ "mean_token_accuracy": 0.5801611566543579,
129
+ "num_tokens": 609825.0,
130
+ "step": 1200
131
+ },
132
+ {
133
+ "entropy": 1.8186497938632966,
134
+ "epoch": 0.6289308176100629,
135
+ "grad_norm": 2.651001214981079,
136
+ "learning_rate": 1.8577648766328012e-05,
137
+ "loss": 1.6508,
138
+ "mean_token_accuracy": 0.5890785497426987,
139
+ "num_tokens": 661304.0,
140
+ "step": 1300
141
+ },
142
+ {
143
+ "entropy": 1.83908866584301,
144
+ "epoch": 0.6773101112723754,
145
+ "grad_norm": 3.2351417541503906,
146
+ "learning_rate": 1.6158684083212386e-05,
147
+ "loss": 1.6724,
148
+ "mean_token_accuracy": 0.5820642611384392,
149
+ "num_tokens": 711260.0,
150
+ "step": 1400
151
+ },
152
+ {
153
+ "entropy": 1.801530545949936,
154
+ "epoch": 0.7256894049346879,
155
+ "grad_norm": 2.7686285972595215,
156
+ "learning_rate": 1.373971940009676e-05,
157
+ "loss": 1.6177,
158
+ "mean_token_accuracy": 0.5914771935343742,
159
+ "num_tokens": 761107.0,
160
+ "step": 1500
161
+ },
162
+ {
163
+ "entropy": 1.8067762792110442,
164
+ "epoch": 0.7740686985970004,
165
+ "grad_norm": 4.601089000701904,
166
+ "learning_rate": 1.1320754716981132e-05,
167
+ "loss": 1.6515,
168
+ "mean_token_accuracy": 0.5850321623682976,
169
+ "num_tokens": 810527.0,
170
+ "step": 1600
171
+ },
172
+ {
173
+ "entropy": 1.8136928272247315,
174
+ "epoch": 0.822447992259313,
175
+ "grad_norm": 2.1550955772399902,
176
+ "learning_rate": 8.901790033865507e-06,
177
+ "loss": 1.6373,
178
+ "mean_token_accuracy": 0.5840527075529098,
179
+ "num_tokens": 862714.0,
180
+ "step": 1700
181
+ },
182
+ {
183
+ "entropy": 1.7681069767475128,
184
+ "epoch": 0.8708272859216255,
185
+ "grad_norm": 2.4196958541870117,
186
+ "learning_rate": 6.48282535074988e-06,
187
+ "loss": 1.5913,
188
+ "mean_token_accuracy": 0.6007963755726814,
189
+ "num_tokens": 914855.0,
190
+ "step": 1800
191
+ },
192
+ {
193
+ "entropy": 1.838038477897644,
194
+ "epoch": 0.919206579583938,
195
+ "grad_norm": 5.095026016235352,
196
+ "learning_rate": 4.063860667634252e-06,
197
+ "loss": 1.6622,
198
+ "mean_token_accuracy": 0.5819806972146034,
199
+ "num_tokens": 966292.0,
200
+ "step": 1900
201
+ },
202
+ {
203
+ "entropy": 1.8221340310573577,
204
+ "epoch": 0.9675858732462506,
205
+ "grad_norm": 2.757889747619629,
206
+ "learning_rate": 1.6448959845186262e-06,
207
+ "loss": 1.6351,
208
+ "mean_token_accuracy": 0.5879274764657021,
209
+ "num_tokens": 1014330.0,
210
+ "step": 2000
211
+ }
212
+ ],
213
+ "logging_steps": 100,
214
+ "max_steps": 2067,
215
+ "num_input_tokens_seen": 0,
216
+ "num_train_epochs": 1,
217
+ "save_steps": 500,
218
+ "stateful_callbacks": {
219
+ "TrainerControl": {
220
+ "args": {
221
+ "should_epoch_stop": false,
222
+ "should_evaluate": false,
223
+ "should_log": false,
224
+ "should_save": true,
225
+ "should_training_stop": false
226
+ },
227
+ "attributes": {}
228
+ }
229
+ },
230
+ "total_flos": 861103143827712.0,
231
+ "train_batch_size": 2,
232
+ "trial_name": null,
233
+ "trial_params": null
234
+ }
checkpoint-2000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0243dd8856240fbf0fd7008bf18425c39b8b15885c5015aadc422caea0037a44
3
+ size 5752
checkpoint-2000/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2067/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 0,
8
+ "dtype": "float32",
9
+ "eos_token_id": 0,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 576,
13
+ "initializer_range": 0.041666666666666664,
14
+ "intermediate_size": 1536,
15
+ "is_llama_config": true,
16
+ "max_position_embeddings": 8192,
17
+ "mlp_bias": false,
18
+ "model_type": "llama",
19
+ "num_attention_heads": 9,
20
+ "num_hidden_layers": 30,
21
+ "num_key_value_heads": 3,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-05,
24
+ "rope_interleaved": false,
25
+ "rope_scaling": null,
26
+ "rope_theta": 100000,
27
+ "tie_word_embeddings": true,
28
+ "transformers_version": "4.56.1",
29
+ "use_cache": true,
30
+ "vocab_size": 49152
31
+ }
checkpoint-2067/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": [
5
+ 0,
6
+ 2
7
+ ],
8
+ "transformers_version": "4.56.1"
9
+ }
checkpoint-2067/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2067/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d3bedef1d4d4cb4182ef659f6bad3a5970545a62dfd847b56ceab9a66747861
3
+ size 538090408
checkpoint-2067/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88abf750276bd19226157a24e3b489d79535dad0710c1bf6b9b2cd9bf274aa6c
3
+ size 1076349050
checkpoint-2067/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405
3
+ size 14244
checkpoint-2067/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66a8911dbca2c60588eab8a3220f7ca25f8eb8068ad402110ebe854c5ac21ac6
3
+ size 1064
checkpoint-2067/special_tokens_map.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|endoftext|>",
4
+ "<|im_start|>",
5
+ "<|im_end|>",
6
+ "<repo_name>",
7
+ "<reponame>",
8
+ "<file_sep>",
9
+ "<filename>",
10
+ "<gh_stars>",
11
+ "<issue_start>",
12
+ "<issue_comment>",
13
+ "<issue_closed>",
14
+ "<jupyter_start>",
15
+ "<jupyter_text>",
16
+ "<jupyter_code>",
17
+ "<jupyter_output>",
18
+ "<jupyter_script>",
19
+ "<empty_output>"
20
+ ],
21
+ "bos_token": {
22
+ "content": "<|endoftext|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false
27
+ },
28
+ "eos_token": {
29
+ "content": "<|endoftext|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false
34
+ },
35
+ "unk_token": {
36
+ "content": "<|endoftext|>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false
41
+ }
42
+ }
checkpoint-2067/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-2067/tokenizer_config.json ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<repo_name>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "<reponame>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "5": {
45
+ "content": "<file_sep>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "6": {
53
+ "content": "<filename>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "7": {
61
+ "content": "<gh_stars>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "8": {
69
+ "content": "<issue_start>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "9": {
77
+ "content": "<issue_comment>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "10": {
85
+ "content": "<issue_closed>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "11": {
93
+ "content": "<jupyter_start>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "12": {
101
+ "content": "<jupyter_text>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "13": {
109
+ "content": "<jupyter_code>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "14": {
117
+ "content": "<jupyter_output>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "15": {
125
+ "content": "<jupyter_script>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "16": {
133
+ "content": "<empty_output>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ }
140
+ },
141
+ "additional_special_tokens": [
142
+ "<|endoftext|>",
143
+ "<|im_start|>",
144
+ "<|im_end|>",
145
+ "<repo_name>",
146
+ "<reponame>",
147
+ "<file_sep>",
148
+ "<filename>",
149
+ "<gh_stars>",
150
+ "<issue_start>",
151
+ "<issue_comment>",
152
+ "<issue_closed>",
153
+ "<jupyter_start>",
154
+ "<jupyter_text>",
155
+ "<jupyter_code>",
156
+ "<jupyter_output>",
157
+ "<jupyter_script>",
158
+ "<empty_output>"
159
+ ],
160
+ "bos_token": "<|endoftext|>",
161
+ "clean_up_tokenization_spaces": false,
162
+ "eos_token": "<|endoftext|>",
163
+ "extra_special_tokens": {},
164
+ "model_max_length": 8192,
165
+ "tokenizer_class": "GPT2Tokenizer",
166
+ "unk_token": "<|endoftext|>",
167
+ "vocab_size": 49152
168
+ }