Lorry0727 committed on
Commit
3e201dd
·
verified ·
1 Parent(s): 006bae1

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: other
4
+ base_model: Qwen/Qwen2-Math-1.5B
5
+ tags:
6
+ - llama-factory
7
+ - full
8
+ - generated_from_trainer
9
+ model-index:
10
+ - name: sft
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # sft
18
+
19
+ This model is a fine-tuned version of [Qwen/Qwen2-Math-1.5B](https://huggingface.co/Qwen/Qwen2-Math-1.5B) on a randomly selected 50K-example subset of the [meta-math/MetaMathQA](https://huggingface.co/datasets/meta-math/MetaMathQA) dataset.
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate: 1e-05
39
+ - train_batch_size: 16
40
+ - eval_batch_size: 8
41
+ - seed: 42
42
+ - distributed_type: multi-GPU
43
+ - num_devices: 4
44
+ - gradient_accumulation_steps: 2
45
+ - total_train_batch_size: 128
46
+ - total_eval_batch_size: 32
47
+ - optimizer: adamw_torch with betas=(0.9,0.999), epsilon=1e-08, and no additional optimizer arguments
48
+ - lr_scheduler_type: cosine
49
+ - lr_scheduler_warmup_ratio: 0.1
50
+ - num_epochs: 1.0
51
+
52
+ ### Training results
53
+
54
+
55
+
56
+ ### Framework versions
57
+
58
+ - Transformers 4.52.4
59
+ - Pytorch 2.7.1+cu126
60
+ - Datasets 3.6.0
61
+ - Tokenizers 0.21.1
added_tokens.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "<|endoftext|>": 151643,
3
+ "<|im_end|>": 151645,
4
+ "<|im_start|>": 151644
5
+ }
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 24007129694208.0,
4
+ "train_loss": 0.18847447641365364,
5
+ "train_runtime": 1067.9437,
6
+ "train_samples_per_second": 46.819,
7
+ "train_steps_per_second": 0.366
8
+ }
chat_template.jinja ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system
2
+ You are a helpful assistant<|im_end|>
3
+ ' }}{% endif %}{{'<|im_start|>' + message['role'] + '
4
+ ' + message['content'] + '<|im_end|>' + '
5
+ '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
6
+ ' }}{% endif %}
config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": 151643,
7
+ "eos_token_id": 151643,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 1536,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 8960,
12
+ "max_position_embeddings": 4096,
13
+ "max_window_layers": 21,
14
+ "model_type": "qwen2",
15
+ "num_attention_heads": 12,
16
+ "num_hidden_layers": 28,
17
+ "num_key_value_heads": 2,
18
+ "rms_norm_eps": 1e-06,
19
+ "rope_scaling": null,
20
+ "rope_theta": 10000,
21
+ "sliding_window": 32768,
22
+ "tie_word_embeddings": true,
23
+ "torch_dtype": "bfloat16",
24
+ "transformers_version": "4.52.4",
25
+ "use_cache": false,
26
+ "use_mrope": false,
27
+ "use_sliding_window": false,
28
+ "vocab_size": 151936
29
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "eos_token_id": 151643,
4
+ "max_new_tokens": 2048,
5
+ "transformers_version": "4.52.4"
6
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b80bddb2b1100803e73bd13b1405e04ed80f6c70b42b3d6ef4c296f56f47810
3
+ size 3087467144
runs/Jun05_16-20-01_a800/events.out.tfevents.1749140416.a800 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3bec90c3c46eba3b46098db51bf75c94707dde729f3d349d04952a98fbbe393c
3
+ size 9331
runs/Jun05_16-31-58_a800/events.out.tfevents.1749141132.a800 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a8f1db1fb65739143475e08a5ab820ffe6ceeb610b009c0eddee2b8bb5786ec
3
+ size 13906
special_tokens_map.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>"
5
+ ],
6
+ "eos_token": {
7
+ "content": "<|im_end|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false
12
+ },
13
+ "pad_token": {
14
+ "content": "<|endoftext|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false
19
+ }
20
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bcfe42da0a4497e8b2b172c1f9f4ec423a46dc12907f4349c55025f670422ba9
3
+ size 11418266
tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ }
28
+ },
29
+ "additional_special_tokens": [
30
+ "<|im_start|>",
31
+ "<|im_end|>"
32
+ ],
33
+ "bos_token": null,
34
+ "clean_up_tokenization_spaces": false,
35
+ "eos_token": "<|im_end|>",
36
+ "errors": "replace",
37
+ "extra_special_tokens": {},
38
+ "model_max_length": 32768,
39
+ "pad_token": "<|endoftext|>",
40
+ "padding_side": "right",
41
+ "split_special_tokens": false,
42
+ "tokenizer_class": "Qwen2Tokenizer",
43
+ "unk_token": null
44
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 24007129694208.0,
4
+ "train_loss": 0.18847447641365364,
5
+ "train_runtime": 1067.9437,
6
+ "train_samples_per_second": 46.819,
7
+ "train_steps_per_second": 0.366
8
+ }
trainer_log.jsonl ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 10, "total_steps": 391, "loss": 0.5442, "lr": 2.25e-06, "epoch": 0.02557544757033248, "percentage": 2.56, "elapsed_time": "0:00:31", "remaining_time": "0:20:00"}
2
+ {"current_steps": 20, "total_steps": 391, "loss": 0.4196, "lr": 4.75e-06, "epoch": 0.05115089514066496, "percentage": 5.12, "elapsed_time": "0:00:56", "remaining_time": "0:17:28"}
3
+ {"current_steps": 30, "total_steps": 391, "loss": 0.2928, "lr": 7.25e-06, "epoch": 0.07672634271099744, "percentage": 7.67, "elapsed_time": "0:01:22", "remaining_time": "0:16:31"}
4
+ {"current_steps": 40, "total_steps": 391, "loss": 0.2508, "lr": 9.75e-06, "epoch": 0.10230179028132992, "percentage": 10.23, "elapsed_time": "0:01:48", "remaining_time": "0:15:53"}
5
+ {"current_steps": 50, "total_steps": 391, "loss": 0.2316, "lr": 9.983786540671052e-06, "epoch": 0.1278772378516624, "percentage": 12.79, "elapsed_time": "0:02:15", "remaining_time": "0:15:22"}
6
+ {"current_steps": 60, "total_steps": 391, "loss": 0.2198, "lr": 9.927874998629714e-06, "epoch": 0.1534526854219949, "percentage": 15.35, "elapsed_time": "0:02:41", "remaining_time": "0:14:50"}
7
+ {"current_steps": 70, "total_steps": 391, "loss": 0.2047, "lr": 9.83251270794707e-06, "epoch": 0.17902813299232737, "percentage": 17.9, "elapsed_time": "0:03:07", "remaining_time": "0:14:17"}
8
+ {"current_steps": 80, "total_steps": 391, "loss": 0.2058, "lr": 9.698463103929542e-06, "epoch": 0.20460358056265984, "percentage": 20.46, "elapsed_time": "0:03:34", "remaining_time": "0:13:52"}
9
+ {"current_steps": 90, "total_steps": 391, "loss": 0.1903, "lr": 9.526799338236828e-06, "epoch": 0.23017902813299232, "percentage": 23.02, "elapsed_time": "0:03:59", "remaining_time": "0:13:21"}
10
+ {"current_steps": 100, "total_steps": 391, "loss": 0.1835, "lr": 9.318895687625752e-06, "epoch": 0.2557544757033248, "percentage": 25.58, "elapsed_time": "0:04:26", "remaining_time": "0:12:54"}
11
+ {"current_steps": 110, "total_steps": 391, "loss": 0.1774, "lr": 9.076416551997721e-06, "epoch": 0.2813299232736573, "percentage": 28.13, "elapsed_time": "0:04:51", "remaining_time": "0:12:25"}
12
+ {"current_steps": 120, "total_steps": 391, "loss": 0.175, "lr": 8.801303129827352e-06, "epoch": 0.3069053708439898, "percentage": 30.69, "elapsed_time": "0:05:19", "remaining_time": "0:12:00"}
13
+ {"current_steps": 130, "total_steps": 391, "loss": 0.1688, "lr": 8.495757877643857e-06, "epoch": 0.33248081841432225, "percentage": 33.25, "elapsed_time": "0:05:47", "remaining_time": "0:11:37"}
14
+ {"current_steps": 140, "total_steps": 391, "loss": 0.1679, "lr": 8.162226877976886e-06, "epoch": 0.35805626598465473, "percentage": 35.81, "elapsed_time": "0:06:12", "remaining_time": "0:11:08"}
15
+ {"current_steps": 150, "total_steps": 391, "loss": 0.1659, "lr": 7.803380256922495e-06, "epoch": 0.3836317135549872, "percentage": 38.36, "elapsed_time": "0:06:38", "remaining_time": "0:10:39"}
16
+ {"current_steps": 160, "total_steps": 391, "loss": 0.1614, "lr": 7.422090808099014e-06, "epoch": 0.4092071611253197, "percentage": 40.92, "elapsed_time": "0:07:02", "remaining_time": "0:10:10"}
17
+ {"current_steps": 170, "total_steps": 391, "loss": 0.1658, "lr": 7.021410994121525e-06, "epoch": 0.43478260869565216, "percentage": 43.48, "elapsed_time": "0:07:32", "remaining_time": "0:09:48"}
18
+ {"current_steps": 180, "total_steps": 391, "loss": 0.162, "lr": 6.6045485097126585e-06, "epoch": 0.46035805626598464, "percentage": 46.04, "elapsed_time": "0:07:59", "remaining_time": "0:09:21"}
19
+ {"current_steps": 190, "total_steps": 391, "loss": 0.1603, "lr": 6.1748406020824115e-06, "epoch": 0.4859335038363171, "percentage": 48.59, "elapsed_time": "0:08:28", "remaining_time": "0:08:57"}
20
+ {"current_steps": 200, "total_steps": 391, "loss": 0.1607, "lr": 5.735727354158581e-06, "epoch": 0.5115089514066496, "percentage": 51.15, "elapsed_time": "0:08:54", "remaining_time": "0:08:29"}
21
+ {"current_steps": 210, "total_steps": 391, "loss": 0.1566, "lr": 5.290724144552379e-06, "epoch": 0.5370843989769821, "percentage": 53.71, "elapsed_time": "0:09:22", "remaining_time": "0:08:04"}
22
+ {"current_steps": 220, "total_steps": 391, "loss": 0.1617, "lr": 4.8433935047346e-06, "epoch": 0.5626598465473146, "percentage": 56.27, "elapsed_time": "0:09:50", "remaining_time": "0:07:39"}
23
+ {"current_steps": 230, "total_steps": 391, "loss": 0.1577, "lr": 4.397316598723385e-06, "epoch": 0.5882352941176471, "percentage": 58.82, "elapsed_time": "0:10:15", "remaining_time": "0:07:10"}
24
+ {"current_steps": 240, "total_steps": 391, "loss": 0.1595, "lr": 3.956064553606708e-06, "epoch": 0.6138107416879796, "percentage": 61.38, "elapsed_time": "0:10:41", "remaining_time": "0:06:43"}
25
+ {"current_steps": 250, "total_steps": 391, "loss": 0.154, "lr": 3.523169870416795e-06, "epoch": 0.639386189258312, "percentage": 63.94, "elapsed_time": "0:11:09", "remaining_time": "0:06:17"}
26
+ {"current_steps": 260, "total_steps": 391, "loss": 0.1531, "lr": 3.1020981442305187e-06, "epoch": 0.6649616368286445, "percentage": 66.5, "elapsed_time": "0:11:36", "remaining_time": "0:05:50"}
27
+ {"current_steps": 270, "total_steps": 391, "loss": 0.1579, "lr": 2.6962203198941587e-06, "epoch": 0.690537084398977, "percentage": 69.05, "elapsed_time": "0:12:01", "remaining_time": "0:05:23"}
28
+ {"current_steps": 280, "total_steps": 391, "loss": 0.1586, "lr": 2.308785705482982e-06, "epoch": 0.7161125319693095, "percentage": 71.61, "elapsed_time": "0:12:25", "remaining_time": "0:04:55"}
29
+ {"current_steps": 290, "total_steps": 391, "loss": 0.1575, "lr": 1.942895959539939e-06, "epoch": 0.7416879795396419, "percentage": 74.17, "elapsed_time": "0:12:52", "remaining_time": "0:04:28"}
30
+ {"current_steps": 300, "total_steps": 391, "loss": 0.1553, "lr": 1.6014802603420044e-06, "epoch": 0.7672634271099744, "percentage": 76.73, "elapsed_time": "0:13:22", "remaining_time": "0:04:03"}
31
+ {"current_steps": 310, "total_steps": 391, "loss": 0.1532, "lr": 1.2872718559798852e-06, "epoch": 0.7928388746803069, "percentage": 79.28, "elapsed_time": "0:13:47", "remaining_time": "0:03:36"}
32
+ {"current_steps": 320, "total_steps": 391, "loss": 0.1562, "lr": 1.0027861829824953e-06, "epoch": 0.8184143222506394, "percentage": 81.84, "elapsed_time": "0:14:12", "remaining_time": "0:03:09"}
33
+ {"current_steps": 330, "total_steps": 391, "loss": 0.155, "lr": 7.50300728660407e-07, "epoch": 0.8439897698209718, "percentage": 84.4, "elapsed_time": "0:14:38", "remaining_time": "0:02:42"}
34
+ {"current_steps": 340, "total_steps": 391, "loss": 0.1523, "lr": 5.318367983829393e-07, "epoch": 0.8695652173913043, "percentage": 86.96, "elapsed_time": "0:15:07", "remaining_time": "0:02:16"}
35
+ {"current_steps": 350, "total_steps": 391, "loss": 0.1521, "lr": 3.49143333753309e-07, "epoch": 0.8951406649616368, "percentage": 89.51, "elapsed_time": "0:15:32", "remaining_time": "0:01:49"}
36
+ {"current_steps": 360, "total_steps": 391, "loss": 0.1523, "lr": 2.0368291122759898e-07, "epoch": 0.9207161125319693, "percentage": 92.07, "elapsed_time": "0:15:59", "remaining_time": "0:01:22"}
37
+ {"current_steps": 370, "total_steps": 391, "loss": 0.1514, "lr": 9.662003326740166e-08, "epoch": 0.9462915601023018, "percentage": 94.63, "elapsed_time": "0:16:27", "remaining_time": "0:00:56"}
38
+ {"current_steps": 380, "total_steps": 391, "loss": 0.1501, "lr": 2.8811805762860578e-08, "epoch": 0.9718670076726342, "percentage": 97.19, "elapsed_time": "0:16:55", "remaining_time": "0:00:29"}
39
+ {"current_steps": 390, "total_steps": 391, "loss": 0.152, "lr": 8.010763592264381e-10, "epoch": 0.9974424552429667, "percentage": 99.74, "elapsed_time": "0:17:19", "remaining_time": "0:00:02"}
40
+ {"current_steps": 391, "total_steps": 391, "epoch": 1.0, "percentage": 100.0, "elapsed_time": "0:17:49", "remaining_time": "0:00:00"}
trainer_state.json ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 391,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.02557544757033248,
14
+ "grad_norm": 5.1979091027844655,
15
+ "learning_rate": 2.25e-06,
16
+ "loss": 0.5442,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.05115089514066496,
21
+ "grad_norm": 1.555199471497413,
22
+ "learning_rate": 4.75e-06,
23
+ "loss": 0.4196,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.07672634271099744,
28
+ "grad_norm": 0.8843964610571793,
29
+ "learning_rate": 7.25e-06,
30
+ "loss": 0.2928,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.10230179028132992,
35
+ "grad_norm": 0.9254467392549297,
36
+ "learning_rate": 9.75e-06,
37
+ "loss": 0.2508,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.1278772378516624,
42
+ "grad_norm": 0.9595475930941818,
43
+ "learning_rate": 9.983786540671052e-06,
44
+ "loss": 0.2316,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.1534526854219949,
49
+ "grad_norm": 0.8734055150087038,
50
+ "learning_rate": 9.927874998629714e-06,
51
+ "loss": 0.2198,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.17902813299232737,
56
+ "grad_norm": 0.8904948291987135,
57
+ "learning_rate": 9.83251270794707e-06,
58
+ "loss": 0.2047,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.20460358056265984,
63
+ "grad_norm": 0.8941187964853939,
64
+ "learning_rate": 9.698463103929542e-06,
65
+ "loss": 0.2058,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.23017902813299232,
70
+ "grad_norm": 0.8232662082320363,
71
+ "learning_rate": 9.526799338236828e-06,
72
+ "loss": 0.1903,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.2557544757033248,
77
+ "grad_norm": 0.8507117163294342,
78
+ "learning_rate": 9.318895687625752e-06,
79
+ "loss": 0.1835,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.2813299232736573,
84
+ "grad_norm": 0.7198977204875759,
85
+ "learning_rate": 9.076416551997721e-06,
86
+ "loss": 0.1774,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.3069053708439898,
91
+ "grad_norm": 0.6072199252093429,
92
+ "learning_rate": 8.801303129827352e-06,
93
+ "loss": 0.175,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.33248081841432225,
98
+ "grad_norm": 0.552937855243384,
99
+ "learning_rate": 8.495757877643857e-06,
100
+ "loss": 0.1688,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.35805626598465473,
105
+ "grad_norm": 0.5078356984647172,
106
+ "learning_rate": 8.162226877976886e-06,
107
+ "loss": 0.1679,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.3836317135549872,
112
+ "grad_norm": 0.4569791962942034,
113
+ "learning_rate": 7.803380256922495e-06,
114
+ "loss": 0.1659,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.4092071611253197,
119
+ "grad_norm": 0.43754448097194093,
120
+ "learning_rate": 7.422090808099014e-06,
121
+ "loss": 0.1614,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.43478260869565216,
126
+ "grad_norm": 0.3965482560875204,
127
+ "learning_rate": 7.021410994121525e-06,
128
+ "loss": 0.1658,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.46035805626598464,
133
+ "grad_norm": 0.40365524431178995,
134
+ "learning_rate": 6.6045485097126585e-06,
135
+ "loss": 0.162,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.4859335038363171,
140
+ "grad_norm": 0.38726639073470187,
141
+ "learning_rate": 6.1748406020824115e-06,
142
+ "loss": 0.1603,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.5115089514066496,
147
+ "grad_norm": 0.3760876442217351,
148
+ "learning_rate": 5.735727354158581e-06,
149
+ "loss": 0.1607,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.5370843989769821,
154
+ "grad_norm": 0.3698546651087282,
155
+ "learning_rate": 5.290724144552379e-06,
156
+ "loss": 0.1566,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.5626598465473146,
161
+ "grad_norm": 0.378775457895615,
162
+ "learning_rate": 4.8433935047346e-06,
163
+ "loss": 0.1617,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.5882352941176471,
168
+ "grad_norm": 0.37700547911055976,
169
+ "learning_rate": 4.397316598723385e-06,
170
+ "loss": 0.1577,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.6138107416879796,
175
+ "grad_norm": 0.34954174520163833,
176
+ "learning_rate": 3.956064553606708e-06,
177
+ "loss": 0.1595,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.639386189258312,
182
+ "grad_norm": 0.357317340726361,
183
+ "learning_rate": 3.523169870416795e-06,
184
+ "loss": 0.154,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.6649616368286445,
189
+ "grad_norm": 0.3684051566901591,
190
+ "learning_rate": 3.1020981442305187e-06,
191
+ "loss": 0.1531,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.690537084398977,
196
+ "grad_norm": 0.3895498086957201,
197
+ "learning_rate": 2.6962203198941587e-06,
198
+ "loss": 0.1579,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.7161125319693095,
203
+ "grad_norm": 0.3731438076456078,
204
+ "learning_rate": 2.308785705482982e-06,
205
+ "loss": 0.1586,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.7416879795396419,
210
+ "grad_norm": 0.41906481058101075,
211
+ "learning_rate": 1.942895959539939e-06,
212
+ "loss": 0.1575,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.7672634271099744,
217
+ "grad_norm": 0.3896314041500002,
218
+ "learning_rate": 1.6014802603420044e-06,
219
+ "loss": 0.1553,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.7928388746803069,
224
+ "grad_norm": 0.385720584527069,
225
+ "learning_rate": 1.2872718559798852e-06,
226
+ "loss": 0.1532,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.8184143222506394,
231
+ "grad_norm": 0.32351933058141824,
232
+ "learning_rate": 1.0027861829824953e-06,
233
+ "loss": 0.1562,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.8439897698209718,
238
+ "grad_norm": 0.3670621953960731,
239
+ "learning_rate": 7.50300728660407e-07,
240
+ "loss": 0.155,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.8695652173913043,
245
+ "grad_norm": 0.3829568746421653,
246
+ "learning_rate": 5.318367983829393e-07,
247
+ "loss": 0.1523,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.8951406649616368,
252
+ "grad_norm": 0.3775945531822611,
253
+ "learning_rate": 3.49143333753309e-07,
254
+ "loss": 0.1521,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.9207161125319693,
259
+ "grad_norm": 0.36031263149371584,
260
+ "learning_rate": 2.0368291122759898e-07,
261
+ "loss": 0.1523,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.9462915601023018,
266
+ "grad_norm": 0.3348863130660584,
267
+ "learning_rate": 9.662003326740166e-08,
268
+ "loss": 0.1514,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.9718670076726342,
273
+ "grad_norm": 0.360326022397128,
274
+ "learning_rate": 2.8811805762860578e-08,
275
+ "loss": 0.1501,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.9974424552429667,
280
+ "grad_norm": 0.37241654765357973,
281
+ "learning_rate": 8.010763592264381e-10,
282
+ "loss": 0.152,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 1.0,
287
+ "step": 391,
288
+ "total_flos": 24007129694208.0,
289
+ "train_loss": 0.18847447641365364,
290
+ "train_runtime": 1067.9437,
291
+ "train_samples_per_second": 46.819,
292
+ "train_steps_per_second": 0.366
293
+ }
294
+ ],
295
+ "logging_steps": 10,
296
+ "max_steps": 391,
297
+ "num_input_tokens_seen": 0,
298
+ "num_train_epochs": 1,
299
+ "save_steps": 500,
300
+ "stateful_callbacks": {
301
+ "TrainerControl": {
302
+ "args": {
303
+ "should_epoch_stop": false,
304
+ "should_evaluate": false,
305
+ "should_log": false,
306
+ "should_save": true,
307
+ "should_training_stop": true
308
+ },
309
+ "attributes": {}
310
+ }
311
+ },
312
+ "total_flos": 24007129694208.0,
313
+ "train_batch_size": 16,
314
+ "trial_name": null,
315
+ "trial_params": null
316
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c95d9ece093116d8e3ae93ffc96a4f1e139a0964f50940249d7df0d4fefc873
3
+ size 7953
training_loss.png ADDED
vocab.json ADDED
The diff for this file is too large to render. See raw diff