KotshinZ committed on
Commit
62723f4
·
verified ·
1 Parent(s): 674527b

End of training

Browse files
README.md CHANGED
@@ -6,6 +6,8 @@ model_name: gpt2-RMT-2
6
  tags:
7
  - generated_from_trainer
8
  - open-r1
 
 
9
  licence: license
10
  ---
11
 
@@ -27,7 +29,7 @@ print(output["generated_text"])
27
 
28
  ## Training procedure
29
 
30
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/s18574s18574-/huggingface/runs/j0nfe37z)
31
 
32
 
33
  This model was trained with SFT.
 
6
  tags:
7
  - generated_from_trainer
8
  - open-r1
9
+ - trl
10
+ - sft
11
  licence: license
12
  ---
13
 
 
29
 
30
  ## Training procedure
31
 
32
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/s18574s18574-/huggingface/runs/lhnwfjsu)
33
 
34
 
35
  This model was trained with SFT.
all_results.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
- "eval_runtime": 0.2685,
3
  "eval_samples": 100,
4
- "eval_samples_per_second": 178.767,
5
- "eval_steps_per_second": 11.173,
6
- "total_flos": 0.0,
7
- "train_loss": 5.5703125,
8
- "train_runtime": 94.29,
9
  "train_samples": 19883,
10
- "train_samples_per_second": 11.211,
11
  "train_steps_per_second": 0.35
12
  }
 
1
  {
2
+ "eval_runtime": 0.2372,
3
  "eval_samples": 100,
4
+ "eval_samples_per_second": 202.381,
5
+ "eval_steps_per_second": 12.649,
6
+ "total_flos": 541940076511232.0,
7
+ "train_loss": 5.5710227272727275,
8
+ "train_runtime": 94.1692,
9
  "train_samples": 19883,
10
+ "train_samples_per_second": 11.226,
11
  "train_steps_per_second": 0.35
12
  }
config.json CHANGED
@@ -1,22 +1,28 @@
1
  {
2
- "_attn_implementation_autoset": true,
3
  "activation_function": "gelu_new",
 
4
  "architectures": [
5
- "GPT2LMHeadModel"
6
  ],
7
  "attn_pdrop": 0.1,
8
  "bos_token_id": 50256,
9
  "embd_pdrop": 0.1,
10
  "eos_token_id": 50256,
11
  "initializer_range": 0.02,
 
 
12
  "layer_norm_epsilon": 1e-05,
13
- "model_type": "gpt2",
 
 
14
  "n_ctx": 1024,
15
  "n_embd": 768,
16
  "n_head": 12,
17
  "n_inner": null,
18
  "n_layer": 12,
19
  "n_positions": 1024,
 
 
20
  "reorder_and_upcast_attn": false,
21
  "resid_pdrop": 0.1,
22
  "scale_attn_by_inverse_layer_idx": false,
 
1
  {
 
2
  "activation_function": "gelu_new",
3
+ "align": "left",
4
  "architectures": [
5
+ "RecurrentWrapper"
6
  ],
7
  "attn_pdrop": 0.1,
8
  "bos_token_id": 50256,
9
  "embd_pdrop": 0.1,
10
  "eos_token_id": 50256,
11
  "initializer_range": 0.02,
12
+ "input_seg_len": 1004,
13
+ "is_memory_all": false,
14
  "layer_norm_epsilon": 1e-05,
15
+ "max_n_segments": 2,
16
+ "memory_size": 10,
17
+ "model_type": "rmt",
18
  "n_ctx": 1024,
19
  "n_embd": 768,
20
  "n_head": 12,
21
  "n_inner": null,
22
  "n_layer": 12,
23
  "n_positions": 1024,
24
+ "num_mem_tokens": 10,
25
+ "output_seg_len": 1004,
26
  "reorder_and_upcast_attn": false,
27
  "resid_pdrop": 0.1,
28
  "scale_attn_by_inverse_layer_idx": false,
eval_results.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "eval_runtime": 0.2685,
3
  "eval_samples": 100,
4
- "eval_samples_per_second": 178.767,
5
- "eval_steps_per_second": 11.173
6
  }
 
1
  {
2
+ "eval_runtime": 0.2372,
3
  "eval_samples": 100,
4
+ "eval_samples_per_second": 202.381,
5
+ "eval_steps_per_second": 12.649
6
  }
generation_config.json CHANGED
@@ -2,5 +2,6 @@
2
  "_from_model_config": true,
3
  "bos_token_id": 50256,
4
  "eos_token_id": 50256,
5
- "transformers_version": "4.50.0.dev0"
 
6
  }
 
2
  "_from_model_config": true,
3
  "bos_token_id": 50256,
4
  "eos_token_id": 50256,
5
+ "transformers_version": "4.50.0.dev0",
6
+ "use_cache": false
7
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ddafb83cf1a59157af80d355275bd09239a37d622ab84d9c4df9fd1bb357cbce
3
- size 326089656
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f2c2cb3ce04aca9a075cf3e13f0f9653bf4b8a7da1160df52d27a2d3ee0ceb3
3
+ size 248912768
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "total_flos": 0.0,
3
- "train_loss": 5.5703125,
4
- "train_runtime": 94.29,
5
  "train_samples": 19883,
6
- "train_samples_per_second": 11.211,
7
  "train_steps_per_second": 0.35
8
  }
 
1
  {
2
+ "total_flos": 541940076511232.0,
3
+ "train_loss": 5.5710227272727275,
4
+ "train_runtime": 94.1692,
5
  "train_samples": 19883,
6
+ "train_samples_per_second": 11.226,
7
  "train_steps_per_second": 0.35
8
  }
trainer_state.json CHANGED
@@ -10,36 +10,36 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.030257186081694403,
13
- "grad_norm": 23.19076749983313,
14
  "learning_rate": 1.796093065705644e-05,
15
- "loss": 6.8891,
16
- "mean_token_accuracy": 0.14328600466251373,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.060514372163388806,
21
- "grad_norm": 10.552981359945921,
22
  "learning_rate": 8.382180034472353e-06,
23
- "loss": 5.3461,
24
- "mean_token_accuracy": 0.1898418039083481,
25
  "step": 20
26
  },
27
  {
28
  "epoch": 0.0907715582450832,
29
- "grad_norm": 8.191712743407091,
30
  "learning_rate": 5.234682881719766e-07,
31
- "loss": 4.75,
32
- "mean_token_accuracy": 0.2335878863930702,
33
  "step": 30
34
  },
35
  {
36
  "epoch": 0.09984871406959153,
37
- "mean_token_accuracy": 0.23857259502013525,
38
  "step": 33,
39
- "total_flos": 0.0,
40
- "train_loss": 5.5703125,
41
- "train_runtime": 94.29,
42
- "train_samples_per_second": 11.211,
43
  "train_steps_per_second": 0.35
44
  }
45
  ],
@@ -60,7 +60,7 @@
60
  "attributes": {}
61
  }
62
  },
63
- "total_flos": 0.0,
64
  "train_batch_size": 8,
65
  "trial_name": null,
66
  "trial_params": null
 
10
  "log_history": [
11
  {
12
  "epoch": 0.030257186081694403,
13
+ "grad_norm": 19.446511460556057,
14
  "learning_rate": 1.796093065705644e-05,
15
+ "loss": 6.9188,
16
+ "mean_token_accuracy": 0.14323783591389655,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.060514372163388806,
21
+ "grad_norm": 10.827592460467098,
22
  "learning_rate": 8.382180034472353e-06,
23
+ "loss": 5.3359,
24
+ "mean_token_accuracy": 0.19127428904175758,
25
  "step": 20
26
  },
27
  {
28
  "epoch": 0.0907715582450832,
29
+ "grad_norm": 8.615023934478028,
30
  "learning_rate": 5.234682881719766e-07,
31
+ "loss": 4.7375,
32
+ "mean_token_accuracy": 0.2355961874127388,
33
  "step": 30
34
  },
35
  {
36
  "epoch": 0.09984871406959153,
37
+ "mean_token_accuracy": 0.2397766982515653,
38
  "step": 33,
39
+ "total_flos": 541940076511232.0,
40
+ "train_loss": 5.5710227272727275,
41
+ "train_runtime": 94.1692,
42
+ "train_samples_per_second": 11.226,
43
  "train_steps_per_second": 0.35
44
  }
45
  ],
 
60
  "attributes": {}
61
  }
62
  },
63
+ "total_flos": 541940076511232.0,
64
  "train_batch_size": 8,
65
  "trial_name": null,
66
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f5fa711d6cdcb0b2b63e61e4b8e967fec2988c80fb803ba46477961ca838fb4d
3
  size 7352
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1d79e0dd26daa466cbf9b51285fc1b10493e4f85980a882ccde209f5af06f5c
3
  size 7352