KotshinZ committed on
Commit
aa7b490
·
verified ·
1 Parent(s): 62723f4

End of training

Browse files
README.md CHANGED
@@ -29,7 +29,7 @@ print(output["generated_text"])
29
 
30
  ## Training procedure
31
 
32
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/s18574s18574-/huggingface/runs/lhnwfjsu)
33
 
34
 
35
  This model was trained with SFT.
 
29
 
30
  ## Training procedure
31
 
32
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/s18574s18574-/huggingface/runs/5p39svg5)
33
 
34
 
35
  This model was trained with SFT.
all_results.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
- "eval_runtime": 0.2372,
3
  "eval_samples": 100,
4
- "eval_samples_per_second": 202.381,
5
- "eval_steps_per_second": 12.649,
6
  "total_flos": 541940076511232.0,
7
- "train_loss": 5.5710227272727275,
8
- "train_runtime": 94.1692,
9
  "train_samples": 19883,
10
- "train_samples_per_second": 11.226,
11
  "train_steps_per_second": 0.35
12
  }
 
1
  {
2
+ "eval_runtime": 0.2379,
3
  "eval_samples": 100,
4
+ "eval_samples_per_second": 201.753,
5
+ "eval_steps_per_second": 12.61,
6
  "total_flos": 541940076511232.0,
7
+ "train_loss": 11.106060606060606,
8
+ "train_runtime": 94.1702,
9
  "train_samples": 19883,
10
+ "train_samples_per_second": 11.225,
11
  "train_steps_per_second": 0.35
12
  }
config.json CHANGED
@@ -2,9 +2,103 @@
2
  "activation_function": "gelu_new",
3
  "align": "left",
4
  "architectures": [
5
- "RecurrentWrapper"
6
  ],
7
  "attn_pdrop": 0.1,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  "bos_token_id": 50256,
9
  "embd_pdrop": 0.1,
10
  "eos_token_id": 50256,
 
2
  "activation_function": "gelu_new",
3
  "align": "left",
4
  "architectures": [
5
+ "RecurrentMemoryTransformer"
6
  ],
7
  "attn_pdrop": 0.1,
8
+ "base_model_config": {
9
+ "_attn_implementation_autoset": true,
10
+ "_name_or_path": "openai-community/gpt2",
11
+ "activation_function": "gelu_new",
12
+ "add_cross_attention": false,
13
+ "architectures": [
14
+ "GPT2LMHeadModel"
15
+ ],
16
+ "attn_pdrop": 0.1,
17
+ "bad_words_ids": null,
18
+ "begin_suppress_tokens": null,
19
+ "bos_token_id": 50256,
20
+ "chunk_size_feed_forward": 0,
21
+ "cross_attention_hidden_size": null,
22
+ "decoder_start_token_id": null,
23
+ "diversity_penalty": 0.0,
24
+ "do_sample": false,
25
+ "early_stopping": false,
26
+ "embd_pdrop": 0.1,
27
+ "encoder_no_repeat_ngram_size": 0,
28
+ "eos_token_id": 50256,
29
+ "exponential_decay_length_penalty": null,
30
+ "finetuning_task": null,
31
+ "forced_bos_token_id": null,
32
+ "forced_eos_token_id": null,
33
+ "id2label": {
34
+ "0": "LABEL_0",
35
+ "1": "LABEL_1"
36
+ },
37
+ "initializer_range": 0.02,
38
+ "is_decoder": false,
39
+ "is_encoder_decoder": false,
40
+ "label2id": {
41
+ "LABEL_0": 0,
42
+ "LABEL_1": 1
43
+ },
44
+ "layer_norm_epsilon": 1e-05,
45
+ "length_penalty": 1.0,
46
+ "max_length": 20,
47
+ "min_length": 0,
48
+ "model_type": "gpt2",
49
+ "n_ctx": 1024,
50
+ "n_embd": 768,
51
+ "n_head": 12,
52
+ "n_inner": null,
53
+ "n_layer": 12,
54
+ "n_positions": 1024,
55
+ "no_repeat_ngram_size": 0,
56
+ "num_beam_groups": 1,
57
+ "num_beams": 1,
58
+ "num_return_sequences": 1,
59
+ "output_attentions": false,
60
+ "output_hidden_states": false,
61
+ "output_scores": false,
62
+ "pad_token_id": null,
63
+ "prefix": null,
64
+ "problem_type": null,
65
+ "pruned_heads": {},
66
+ "remove_invalid_values": false,
67
+ "reorder_and_upcast_attn": false,
68
+ "repetition_penalty": 1.0,
69
+ "resid_pdrop": 0.1,
70
+ "return_dict": true,
71
+ "return_dict_in_generate": false,
72
+ "scale_attn_by_inverse_layer_idx": false,
73
+ "scale_attn_weights": true,
74
+ "sep_token_id": null,
75
+ "summary_activation": null,
76
+ "summary_first_dropout": 0.1,
77
+ "summary_proj_to_labels": true,
78
+ "summary_type": "cls_index",
79
+ "summary_use_proj": true,
80
+ "suppress_tokens": null,
81
+ "task_specific_params": {
82
+ "text-generation": {
83
+ "do_sample": true,
84
+ "max_length": 50
85
+ }
86
+ },
87
+ "temperature": 1.0,
88
+ "tf_legacy_loss": false,
89
+ "tie_encoder_decoder": false,
90
+ "tie_word_embeddings": true,
91
+ "tokenizer_class": null,
92
+ "top_k": 50,
93
+ "top_p": 1.0,
94
+ "torch_dtype": "bfloat16",
95
+ "torchscript": false,
96
+ "typical_p": 1.0,
97
+ "use_bfloat16": false,
98
+ "use_cache": false,
99
+ "vocab_size": 50257
100
+ },
101
+ "base_model_type": "gpt2",
102
  "bos_token_id": 50256,
103
  "embd_pdrop": 0.1,
104
  "eos_token_id": 50256,
eval_results.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "eval_runtime": 0.2372,
3
  "eval_samples": 100,
4
- "eval_samples_per_second": 202.381,
5
- "eval_steps_per_second": 12.649
6
  }
 
1
  {
2
+ "eval_runtime": 0.2379,
3
  "eval_samples": 100,
4
+ "eval_samples_per_second": 201.753,
5
+ "eval_steps_per_second": 12.61
6
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9f2c2cb3ce04aca9a075cf3e13f0f9653bf4b8a7da1160df52d27a2d3ee0ceb3
3
- size 248912768
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a151f2bd4439693232966936e8838ed7a0f3181ee2be269d71c0f7dc5c91300
3
+ size 248915448
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 541940076511232.0,
3
- "train_loss": 5.5710227272727275,
4
- "train_runtime": 94.1692,
5
  "train_samples": 19883,
6
- "train_samples_per_second": 11.226,
7
  "train_steps_per_second": 0.35
8
  }
 
1
  {
2
  "total_flos": 541940076511232.0,
3
+ "train_loss": 11.106060606060606,
4
+ "train_runtime": 94.1702,
5
  "train_samples": 19883,
6
+ "train_samples_per_second": 11.225,
7
  "train_steps_per_second": 0.35
8
  }
trainer_state.json CHANGED
@@ -10,36 +10,36 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.030257186081694403,
13
- "grad_norm": 19.446511460556057,
14
  "learning_rate": 1.796093065705644e-05,
15
- "loss": 6.9188,
16
- "mean_token_accuracy": 0.14323783591389655,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.060514372163388806,
21
- "grad_norm": 10.827592460467098,
22
  "learning_rate": 8.382180034472353e-06,
23
- "loss": 5.3359,
24
- "mean_token_accuracy": 0.19127428904175758,
25
  "step": 20
26
  },
27
  {
28
  "epoch": 0.0907715582450832,
29
- "grad_norm": 8.615023934478028,
30
  "learning_rate": 5.234682881719766e-07,
31
- "loss": 4.7375,
32
- "mean_token_accuracy": 0.2355961874127388,
33
  "step": 30
34
  },
35
  {
36
  "epoch": 0.09984871406959153,
37
- "mean_token_accuracy": 0.2397766982515653,
38
  "step": 33,
39
  "total_flos": 541940076511232.0,
40
- "train_loss": 5.5710227272727275,
41
- "train_runtime": 94.1692,
42
- "train_samples_per_second": 11.226,
43
  "train_steps_per_second": 0.35
44
  }
45
  ],
 
10
  "log_history": [
11
  {
12
  "epoch": 0.030257186081694403,
13
+ "grad_norm": 35.22744368526962,
14
  "learning_rate": 1.796093065705644e-05,
15
+ "loss": 13.7766,
16
+ "mean_token_accuracy": 0.1436693400144577,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.060514372163388806,
21
+ "grad_norm": 23.6320168043645,
22
  "learning_rate": 8.382180034472353e-06,
23
+ "loss": 10.65,
24
+ "mean_token_accuracy": 0.19171493500471115,
25
  "step": 20
26
  },
27
  {
28
  "epoch": 0.0907715582450832,
29
+ "grad_norm": 15.21905964248453,
30
  "learning_rate": 5.234682881719766e-07,
31
+ "loss": 9.4422,
32
+ "mean_token_accuracy": 0.23619858622550965,
33
  "step": 30
34
  },
35
  {
36
  "epoch": 0.09984871406959153,
37
+ "mean_token_accuracy": 0.23997420817613602,
38
  "step": 33,
39
  "total_flos": 541940076511232.0,
40
+ "train_loss": 11.106060606060606,
41
+ "train_runtime": 94.1702,
42
+ "train_samples_per_second": 11.225,
43
  "train_steps_per_second": 0.35
44
  }
45
  ],
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a1d79e0dd26daa466cbf9b51285fc1b10493e4f85980a882ccde209f5af06f5c
3
  size 7352
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edbeaeb7aa6557be5c809a22a09614c50689d24c266e9e86dfcd26bb1fbec880
3
  size 7352