meekre36 commited on
Commit
3f281f9
·
verified ·
1 Parent(s): 078ed13

Upload 50 files

Browse files
Files changed (50) hide show
  1. gpt2_from_scratch_12layer/checkpoint-1000/config.json +35 -0
  2. gpt2_from_scratch_12layer/checkpoint-1000/generation_config.json +9 -0
  3. gpt2_from_scratch_12layer/checkpoint-1000/model.safetensors +3 -0
  4. gpt2_from_scratch_12layer/checkpoint-1000/optimizer.pt +3 -0
  5. gpt2_from_scratch_12layer/checkpoint-1000/rng_state.pth +3 -0
  6. gpt2_from_scratch_12layer/checkpoint-1000/scaler.pt +3 -0
  7. gpt2_from_scratch_12layer/checkpoint-1000/scheduler.pt +3 -0
  8. gpt2_from_scratch_12layer/checkpoint-1000/tokenizer.json +0 -0
  9. gpt2_from_scratch_12layer/checkpoint-1000/tokenizer_config.json +9 -0
  10. gpt2_from_scratch_12layer/checkpoint-1000/trainer_state.json +104 -0
  11. gpt2_from_scratch_12layer/checkpoint-1000/training_args.bin +3 -0
  12. gpt2_from_scratch_12layer/checkpoint-2000/config.json +35 -0
  13. gpt2_from_scratch_12layer/checkpoint-2000/generation_config.json +9 -0
  14. gpt2_from_scratch_12layer/checkpoint-2000/model.safetensors +3 -0
  15. gpt2_from_scratch_12layer/checkpoint-2000/optimizer.pt +3 -0
  16. gpt2_from_scratch_12layer/checkpoint-2000/rng_state.pth +3 -0
  17. gpt2_from_scratch_12layer/checkpoint-2000/scaler.pt +3 -0
  18. gpt2_from_scratch_12layer/checkpoint-2000/scheduler.pt +3 -0
  19. gpt2_from_scratch_12layer/checkpoint-2000/tokenizer.json +0 -0
  20. gpt2_from_scratch_12layer/checkpoint-2000/tokenizer_config.json +9 -0
  21. gpt2_from_scratch_12layer/checkpoint-2000/trainer_state.json +174 -0
  22. gpt2_from_scratch_12layer/checkpoint-2000/training_args.bin +3 -0
  23. gpt2_from_scratch_12layer/checkpoint-3000/config.json +35 -0
  24. gpt2_from_scratch_12layer/checkpoint-3000/generation_config.json +9 -0
  25. gpt2_from_scratch_12layer/checkpoint-3000/model.safetensors +3 -0
  26. gpt2_from_scratch_12layer/checkpoint-3000/optimizer.pt +3 -0
  27. gpt2_from_scratch_12layer/checkpoint-3000/rng_state.pth +3 -0
  28. gpt2_from_scratch_12layer/checkpoint-3000/scaler.pt +3 -0
  29. gpt2_from_scratch_12layer/checkpoint-3000/scheduler.pt +3 -0
  30. gpt2_from_scratch_12layer/checkpoint-3000/tokenizer.json +0 -0
  31. gpt2_from_scratch_12layer/checkpoint-3000/tokenizer_config.json +9 -0
  32. gpt2_from_scratch_12layer/checkpoint-3000/trainer_state.json +244 -0
  33. gpt2_from_scratch_12layer/checkpoint-3000/training_args.bin +3 -0
  34. gpt2_from_scratch_12layer/checkpoint-3406/config.json +35 -0
  35. gpt2_from_scratch_12layer/checkpoint-3406/generation_config.json +9 -0
  36. gpt2_from_scratch_12layer/checkpoint-3406/model.safetensors +3 -0
  37. gpt2_from_scratch_12layer/checkpoint-3406/optimizer.pt +3 -0
  38. gpt2_from_scratch_12layer/checkpoint-3406/rng_state.pth +3 -0
  39. gpt2_from_scratch_12layer/checkpoint-3406/scaler.pt +3 -0
  40. gpt2_from_scratch_12layer/checkpoint-3406/scheduler.pt +3 -0
  41. gpt2_from_scratch_12layer/checkpoint-3406/tokenizer.json +0 -0
  42. gpt2_from_scratch_12layer/checkpoint-3406/tokenizer_config.json +9 -0
  43. gpt2_from_scratch_12layer/checkpoint-3406/trainer_state.json +272 -0
  44. gpt2_from_scratch_12layer/checkpoint-3406/training_args.bin +3 -0
  45. gpt2_from_scratch_12layer/config.json +35 -0
  46. gpt2_from_scratch_12layer/generation_config.json +9 -0
  47. gpt2_from_scratch_12layer/model.safetensors +3 -0
  48. gpt2_from_scratch_12layer/tokenizer.json +0 -0
  49. gpt2_from_scratch_12layer/tokenizer_config.json +9 -0
  50. gpt2_from_scratch_12layer/training_args.bin +3 -0
gpt2_from_scratch_12layer/checkpoint-1000/config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "add_cross_attention": false,
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "dtype": "float32",
10
+ "embd_pdrop": 0.1,
11
+ "eos_token_id": 50256,
12
+ "initializer_range": 0.02,
13
+ "layer_norm_epsilon": 1e-05,
14
+ "model_type": "gpt2",
15
+ "n_ctx": 1024,
16
+ "n_embd": 768,
17
+ "n_head": 12,
18
+ "n_inner": null,
19
+ "n_layer": 12,
20
+ "n_positions": 1024,
21
+ "pad_token_id": null,
22
+ "reorder_and_upcast_attn": false,
23
+ "resid_pdrop": 0.1,
24
+ "scale_attn_by_inverse_layer_idx": false,
25
+ "scale_attn_weights": true,
26
+ "summary_activation": null,
27
+ "summary_first_dropout": 0.1,
28
+ "summary_proj_to_labels": true,
29
+ "summary_type": "cls_index",
30
+ "summary_use_proj": true,
31
+ "tie_word_embeddings": true,
32
+ "transformers_version": "5.0.0",
33
+ "use_cache": false,
34
+ "vocab_size": 50257
35
+ }
gpt2_from_scratch_12layer/checkpoint-1000/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "output_attentions": false,
6
+ "output_hidden_states": false,
7
+ "transformers_version": "5.0.0",
8
+ "use_cache": true
9
+ }
gpt2_from_scratch_12layer/checkpoint-1000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b23254f47a6b7d6396d5cd5009f7b4e97c808f10c54cc13a522fc80a40a6f914
3
+ size 497774208
gpt2_from_scratch_12layer/checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1d31359d42cea3697b2d87c673ef4d4a81a0834d8d6163b209575a6d00bac41
3
+ size 995642298
gpt2_from_scratch_12layer/checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28325bd1f1d721c530c7ba38b64e73cb2cf1fdad7c3357d638f67a67744c8645
3
+ size 14244
gpt2_from_scratch_12layer/checkpoint-1000/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d8fdcd0311eba9854fff738038ed4c1a269832665b4d88ba4e4e3d02a1a7e0e
3
+ size 988
gpt2_from_scratch_12layer/checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04a74a909d336b7124436e4cb0278258b381fc72bf5b206e1c024e4444ff4f32
3
+ size 1064
gpt2_from_scratch_12layer/checkpoint-1000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
gpt2_from_scratch_12layer/checkpoint-1000/tokenizer_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "model_max_length": 1000000000000000019884624838656,
6
+ "pad_token": "<pad>",
7
+ "tokenizer_class": "TokenizersBackend",
8
+ "unk_token": "<unk>"
9
+ }
gpt2_from_scratch_12layer/checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.2936641949930255,
6
+ "eval_steps": 500,
7
+ "global_step": 1000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.02936641949930255,
14
+ "grad_norm": 2.2842259407043457,
15
+ "learning_rate": 2.4750000000000004e-06,
16
+ "loss": 10.393255615234375,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.0587328389986051,
21
+ "grad_norm": 1.976091980934143,
22
+ "learning_rate": 4.975000000000001e-06,
23
+ "loss": 9.357327270507813,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.08809925849790765,
28
+ "grad_norm": 1.6418145895004272,
29
+ "learning_rate": 7.4750000000000004e-06,
30
+ "loss": 8.744969482421874,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.1174656779972102,
35
+ "grad_norm": 1.1453146934509277,
36
+ "learning_rate": 9.975e-06,
37
+ "loss": 8.003826904296876,
38
+ "step": 400
39
+ },
40
+ {
41
+ "epoch": 0.14683209749651274,
42
+ "grad_norm": 0.6994723677635193,
43
+ "learning_rate": 1.2475e-05,
44
+ "loss": 7.452492065429688,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 0.1761985169958153,
49
+ "grad_norm": 0.4603072702884674,
50
+ "learning_rate": 1.4975e-05,
51
+ "loss": 7.1382373046875,
52
+ "step": 600
53
+ },
54
+ {
55
+ "epoch": 0.20556493649511784,
56
+ "grad_norm": 0.4629450738430023,
57
+ "learning_rate": 1.7475e-05,
58
+ "loss": 6.968035888671875,
59
+ "step": 700
60
+ },
61
+ {
62
+ "epoch": 0.2349313559944204,
63
+ "grad_norm": 0.5266813635826111,
64
+ "learning_rate": 1.9975e-05,
65
+ "loss": 6.8181103515625,
66
+ "step": 800
67
+ },
68
+ {
69
+ "epoch": 0.2642977754937229,
70
+ "grad_norm": 0.5502268671989441,
71
+ "learning_rate": 2.2475e-05,
72
+ "loss": 6.682680053710937,
73
+ "step": 900
74
+ },
75
+ {
76
+ "epoch": 0.2936641949930255,
77
+ "grad_norm": 0.537894606590271,
78
+ "learning_rate": 2.4975e-05,
79
+ "loss": 6.568981323242188,
80
+ "step": 1000
81
+ }
82
+ ],
83
+ "logging_steps": 100,
84
+ "max_steps": 3406,
85
+ "num_input_tokens_seen": 0,
86
+ "num_train_epochs": 1,
87
+ "save_steps": 1000,
88
+ "stateful_callbacks": {
89
+ "TrainerControl": {
90
+ "args": {
91
+ "should_epoch_stop": false,
92
+ "should_evaluate": false,
93
+ "should_log": false,
94
+ "should_save": true,
95
+ "should_training_stop": false
96
+ },
97
+ "attributes": {}
98
+ }
99
+ },
100
+ "total_flos": 1.6722690048e+16,
101
+ "train_batch_size": 4,
102
+ "trial_name": null,
103
+ "trial_params": null
104
+ }
gpt2_from_scratch_12layer/checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7365dbdab00faac01ab21cf58f54309413f9c58f1fdc95c4ee7a1e881ad0856d
3
+ size 4728
gpt2_from_scratch_12layer/checkpoint-2000/config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "add_cross_attention": false,
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "dtype": "float32",
10
+ "embd_pdrop": 0.1,
11
+ "eos_token_id": 50256,
12
+ "initializer_range": 0.02,
13
+ "layer_norm_epsilon": 1e-05,
14
+ "model_type": "gpt2",
15
+ "n_ctx": 1024,
16
+ "n_embd": 768,
17
+ "n_head": 12,
18
+ "n_inner": null,
19
+ "n_layer": 12,
20
+ "n_positions": 1024,
21
+ "pad_token_id": null,
22
+ "reorder_and_upcast_attn": false,
23
+ "resid_pdrop": 0.1,
24
+ "scale_attn_by_inverse_layer_idx": false,
25
+ "scale_attn_weights": true,
26
+ "summary_activation": null,
27
+ "summary_first_dropout": 0.1,
28
+ "summary_proj_to_labels": true,
29
+ "summary_type": "cls_index",
30
+ "summary_use_proj": true,
31
+ "tie_word_embeddings": true,
32
+ "transformers_version": "5.0.0",
33
+ "use_cache": false,
34
+ "vocab_size": 50257
35
+ }
gpt2_from_scratch_12layer/checkpoint-2000/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "output_attentions": false,
6
+ "output_hidden_states": false,
7
+ "transformers_version": "5.0.0",
8
+ "use_cache": true
9
+ }
gpt2_from_scratch_12layer/checkpoint-2000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:043f01835a7fc728da107643ad158db644191333dbde9b03bb14fc00283f9960
3
+ size 497774208
gpt2_from_scratch_12layer/checkpoint-2000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e237c1420cfb71c0b92f02f0a00096093dd8b9b8d0c31f9e977a185ce40b82f3
3
+ size 995642298
gpt2_from_scratch_12layer/checkpoint-2000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f95a2c63a0eaf9d82cb93bfc10ea04f73b732c5d5ce79e4bff972ef3f9449c92
3
+ size 14244
gpt2_from_scratch_12layer/checkpoint-2000/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c50a9cebe5d66d453d25b140738bff479749ac03e0a43597d8776bc22f6ed0c
3
+ size 988
gpt2_from_scratch_12layer/checkpoint-2000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3ea9d13baff2282d300ceb3c3984a3388d1450303ffc8640c73967fa3325903
3
+ size 1064
gpt2_from_scratch_12layer/checkpoint-2000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
gpt2_from_scratch_12layer/checkpoint-2000/tokenizer_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "model_max_length": 1000000000000000019884624838656,
6
+ "pad_token": "<pad>",
7
+ "tokenizer_class": "TokenizersBackend",
8
+ "unk_token": "<unk>"
9
+ }
gpt2_from_scratch_12layer/checkpoint-2000/trainer_state.json ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.587328389986051,
6
+ "eval_steps": 500,
7
+ "global_step": 2000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.02936641949930255,
14
+ "grad_norm": 2.2842259407043457,
15
+ "learning_rate": 2.4750000000000004e-06,
16
+ "loss": 10.393255615234375,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.0587328389986051,
21
+ "grad_norm": 1.976091980934143,
22
+ "learning_rate": 4.975000000000001e-06,
23
+ "loss": 9.357327270507813,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.08809925849790765,
28
+ "grad_norm": 1.6418145895004272,
29
+ "learning_rate": 7.4750000000000004e-06,
30
+ "loss": 8.744969482421874,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.1174656779972102,
35
+ "grad_norm": 1.1453146934509277,
36
+ "learning_rate": 9.975e-06,
37
+ "loss": 8.003826904296876,
38
+ "step": 400
39
+ },
40
+ {
41
+ "epoch": 0.14683209749651274,
42
+ "grad_norm": 0.6994723677635193,
43
+ "learning_rate": 1.2475e-05,
44
+ "loss": 7.452492065429688,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 0.1761985169958153,
49
+ "grad_norm": 0.4603072702884674,
50
+ "learning_rate": 1.4975e-05,
51
+ "loss": 7.1382373046875,
52
+ "step": 600
53
+ },
54
+ {
55
+ "epoch": 0.20556493649511784,
56
+ "grad_norm": 0.4629450738430023,
57
+ "learning_rate": 1.7475e-05,
58
+ "loss": 6.968035888671875,
59
+ "step": 700
60
+ },
61
+ {
62
+ "epoch": 0.2349313559944204,
63
+ "grad_norm": 0.5266813635826111,
64
+ "learning_rate": 1.9975e-05,
65
+ "loss": 6.8181103515625,
66
+ "step": 800
67
+ },
68
+ {
69
+ "epoch": 0.2642977754937229,
70
+ "grad_norm": 0.5502268671989441,
71
+ "learning_rate": 2.2475e-05,
72
+ "loss": 6.682680053710937,
73
+ "step": 900
74
+ },
75
+ {
76
+ "epoch": 0.2936641949930255,
77
+ "grad_norm": 0.537894606590271,
78
+ "learning_rate": 2.4975e-05,
79
+ "loss": 6.568981323242188,
80
+ "step": 1000
81
+ },
82
+ {
83
+ "epoch": 0.323030614492328,
84
+ "grad_norm": 0.5135723352432251,
85
+ "learning_rate": 2.7475e-05,
86
+ "loss": 6.471431884765625,
87
+ "step": 1100
88
+ },
89
+ {
90
+ "epoch": 0.3523970339916306,
91
+ "grad_norm": 0.606870710849762,
92
+ "learning_rate": 2.9975000000000004e-05,
93
+ "loss": 6.3824462890625,
94
+ "step": 1200
95
+ },
96
+ {
97
+ "epoch": 0.3817634534909331,
98
+ "grad_norm": 0.5291919112205505,
99
+ "learning_rate": 3.2474999999999997e-05,
100
+ "loss": 6.302595825195312,
101
+ "step": 1300
102
+ },
103
+ {
104
+ "epoch": 0.4111298729902357,
105
+ "grad_norm": 0.6090461015701294,
106
+ "learning_rate": 3.4975e-05,
107
+ "loss": 6.223634033203125,
108
+ "step": 1400
109
+ },
110
+ {
111
+ "epoch": 0.4404962924895382,
112
+ "grad_norm": 0.5523635149002075,
113
+ "learning_rate": 3.7475e-05,
114
+ "loss": 6.154580688476562,
115
+ "step": 1500
116
+ },
117
+ {
118
+ "epoch": 0.4698627119888408,
119
+ "grad_norm": 0.6641230583190918,
120
+ "learning_rate": 3.9975e-05,
121
+ "loss": 6.086353759765625,
122
+ "step": 1600
123
+ },
124
+ {
125
+ "epoch": 0.4992291314881433,
126
+ "grad_norm": 0.6724914908409119,
127
+ "learning_rate": 4.2475e-05,
128
+ "loss": 6.030512084960938,
129
+ "step": 1700
130
+ },
131
+ {
132
+ "epoch": 0.5285955509874458,
133
+ "grad_norm": 0.5981016755104065,
134
+ "learning_rate": 4.4975e-05,
135
+ "loss": 5.963157348632812,
136
+ "step": 1800
137
+ },
138
+ {
139
+ "epoch": 0.5579619704867484,
140
+ "grad_norm": 0.676860511302948,
141
+ "learning_rate": 4.7475e-05,
142
+ "loss": 5.894300537109375,
143
+ "step": 1900
144
+ },
145
+ {
146
+ "epoch": 0.587328389986051,
147
+ "grad_norm": 0.6556357741355896,
148
+ "learning_rate": 4.9975e-05,
149
+ "loss": 5.844266967773438,
150
+ "step": 2000
151
+ }
152
+ ],
153
+ "logging_steps": 100,
154
+ "max_steps": 3406,
155
+ "num_input_tokens_seen": 0,
156
+ "num_train_epochs": 1,
157
+ "save_steps": 1000,
158
+ "stateful_callbacks": {
159
+ "TrainerControl": {
160
+ "args": {
161
+ "should_epoch_stop": false,
162
+ "should_evaluate": false,
163
+ "should_log": false,
164
+ "should_save": true,
165
+ "should_training_stop": false
166
+ },
167
+ "attributes": {}
168
+ }
169
+ },
170
+ "total_flos": 3.3445380096e+16,
171
+ "train_batch_size": 4,
172
+ "trial_name": null,
173
+ "trial_params": null
174
+ }
gpt2_from_scratch_12layer/checkpoint-2000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7365dbdab00faac01ab21cf58f54309413f9c58f1fdc95c4ee7a1e881ad0856d
3
+ size 4728
gpt2_from_scratch_12layer/checkpoint-3000/config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "add_cross_attention": false,
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "dtype": "float32",
10
+ "embd_pdrop": 0.1,
11
+ "eos_token_id": 50256,
12
+ "initializer_range": 0.02,
13
+ "layer_norm_epsilon": 1e-05,
14
+ "model_type": "gpt2",
15
+ "n_ctx": 1024,
16
+ "n_embd": 768,
17
+ "n_head": 12,
18
+ "n_inner": null,
19
+ "n_layer": 12,
20
+ "n_positions": 1024,
21
+ "pad_token_id": null,
22
+ "reorder_and_upcast_attn": false,
23
+ "resid_pdrop": 0.1,
24
+ "scale_attn_by_inverse_layer_idx": false,
25
+ "scale_attn_weights": true,
26
+ "summary_activation": null,
27
+ "summary_first_dropout": 0.1,
28
+ "summary_proj_to_labels": true,
29
+ "summary_type": "cls_index",
30
+ "summary_use_proj": true,
31
+ "tie_word_embeddings": true,
32
+ "transformers_version": "5.0.0",
33
+ "use_cache": false,
34
+ "vocab_size": 50257
35
+ }
gpt2_from_scratch_12layer/checkpoint-3000/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "output_attentions": false,
6
+ "output_hidden_states": false,
7
+ "transformers_version": "5.0.0",
8
+ "use_cache": true
9
+ }
gpt2_from_scratch_12layer/checkpoint-3000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c4776ac0444ad6001151629fdfa49402a4713258dc905b86f078d45300d610f
3
+ size 497774208
gpt2_from_scratch_12layer/checkpoint-3000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfaf5b9fae33481d91060504d67e57291ff13d8fe917f28251b971b4e8b8684a
3
+ size 995642298
gpt2_from_scratch_12layer/checkpoint-3000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e10f3460a731b355fd0ba5229f0bad8de79ebe7909166a3ad7f90c89b83dda5
3
+ size 14244
gpt2_from_scratch_12layer/checkpoint-3000/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21aba8ed0f38ed1c04994c10a9ca7e9925e55ef2ed51283c43ff8e2cce78585f
3
+ size 988
gpt2_from_scratch_12layer/checkpoint-3000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e7038e46af7868cb4110f4906a05bd0c0cfeb8b51264c714a479c44c4014e81
3
+ size 1064
gpt2_from_scratch_12layer/checkpoint-3000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
gpt2_from_scratch_12layer/checkpoint-3000/tokenizer_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "model_max_length": 1000000000000000019884624838656,
6
+ "pad_token": "<pad>",
7
+ "tokenizer_class": "TokenizersBackend",
8
+ "unk_token": "<unk>"
9
+ }
gpt2_from_scratch_12layer/checkpoint-3000/trainer_state.json ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.8809925849790764,
6
+ "eval_steps": 500,
7
+ "global_step": 3000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.02936641949930255,
14
+ "grad_norm": 2.2842259407043457,
15
+ "learning_rate": 2.4750000000000004e-06,
16
+ "loss": 10.393255615234375,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.0587328389986051,
21
+ "grad_norm": 1.976091980934143,
22
+ "learning_rate": 4.975000000000001e-06,
23
+ "loss": 9.357327270507813,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.08809925849790765,
28
+ "grad_norm": 1.6418145895004272,
29
+ "learning_rate": 7.4750000000000004e-06,
30
+ "loss": 8.744969482421874,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.1174656779972102,
35
+ "grad_norm": 1.1453146934509277,
36
+ "learning_rate": 9.975e-06,
37
+ "loss": 8.003826904296876,
38
+ "step": 400
39
+ },
40
+ {
41
+ "epoch": 0.14683209749651274,
42
+ "grad_norm": 0.6994723677635193,
43
+ "learning_rate": 1.2475e-05,
44
+ "loss": 7.452492065429688,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 0.1761985169958153,
49
+ "grad_norm": 0.4603072702884674,
50
+ "learning_rate": 1.4975e-05,
51
+ "loss": 7.1382373046875,
52
+ "step": 600
53
+ },
54
+ {
55
+ "epoch": 0.20556493649511784,
56
+ "grad_norm": 0.4629450738430023,
57
+ "learning_rate": 1.7475e-05,
58
+ "loss": 6.968035888671875,
59
+ "step": 700
60
+ },
61
+ {
62
+ "epoch": 0.2349313559944204,
63
+ "grad_norm": 0.5266813635826111,
64
+ "learning_rate": 1.9975e-05,
65
+ "loss": 6.8181103515625,
66
+ "step": 800
67
+ },
68
+ {
69
+ "epoch": 0.2642977754937229,
70
+ "grad_norm": 0.5502268671989441,
71
+ "learning_rate": 2.2475e-05,
72
+ "loss": 6.682680053710937,
73
+ "step": 900
74
+ },
75
+ {
76
+ "epoch": 0.2936641949930255,
77
+ "grad_norm": 0.537894606590271,
78
+ "learning_rate": 2.4975e-05,
79
+ "loss": 6.568981323242188,
80
+ "step": 1000
81
+ },
82
+ {
83
+ "epoch": 0.323030614492328,
84
+ "grad_norm": 0.5135723352432251,
85
+ "learning_rate": 2.7475e-05,
86
+ "loss": 6.471431884765625,
87
+ "step": 1100
88
+ },
89
+ {
90
+ "epoch": 0.3523970339916306,
91
+ "grad_norm": 0.606870710849762,
92
+ "learning_rate": 2.9975000000000004e-05,
93
+ "loss": 6.3824462890625,
94
+ "step": 1200
95
+ },
96
+ {
97
+ "epoch": 0.3817634534909331,
98
+ "grad_norm": 0.5291919112205505,
99
+ "learning_rate": 3.2474999999999997e-05,
100
+ "loss": 6.302595825195312,
101
+ "step": 1300
102
+ },
103
+ {
104
+ "epoch": 0.4111298729902357,
105
+ "grad_norm": 0.6090461015701294,
106
+ "learning_rate": 3.4975e-05,
107
+ "loss": 6.223634033203125,
108
+ "step": 1400
109
+ },
110
+ {
111
+ "epoch": 0.4404962924895382,
112
+ "grad_norm": 0.5523635149002075,
113
+ "learning_rate": 3.7475e-05,
114
+ "loss": 6.154580688476562,
115
+ "step": 1500
116
+ },
117
+ {
118
+ "epoch": 0.4698627119888408,
119
+ "grad_norm": 0.6641230583190918,
120
+ "learning_rate": 3.9975e-05,
121
+ "loss": 6.086353759765625,
122
+ "step": 1600
123
+ },
124
+ {
125
+ "epoch": 0.4992291314881433,
126
+ "grad_norm": 0.6724914908409119,
127
+ "learning_rate": 4.2475e-05,
128
+ "loss": 6.030512084960938,
129
+ "step": 1700
130
+ },
131
+ {
132
+ "epoch": 0.5285955509874458,
133
+ "grad_norm": 0.5981016755104065,
134
+ "learning_rate": 4.4975e-05,
135
+ "loss": 5.963157348632812,
136
+ "step": 1800
137
+ },
138
+ {
139
+ "epoch": 0.5579619704867484,
140
+ "grad_norm": 0.676860511302948,
141
+ "learning_rate": 4.7475e-05,
142
+ "loss": 5.894300537109375,
143
+ "step": 1900
144
+ },
145
+ {
146
+ "epoch": 0.587328389986051,
147
+ "grad_norm": 0.6556357741355896,
148
+ "learning_rate": 4.9975e-05,
149
+ "loss": 5.844266967773438,
150
+ "step": 2000
151
+ },
152
+ {
153
+ "epoch": 0.6166948094853535,
154
+ "grad_norm": 0.7801370024681091,
155
+ "learning_rate": 4.647937411095306e-05,
156
+ "loss": 5.78113037109375,
157
+ "step": 2100
158
+ },
159
+ {
160
+ "epoch": 0.646061228984656,
161
+ "grad_norm": 0.7802927494049072,
162
+ "learning_rate": 4.292318634423898e-05,
163
+ "loss": 5.719049072265625,
164
+ "step": 2200
165
+ },
166
+ {
167
+ "epoch": 0.6754276484839586,
168
+ "grad_norm": 0.6435455083847046,
169
+ "learning_rate": 3.936699857752489e-05,
170
+ "loss": 5.66756591796875,
171
+ "step": 2300
172
+ },
173
+ {
174
+ "epoch": 0.7047940679832612,
175
+ "grad_norm": 0.6630441546440125,
176
+ "learning_rate": 3.581081081081081e-05,
177
+ "loss": 5.636476440429687,
178
+ "step": 2400
179
+ },
180
+ {
181
+ "epoch": 0.7341604874825637,
182
+ "grad_norm": 0.8329909443855286,
183
+ "learning_rate": 3.225462304409673e-05,
184
+ "loss": 5.58530029296875,
185
+ "step": 2500
186
+ },
187
+ {
188
+ "epoch": 0.7635269069818662,
189
+ "grad_norm": 0.74227374792099,
190
+ "learning_rate": 2.8698435277382645e-05,
191
+ "loss": 5.5438079833984375,
192
+ "step": 2600
193
+ },
194
+ {
195
+ "epoch": 0.7928933264811688,
196
+ "grad_norm": 0.6876152157783508,
197
+ "learning_rate": 2.5142247510668564e-05,
198
+ "loss": 5.508399658203125,
199
+ "step": 2700
200
+ },
201
+ {
202
+ "epoch": 0.8222597459804714,
203
+ "grad_norm": 0.6679750084877014,
204
+ "learning_rate": 2.158605974395448e-05,
205
+ "loss": 5.489833374023437,
206
+ "step": 2800
207
+ },
208
+ {
209
+ "epoch": 0.8516261654797739,
210
+ "grad_norm": 0.7488402724266052,
211
+ "learning_rate": 1.80298719772404e-05,
212
+ "loss": 5.467451171875,
213
+ "step": 2900
214
+ },
215
+ {
216
+ "epoch": 0.8809925849790764,
217
+ "grad_norm": 0.7311998009681702,
218
+ "learning_rate": 1.4473684210526317e-05,
219
+ "loss": 5.443863525390625,
220
+ "step": 3000
221
+ }
222
+ ],
223
+ "logging_steps": 100,
224
+ "max_steps": 3406,
225
+ "num_input_tokens_seen": 0,
226
+ "num_train_epochs": 1,
227
+ "save_steps": 1000,
228
+ "stateful_callbacks": {
229
+ "TrainerControl": {
230
+ "args": {
231
+ "should_epoch_stop": false,
232
+ "should_evaluate": false,
233
+ "should_log": false,
234
+ "should_save": true,
235
+ "should_training_stop": false
236
+ },
237
+ "attributes": {}
238
+ }
239
+ },
240
+ "total_flos": 5.0168070144e+16,
241
+ "train_batch_size": 4,
242
+ "trial_name": null,
243
+ "trial_params": null
244
+ }
gpt2_from_scratch_12layer/checkpoint-3000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7365dbdab00faac01ab21cf58f54309413f9c58f1fdc95c4ee7a1e881ad0856d
3
+ size 4728
gpt2_from_scratch_12layer/checkpoint-3406/config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "add_cross_attention": false,
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "dtype": "float32",
10
+ "embd_pdrop": 0.1,
11
+ "eos_token_id": 50256,
12
+ "initializer_range": 0.02,
13
+ "layer_norm_epsilon": 1e-05,
14
+ "model_type": "gpt2",
15
+ "n_ctx": 1024,
16
+ "n_embd": 768,
17
+ "n_head": 12,
18
+ "n_inner": null,
19
+ "n_layer": 12,
20
+ "n_positions": 1024,
21
+ "pad_token_id": null,
22
+ "reorder_and_upcast_attn": false,
23
+ "resid_pdrop": 0.1,
24
+ "scale_attn_by_inverse_layer_idx": false,
25
+ "scale_attn_weights": true,
26
+ "summary_activation": null,
27
+ "summary_first_dropout": 0.1,
28
+ "summary_proj_to_labels": true,
29
+ "summary_type": "cls_index",
30
+ "summary_use_proj": true,
31
+ "tie_word_embeddings": true,
32
+ "transformers_version": "5.0.0",
33
+ "use_cache": false,
34
+ "vocab_size": 50257
35
+ }
gpt2_from_scratch_12layer/checkpoint-3406/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "output_attentions": false,
6
+ "output_hidden_states": false,
7
+ "transformers_version": "5.0.0",
8
+ "use_cache": true
9
+ }
gpt2_from_scratch_12layer/checkpoint-3406/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86199e7b994a7b0b267da4c3eda7f844a6fcf158c09ca8a2d64fd642ed4d044f
3
+ size 497774208
gpt2_from_scratch_12layer/checkpoint-3406/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a234f38b142d3331a7a53adf952f321a87a1a943ceb3a10c9a99fecda56470ae
3
+ size 995642298
gpt2_from_scratch_12layer/checkpoint-3406/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd2adfaeedd991b9ba6834e4ef7b91c840c2ead3fbf06beee6ad92b6087edec7
3
+ size 14244
gpt2_from_scratch_12layer/checkpoint-3406/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e410308f67ca645aabf384cead7bdf7525d526a77cf7e6bf1191440bee76dba
3
+ size 988
gpt2_from_scratch_12layer/checkpoint-3406/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d145063085675d13b94353d3de9c3206e1d55eccf9fe47bdda38c645520ea479
3
+ size 1064
gpt2_from_scratch_12layer/checkpoint-3406/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
gpt2_from_scratch_12layer/checkpoint-3406/tokenizer_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "model_max_length": 1000000000000000019884624838656,
6
+ "pad_token": "<pad>",
7
+ "tokenizer_class": "TokenizersBackend",
8
+ "unk_token": "<unk>"
9
+ }
gpt2_from_scratch_12layer/checkpoint-3406/trainer_state.json ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 3406,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.02936641949930255,
14
+ "grad_norm": 2.2842259407043457,
15
+ "learning_rate": 2.4750000000000004e-06,
16
+ "loss": 10.393255615234375,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.0587328389986051,
21
+ "grad_norm": 1.976091980934143,
22
+ "learning_rate": 4.975000000000001e-06,
23
+ "loss": 9.357327270507813,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.08809925849790765,
28
+ "grad_norm": 1.6418145895004272,
29
+ "learning_rate": 7.4750000000000004e-06,
30
+ "loss": 8.744969482421874,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.1174656779972102,
35
+ "grad_norm": 1.1453146934509277,
36
+ "learning_rate": 9.975e-06,
37
+ "loss": 8.003826904296876,
38
+ "step": 400
39
+ },
40
+ {
41
+ "epoch": 0.14683209749651274,
42
+ "grad_norm": 0.6994723677635193,
43
+ "learning_rate": 1.2475e-05,
44
+ "loss": 7.452492065429688,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 0.1761985169958153,
49
+ "grad_norm": 0.4603072702884674,
50
+ "learning_rate": 1.4975e-05,
51
+ "loss": 7.1382373046875,
52
+ "step": 600
53
+ },
54
+ {
55
+ "epoch": 0.20556493649511784,
56
+ "grad_norm": 0.4629450738430023,
57
+ "learning_rate": 1.7475e-05,
58
+ "loss": 6.968035888671875,
59
+ "step": 700
60
+ },
61
+ {
62
+ "epoch": 0.2349313559944204,
63
+ "grad_norm": 0.5266813635826111,
64
+ "learning_rate": 1.9975e-05,
65
+ "loss": 6.8181103515625,
66
+ "step": 800
67
+ },
68
+ {
69
+ "epoch": 0.2642977754937229,
70
+ "grad_norm": 0.5502268671989441,
71
+ "learning_rate": 2.2475e-05,
72
+ "loss": 6.682680053710937,
73
+ "step": 900
74
+ },
75
+ {
76
+ "epoch": 0.2936641949930255,
77
+ "grad_norm": 0.537894606590271,
78
+ "learning_rate": 2.4975e-05,
79
+ "loss": 6.568981323242188,
80
+ "step": 1000
81
+ },
82
+ {
83
+ "epoch": 0.323030614492328,
84
+ "grad_norm": 0.5135723352432251,
85
+ "learning_rate": 2.7475e-05,
86
+ "loss": 6.471431884765625,
87
+ "step": 1100
88
+ },
89
+ {
90
+ "epoch": 0.3523970339916306,
91
+ "grad_norm": 0.606870710849762,
92
+ "learning_rate": 2.9975000000000004e-05,
93
+ "loss": 6.3824462890625,
94
+ "step": 1200
95
+ },
96
+ {
97
+ "epoch": 0.3817634534909331,
98
+ "grad_norm": 0.5291919112205505,
99
+ "learning_rate": 3.2474999999999997e-05,
100
+ "loss": 6.302595825195312,
101
+ "step": 1300
102
+ },
103
+ {
104
+ "epoch": 0.4111298729902357,
105
+ "grad_norm": 0.6090461015701294,
106
+ "learning_rate": 3.4975e-05,
107
+ "loss": 6.223634033203125,
108
+ "step": 1400
109
+ },
110
+ {
111
+ "epoch": 0.4404962924895382,
112
+ "grad_norm": 0.5523635149002075,
113
+ "learning_rate": 3.7475e-05,
114
+ "loss": 6.154580688476562,
115
+ "step": 1500
116
+ },
117
+ {
118
+ "epoch": 0.4698627119888408,
119
+ "grad_norm": 0.6641230583190918,
120
+ "learning_rate": 3.9975e-05,
121
+ "loss": 6.086353759765625,
122
+ "step": 1600
123
+ },
124
+ {
125
+ "epoch": 0.4992291314881433,
126
+ "grad_norm": 0.6724914908409119,
127
+ "learning_rate": 4.2475e-05,
128
+ "loss": 6.030512084960938,
129
+ "step": 1700
130
+ },
131
+ {
132
+ "epoch": 0.5285955509874458,
133
+ "grad_norm": 0.5981016755104065,
134
+ "learning_rate": 4.4975e-05,
135
+ "loss": 5.963157348632812,
136
+ "step": 1800
137
+ },
138
+ {
139
+ "epoch": 0.5579619704867484,
140
+ "grad_norm": 0.676860511302948,
141
+ "learning_rate": 4.7475e-05,
142
+ "loss": 5.894300537109375,
143
+ "step": 1900
144
+ },
145
+ {
146
+ "epoch": 0.587328389986051,
147
+ "grad_norm": 0.6556357741355896,
148
+ "learning_rate": 4.9975e-05,
149
+ "loss": 5.844266967773438,
150
+ "step": 2000
151
+ },
152
+ {
153
+ "epoch": 0.6166948094853535,
154
+ "grad_norm": 0.7801370024681091,
155
+ "learning_rate": 4.647937411095306e-05,
156
+ "loss": 5.78113037109375,
157
+ "step": 2100
158
+ },
159
+ {
160
+ "epoch": 0.646061228984656,
161
+ "grad_norm": 0.7802927494049072,
162
+ "learning_rate": 4.292318634423898e-05,
163
+ "loss": 5.719049072265625,
164
+ "step": 2200
165
+ },
166
+ {
167
+ "epoch": 0.6754276484839586,
168
+ "grad_norm": 0.6435455083847046,
169
+ "learning_rate": 3.936699857752489e-05,
170
+ "loss": 5.66756591796875,
171
+ "step": 2300
172
+ },
173
+ {
174
+ "epoch": 0.7047940679832612,
175
+ "grad_norm": 0.6630441546440125,
176
+ "learning_rate": 3.581081081081081e-05,
177
+ "loss": 5.636476440429687,
178
+ "step": 2400
179
+ },
180
+ {
181
+ "epoch": 0.7341604874825637,
182
+ "grad_norm": 0.8329909443855286,
183
+ "learning_rate": 3.225462304409673e-05,
184
+ "loss": 5.58530029296875,
185
+ "step": 2500
186
+ },
187
+ {
188
+ "epoch": 0.7635269069818662,
189
+ "grad_norm": 0.74227374792099,
190
+ "learning_rate": 2.8698435277382645e-05,
191
+ "loss": 5.5438079833984375,
192
+ "step": 2600
193
+ },
194
+ {
195
+ "epoch": 0.7928933264811688,
196
+ "grad_norm": 0.6876152157783508,
197
+ "learning_rate": 2.5142247510668564e-05,
198
+ "loss": 5.508399658203125,
199
+ "step": 2700
200
+ },
201
+ {
202
+ "epoch": 0.8222597459804714,
203
+ "grad_norm": 0.6679750084877014,
204
+ "learning_rate": 2.158605974395448e-05,
205
+ "loss": 5.489833374023437,
206
+ "step": 2800
207
+ },
208
+ {
209
+ "epoch": 0.8516261654797739,
210
+ "grad_norm": 0.7488402724266052,
211
+ "learning_rate": 1.80298719772404e-05,
212
+ "loss": 5.467451171875,
213
+ "step": 2900
214
+ },
215
+ {
216
+ "epoch": 0.8809925849790764,
217
+ "grad_norm": 0.7311998009681702,
218
+ "learning_rate": 1.4473684210526317e-05,
219
+ "loss": 5.443863525390625,
220
+ "step": 3000
221
+ },
222
+ {
223
+ "epoch": 0.9103590044783789,
224
+ "grad_norm": 0.6423781514167786,
225
+ "learning_rate": 1.0917496443812234e-05,
226
+ "loss": 5.427085571289062,
227
+ "step": 3100
228
+ },
229
+ {
230
+ "epoch": 0.9397254239776816,
231
+ "grad_norm": 0.6591918468475342,
232
+ "learning_rate": 7.361308677098151e-06,
233
+ "loss": 5.414056396484375,
234
+ "step": 3200
235
+ },
236
+ {
237
+ "epoch": 0.9690918434769841,
238
+ "grad_norm": 0.6228283643722534,
239
+ "learning_rate": 3.8051209103840685e-06,
240
+ "loss": 5.39971435546875,
241
+ "step": 3300
242
+ },
243
+ {
244
+ "epoch": 0.9984582629762866,
245
+ "grad_norm": 0.6124479174613953,
246
+ "learning_rate": 2.4893314366998576e-07,
247
+ "loss": 5.410068969726563,
248
+ "step": 3400
249
+ }
250
+ ],
251
+ "logging_steps": 100,
252
+ "max_steps": 3406,
253
+ "num_input_tokens_seen": 0,
254
+ "num_train_epochs": 1,
255
+ "save_steps": 1000,
256
+ "stateful_callbacks": {
257
+ "TrainerControl": {
258
+ "args": {
259
+ "should_epoch_stop": false,
260
+ "should_evaluate": false,
261
+ "should_log": false,
262
+ "should_save": true,
263
+ "should_training_stop": true
264
+ },
265
+ "attributes": {}
266
+ }
267
+ },
268
+ "total_flos": 5.6944940285952e+16,
269
+ "train_batch_size": 4,
270
+ "trial_name": null,
271
+ "trial_params": null
272
+ }
gpt2_from_scratch_12layer/checkpoint-3406/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7365dbdab00faac01ab21cf58f54309413f9c58f1fdc95c4ee7a1e881ad0856d
3
+ size 4728
gpt2_from_scratch_12layer/config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "add_cross_attention": false,
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "dtype": "float32",
10
+ "embd_pdrop": 0.1,
11
+ "eos_token_id": 50256,
12
+ "initializer_range": 0.02,
13
+ "layer_norm_epsilon": 1e-05,
14
+ "model_type": "gpt2",
15
+ "n_ctx": 1024,
16
+ "n_embd": 768,
17
+ "n_head": 12,
18
+ "n_inner": null,
19
+ "n_layer": 12,
20
+ "n_positions": 1024,
21
+ "pad_token_id": null,
22
+ "reorder_and_upcast_attn": false,
23
+ "resid_pdrop": 0.1,
24
+ "scale_attn_by_inverse_layer_idx": false,
25
+ "scale_attn_weights": true,
26
+ "summary_activation": null,
27
+ "summary_first_dropout": 0.1,
28
+ "summary_proj_to_labels": true,
29
+ "summary_type": "cls_index",
30
+ "summary_use_proj": true,
31
+ "tie_word_embeddings": true,
32
+ "transformers_version": "5.0.0",
33
+ "use_cache": false,
34
+ "vocab_size": 50257
35
+ }
gpt2_from_scratch_12layer/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "output_attentions": false,
6
+ "output_hidden_states": false,
7
+ "transformers_version": "5.0.0",
8
+ "use_cache": true
9
+ }
gpt2_from_scratch_12layer/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86199e7b994a7b0b267da4c3eda7f844a6fcf158c09ca8a2d64fd642ed4d044f
3
+ size 497774208
gpt2_from_scratch_12layer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
gpt2_from_scratch_12layer/tokenizer_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "model_max_length": 1000000000000000019884624838656,
6
+ "pad_token": "<pad>",
7
+ "tokenizer_class": "TokenizersBackend",
8
+ "unk_token": "<unk>"
9
+ }
gpt2_from_scratch_12layer/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7365dbdab00faac01ab21cf58f54309413f9c58f1fdc95c4ee7a1e881ad0856d
3
+ size 4728