Upload 50 files
Browse files- gpt2_from_scratch_12layer/checkpoint-1000/config.json +35 -0
- gpt2_from_scratch_12layer/checkpoint-1000/generation_config.json +9 -0
- gpt2_from_scratch_12layer/checkpoint-1000/model.safetensors +3 -0
- gpt2_from_scratch_12layer/checkpoint-1000/optimizer.pt +3 -0
- gpt2_from_scratch_12layer/checkpoint-1000/rng_state.pth +3 -0
- gpt2_from_scratch_12layer/checkpoint-1000/scaler.pt +3 -0
- gpt2_from_scratch_12layer/checkpoint-1000/scheduler.pt +3 -0
- gpt2_from_scratch_12layer/checkpoint-1000/tokenizer.json +0 -0
- gpt2_from_scratch_12layer/checkpoint-1000/tokenizer_config.json +9 -0
- gpt2_from_scratch_12layer/checkpoint-1000/trainer_state.json +104 -0
- gpt2_from_scratch_12layer/checkpoint-1000/training_args.bin +3 -0
- gpt2_from_scratch_12layer/checkpoint-2000/config.json +35 -0
- gpt2_from_scratch_12layer/checkpoint-2000/generation_config.json +9 -0
- gpt2_from_scratch_12layer/checkpoint-2000/model.safetensors +3 -0
- gpt2_from_scratch_12layer/checkpoint-2000/optimizer.pt +3 -0
- gpt2_from_scratch_12layer/checkpoint-2000/rng_state.pth +3 -0
- gpt2_from_scratch_12layer/checkpoint-2000/scaler.pt +3 -0
- gpt2_from_scratch_12layer/checkpoint-2000/scheduler.pt +3 -0
- gpt2_from_scratch_12layer/checkpoint-2000/tokenizer.json +0 -0
- gpt2_from_scratch_12layer/checkpoint-2000/tokenizer_config.json +9 -0
- gpt2_from_scratch_12layer/checkpoint-2000/trainer_state.json +174 -0
- gpt2_from_scratch_12layer/checkpoint-2000/training_args.bin +3 -0
- gpt2_from_scratch_12layer/checkpoint-3000/config.json +35 -0
- gpt2_from_scratch_12layer/checkpoint-3000/generation_config.json +9 -0
- gpt2_from_scratch_12layer/checkpoint-3000/model.safetensors +3 -0
- gpt2_from_scratch_12layer/checkpoint-3000/optimizer.pt +3 -0
- gpt2_from_scratch_12layer/checkpoint-3000/rng_state.pth +3 -0
- gpt2_from_scratch_12layer/checkpoint-3000/scaler.pt +3 -0
- gpt2_from_scratch_12layer/checkpoint-3000/scheduler.pt +3 -0
- gpt2_from_scratch_12layer/checkpoint-3000/tokenizer.json +0 -0
- gpt2_from_scratch_12layer/checkpoint-3000/tokenizer_config.json +9 -0
- gpt2_from_scratch_12layer/checkpoint-3000/trainer_state.json +244 -0
- gpt2_from_scratch_12layer/checkpoint-3000/training_args.bin +3 -0
- gpt2_from_scratch_12layer/checkpoint-3406/config.json +35 -0
- gpt2_from_scratch_12layer/checkpoint-3406/generation_config.json +9 -0
- gpt2_from_scratch_12layer/checkpoint-3406/model.safetensors +3 -0
- gpt2_from_scratch_12layer/checkpoint-3406/optimizer.pt +3 -0
- gpt2_from_scratch_12layer/checkpoint-3406/rng_state.pth +3 -0
- gpt2_from_scratch_12layer/checkpoint-3406/scaler.pt +3 -0
- gpt2_from_scratch_12layer/checkpoint-3406/scheduler.pt +3 -0
- gpt2_from_scratch_12layer/checkpoint-3406/tokenizer.json +0 -0
- gpt2_from_scratch_12layer/checkpoint-3406/tokenizer_config.json +9 -0
- gpt2_from_scratch_12layer/checkpoint-3406/trainer_state.json +272 -0
- gpt2_from_scratch_12layer/checkpoint-3406/training_args.bin +3 -0
- gpt2_from_scratch_12layer/config.json +35 -0
- gpt2_from_scratch_12layer/generation_config.json +9 -0
- gpt2_from_scratch_12layer/model.safetensors +3 -0
- gpt2_from_scratch_12layer/tokenizer.json +0 -0
- gpt2_from_scratch_12layer/tokenizer_config.json +9 -0
- gpt2_from_scratch_12layer/training_args.bin +3 -0
gpt2_from_scratch_12layer/checkpoint-1000/config.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"activation_function": "gelu_new",
|
| 3 |
+
"add_cross_attention": false,
|
| 4 |
+
"architectures": [
|
| 5 |
+
"GPT2LMHeadModel"
|
| 6 |
+
],
|
| 7 |
+
"attn_pdrop": 0.1,
|
| 8 |
+
"bos_token_id": 50256,
|
| 9 |
+
"dtype": "float32",
|
| 10 |
+
"embd_pdrop": 0.1,
|
| 11 |
+
"eos_token_id": 50256,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"layer_norm_epsilon": 1e-05,
|
| 14 |
+
"model_type": "gpt2",
|
| 15 |
+
"n_ctx": 1024,
|
| 16 |
+
"n_embd": 768,
|
| 17 |
+
"n_head": 12,
|
| 18 |
+
"n_inner": null,
|
| 19 |
+
"n_layer": 12,
|
| 20 |
+
"n_positions": 1024,
|
| 21 |
+
"pad_token_id": null,
|
| 22 |
+
"reorder_and_upcast_attn": false,
|
| 23 |
+
"resid_pdrop": 0.1,
|
| 24 |
+
"scale_attn_by_inverse_layer_idx": false,
|
| 25 |
+
"scale_attn_weights": true,
|
| 26 |
+
"summary_activation": null,
|
| 27 |
+
"summary_first_dropout": 0.1,
|
| 28 |
+
"summary_proj_to_labels": true,
|
| 29 |
+
"summary_type": "cls_index",
|
| 30 |
+
"summary_use_proj": true,
|
| 31 |
+
"tie_word_embeddings": true,
|
| 32 |
+
"transformers_version": "5.0.0",
|
| 33 |
+
"use_cache": false,
|
| 34 |
+
"vocab_size": 50257
|
| 35 |
+
}
|
gpt2_from_scratch_12layer/checkpoint-1000/generation_config.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 50256,
|
| 4 |
+
"eos_token_id": 50256,
|
| 5 |
+
"output_attentions": false,
|
| 6 |
+
"output_hidden_states": false,
|
| 7 |
+
"transformers_version": "5.0.0",
|
| 8 |
+
"use_cache": true
|
| 9 |
+
}
|
gpt2_from_scratch_12layer/checkpoint-1000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b23254f47a6b7d6396d5cd5009f7b4e97c808f10c54cc13a522fc80a40a6f914
|
| 3 |
+
size 497774208
|
gpt2_from_scratch_12layer/checkpoint-1000/optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c1d31359d42cea3697b2d87c673ef4d4a81a0834d8d6163b209575a6d00bac41
|
| 3 |
+
size 995642298
|
gpt2_from_scratch_12layer/checkpoint-1000/rng_state.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:28325bd1f1d721c530c7ba38b64e73cb2cf1fdad7c3357d638f67a67744c8645
|
| 3 |
+
size 14244
|
gpt2_from_scratch_12layer/checkpoint-1000/scaler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9d8fdcd0311eba9854fff738038ed4c1a269832665b4d88ba4e4e3d02a1a7e0e
|
| 3 |
+
size 988
|
gpt2_from_scratch_12layer/checkpoint-1000/scheduler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:04a74a909d336b7124436e4cb0278258b381fc72bf5b206e1c024e4444ff4f32
|
| 3 |
+
size 1064
|
gpt2_from_scratch_12layer/checkpoint-1000/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt2_from_scratch_12layer/checkpoint-1000/tokenizer_config.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"backend": "tokenizers",
|
| 3 |
+
"bos_token": "<s>",
|
| 4 |
+
"eos_token": "</s>",
|
| 5 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 6 |
+
"pad_token": "<pad>",
|
| 7 |
+
"tokenizer_class": "TokenizersBackend",
|
| 8 |
+
"unk_token": "<unk>"
|
| 9 |
+
}
|
gpt2_from_scratch_12layer/checkpoint-1000/trainer_state.json
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.2936641949930255,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 1000,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.02936641949930255,
|
| 14 |
+
"grad_norm": 2.2842259407043457,
|
| 15 |
+
"learning_rate": 2.4750000000000004e-06,
|
| 16 |
+
"loss": 10.393255615234375,
|
| 17 |
+
"step": 100
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.0587328389986051,
|
| 21 |
+
"grad_norm": 1.976091980934143,
|
| 22 |
+
"learning_rate": 4.975000000000001e-06,
|
| 23 |
+
"loss": 9.357327270507813,
|
| 24 |
+
"step": 200
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.08809925849790765,
|
| 28 |
+
"grad_norm": 1.6418145895004272,
|
| 29 |
+
"learning_rate": 7.4750000000000004e-06,
|
| 30 |
+
"loss": 8.744969482421874,
|
| 31 |
+
"step": 300
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.1174656779972102,
|
| 35 |
+
"grad_norm": 1.1453146934509277,
|
| 36 |
+
"learning_rate": 9.975e-06,
|
| 37 |
+
"loss": 8.003826904296876,
|
| 38 |
+
"step": 400
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.14683209749651274,
|
| 42 |
+
"grad_norm": 0.6994723677635193,
|
| 43 |
+
"learning_rate": 1.2475e-05,
|
| 44 |
+
"loss": 7.452492065429688,
|
| 45 |
+
"step": 500
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.1761985169958153,
|
| 49 |
+
"grad_norm": 0.4603072702884674,
|
| 50 |
+
"learning_rate": 1.4975e-05,
|
| 51 |
+
"loss": 7.1382373046875,
|
| 52 |
+
"step": 600
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.20556493649511784,
|
| 56 |
+
"grad_norm": 0.4629450738430023,
|
| 57 |
+
"learning_rate": 1.7475e-05,
|
| 58 |
+
"loss": 6.968035888671875,
|
| 59 |
+
"step": 700
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.2349313559944204,
|
| 63 |
+
"grad_norm": 0.5266813635826111,
|
| 64 |
+
"learning_rate": 1.9975e-05,
|
| 65 |
+
"loss": 6.8181103515625,
|
| 66 |
+
"step": 800
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.2642977754937229,
|
| 70 |
+
"grad_norm": 0.5502268671989441,
|
| 71 |
+
"learning_rate": 2.2475e-05,
|
| 72 |
+
"loss": 6.682680053710937,
|
| 73 |
+
"step": 900
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.2936641949930255,
|
| 77 |
+
"grad_norm": 0.537894606590271,
|
| 78 |
+
"learning_rate": 2.4975e-05,
|
| 79 |
+
"loss": 6.568981323242188,
|
| 80 |
+
"step": 1000
|
| 81 |
+
}
|
| 82 |
+
],
|
| 83 |
+
"logging_steps": 100,
|
| 84 |
+
"max_steps": 3406,
|
| 85 |
+
"num_input_tokens_seen": 0,
|
| 86 |
+
"num_train_epochs": 1,
|
| 87 |
+
"save_steps": 1000,
|
| 88 |
+
"stateful_callbacks": {
|
| 89 |
+
"TrainerControl": {
|
| 90 |
+
"args": {
|
| 91 |
+
"should_epoch_stop": false,
|
| 92 |
+
"should_evaluate": false,
|
| 93 |
+
"should_log": false,
|
| 94 |
+
"should_save": true,
|
| 95 |
+
"should_training_stop": false
|
| 96 |
+
},
|
| 97 |
+
"attributes": {}
|
| 98 |
+
}
|
| 99 |
+
},
|
| 100 |
+
"total_flos": 1.6722690048e+16,
|
| 101 |
+
"train_batch_size": 4,
|
| 102 |
+
"trial_name": null,
|
| 103 |
+
"trial_params": null
|
| 104 |
+
}
|
gpt2_from_scratch_12layer/checkpoint-1000/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7365dbdab00faac01ab21cf58f54309413f9c58f1fdc95c4ee7a1e881ad0856d
|
| 3 |
+
size 4728
|
gpt2_from_scratch_12layer/checkpoint-2000/config.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"activation_function": "gelu_new",
|
| 3 |
+
"add_cross_attention": false,
|
| 4 |
+
"architectures": [
|
| 5 |
+
"GPT2LMHeadModel"
|
| 6 |
+
],
|
| 7 |
+
"attn_pdrop": 0.1,
|
| 8 |
+
"bos_token_id": 50256,
|
| 9 |
+
"dtype": "float32",
|
| 10 |
+
"embd_pdrop": 0.1,
|
| 11 |
+
"eos_token_id": 50256,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"layer_norm_epsilon": 1e-05,
|
| 14 |
+
"model_type": "gpt2",
|
| 15 |
+
"n_ctx": 1024,
|
| 16 |
+
"n_embd": 768,
|
| 17 |
+
"n_head": 12,
|
| 18 |
+
"n_inner": null,
|
| 19 |
+
"n_layer": 12,
|
| 20 |
+
"n_positions": 1024,
|
| 21 |
+
"pad_token_id": null,
|
| 22 |
+
"reorder_and_upcast_attn": false,
|
| 23 |
+
"resid_pdrop": 0.1,
|
| 24 |
+
"scale_attn_by_inverse_layer_idx": false,
|
| 25 |
+
"scale_attn_weights": true,
|
| 26 |
+
"summary_activation": null,
|
| 27 |
+
"summary_first_dropout": 0.1,
|
| 28 |
+
"summary_proj_to_labels": true,
|
| 29 |
+
"summary_type": "cls_index",
|
| 30 |
+
"summary_use_proj": true,
|
| 31 |
+
"tie_word_embeddings": true,
|
| 32 |
+
"transformers_version": "5.0.0",
|
| 33 |
+
"use_cache": false,
|
| 34 |
+
"vocab_size": 50257
|
| 35 |
+
}
|
gpt2_from_scratch_12layer/checkpoint-2000/generation_config.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 50256,
|
| 4 |
+
"eos_token_id": 50256,
|
| 5 |
+
"output_attentions": false,
|
| 6 |
+
"output_hidden_states": false,
|
| 7 |
+
"transformers_version": "5.0.0",
|
| 8 |
+
"use_cache": true
|
| 9 |
+
}
|
gpt2_from_scratch_12layer/checkpoint-2000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:043f01835a7fc728da107643ad158db644191333dbde9b03bb14fc00283f9960
|
| 3 |
+
size 497774208
|
gpt2_from_scratch_12layer/checkpoint-2000/optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e237c1420cfb71c0b92f02f0a00096093dd8b9b8d0c31f9e977a185ce40b82f3
|
| 3 |
+
size 995642298
|
gpt2_from_scratch_12layer/checkpoint-2000/rng_state.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f95a2c63a0eaf9d82cb93bfc10ea04f73b732c5d5ce79e4bff972ef3f9449c92
|
| 3 |
+
size 14244
|
gpt2_from_scratch_12layer/checkpoint-2000/scaler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3c50a9cebe5d66d453d25b140738bff479749ac03e0a43597d8776bc22f6ed0c
|
| 3 |
+
size 988
|
gpt2_from_scratch_12layer/checkpoint-2000/scheduler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f3ea9d13baff2282d300ceb3c3984a3388d1450303ffc8640c73967fa3325903
|
| 3 |
+
size 1064
|
gpt2_from_scratch_12layer/checkpoint-2000/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt2_from_scratch_12layer/checkpoint-2000/tokenizer_config.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"backend": "tokenizers",
|
| 3 |
+
"bos_token": "<s>",
|
| 4 |
+
"eos_token": "</s>",
|
| 5 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 6 |
+
"pad_token": "<pad>",
|
| 7 |
+
"tokenizer_class": "TokenizersBackend",
|
| 8 |
+
"unk_token": "<unk>"
|
| 9 |
+
}
|
gpt2_from_scratch_12layer/checkpoint-2000/trainer_state.json
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.587328389986051,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 2000,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.02936641949930255,
|
| 14 |
+
"grad_norm": 2.2842259407043457,
|
| 15 |
+
"learning_rate": 2.4750000000000004e-06,
|
| 16 |
+
"loss": 10.393255615234375,
|
| 17 |
+
"step": 100
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.0587328389986051,
|
| 21 |
+
"grad_norm": 1.976091980934143,
|
| 22 |
+
"learning_rate": 4.975000000000001e-06,
|
| 23 |
+
"loss": 9.357327270507813,
|
| 24 |
+
"step": 200
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.08809925849790765,
|
| 28 |
+
"grad_norm": 1.6418145895004272,
|
| 29 |
+
"learning_rate": 7.4750000000000004e-06,
|
| 30 |
+
"loss": 8.744969482421874,
|
| 31 |
+
"step": 300
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.1174656779972102,
|
| 35 |
+
"grad_norm": 1.1453146934509277,
|
| 36 |
+
"learning_rate": 9.975e-06,
|
| 37 |
+
"loss": 8.003826904296876,
|
| 38 |
+
"step": 400
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.14683209749651274,
|
| 42 |
+
"grad_norm": 0.6994723677635193,
|
| 43 |
+
"learning_rate": 1.2475e-05,
|
| 44 |
+
"loss": 7.452492065429688,
|
| 45 |
+
"step": 500
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.1761985169958153,
|
| 49 |
+
"grad_norm": 0.4603072702884674,
|
| 50 |
+
"learning_rate": 1.4975e-05,
|
| 51 |
+
"loss": 7.1382373046875,
|
| 52 |
+
"step": 600
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.20556493649511784,
|
| 56 |
+
"grad_norm": 0.4629450738430023,
|
| 57 |
+
"learning_rate": 1.7475e-05,
|
| 58 |
+
"loss": 6.968035888671875,
|
| 59 |
+
"step": 700
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.2349313559944204,
|
| 63 |
+
"grad_norm": 0.5266813635826111,
|
| 64 |
+
"learning_rate": 1.9975e-05,
|
| 65 |
+
"loss": 6.8181103515625,
|
| 66 |
+
"step": 800
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.2642977754937229,
|
| 70 |
+
"grad_norm": 0.5502268671989441,
|
| 71 |
+
"learning_rate": 2.2475e-05,
|
| 72 |
+
"loss": 6.682680053710937,
|
| 73 |
+
"step": 900
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.2936641949930255,
|
| 77 |
+
"grad_norm": 0.537894606590271,
|
| 78 |
+
"learning_rate": 2.4975e-05,
|
| 79 |
+
"loss": 6.568981323242188,
|
| 80 |
+
"step": 1000
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 0.323030614492328,
|
| 84 |
+
"grad_norm": 0.5135723352432251,
|
| 85 |
+
"learning_rate": 2.7475e-05,
|
| 86 |
+
"loss": 6.471431884765625,
|
| 87 |
+
"step": 1100
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 0.3523970339916306,
|
| 91 |
+
"grad_norm": 0.606870710849762,
|
| 92 |
+
"learning_rate": 2.9975000000000004e-05,
|
| 93 |
+
"loss": 6.3824462890625,
|
| 94 |
+
"step": 1200
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 0.3817634534909331,
|
| 98 |
+
"grad_norm": 0.5291919112205505,
|
| 99 |
+
"learning_rate": 3.2474999999999997e-05,
|
| 100 |
+
"loss": 6.302595825195312,
|
| 101 |
+
"step": 1300
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 0.4111298729902357,
|
| 105 |
+
"grad_norm": 0.6090461015701294,
|
| 106 |
+
"learning_rate": 3.4975e-05,
|
| 107 |
+
"loss": 6.223634033203125,
|
| 108 |
+
"step": 1400
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 0.4404962924895382,
|
| 112 |
+
"grad_norm": 0.5523635149002075,
|
| 113 |
+
"learning_rate": 3.7475e-05,
|
| 114 |
+
"loss": 6.154580688476562,
|
| 115 |
+
"step": 1500
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 0.4698627119888408,
|
| 119 |
+
"grad_norm": 0.6641230583190918,
|
| 120 |
+
"learning_rate": 3.9975e-05,
|
| 121 |
+
"loss": 6.086353759765625,
|
| 122 |
+
"step": 1600
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 0.4992291314881433,
|
| 126 |
+
"grad_norm": 0.6724914908409119,
|
| 127 |
+
"learning_rate": 4.2475e-05,
|
| 128 |
+
"loss": 6.030512084960938,
|
| 129 |
+
"step": 1700
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 0.5285955509874458,
|
| 133 |
+
"grad_norm": 0.5981016755104065,
|
| 134 |
+
"learning_rate": 4.4975e-05,
|
| 135 |
+
"loss": 5.963157348632812,
|
| 136 |
+
"step": 1800
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 0.5579619704867484,
|
| 140 |
+
"grad_norm": 0.676860511302948,
|
| 141 |
+
"learning_rate": 4.7475e-05,
|
| 142 |
+
"loss": 5.894300537109375,
|
| 143 |
+
"step": 1900
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 0.587328389986051,
|
| 147 |
+
"grad_norm": 0.6556357741355896,
|
| 148 |
+
"learning_rate": 4.9975e-05,
|
| 149 |
+
"loss": 5.844266967773438,
|
| 150 |
+
"step": 2000
|
| 151 |
+
}
|
| 152 |
+
],
|
| 153 |
+
"logging_steps": 100,
|
| 154 |
+
"max_steps": 3406,
|
| 155 |
+
"num_input_tokens_seen": 0,
|
| 156 |
+
"num_train_epochs": 1,
|
| 157 |
+
"save_steps": 1000,
|
| 158 |
+
"stateful_callbacks": {
|
| 159 |
+
"TrainerControl": {
|
| 160 |
+
"args": {
|
| 161 |
+
"should_epoch_stop": false,
|
| 162 |
+
"should_evaluate": false,
|
| 163 |
+
"should_log": false,
|
| 164 |
+
"should_save": true,
|
| 165 |
+
"should_training_stop": false
|
| 166 |
+
},
|
| 167 |
+
"attributes": {}
|
| 168 |
+
}
|
| 169 |
+
},
|
| 170 |
+
"total_flos": 3.3445380096e+16,
|
| 171 |
+
"train_batch_size": 4,
|
| 172 |
+
"trial_name": null,
|
| 173 |
+
"trial_params": null
|
| 174 |
+
}
|
gpt2_from_scratch_12layer/checkpoint-2000/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7365dbdab00faac01ab21cf58f54309413f9c58f1fdc95c4ee7a1e881ad0856d
|
| 3 |
+
size 4728
|
gpt2_from_scratch_12layer/checkpoint-3000/config.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"activation_function": "gelu_new",
|
| 3 |
+
"add_cross_attention": false,
|
| 4 |
+
"architectures": [
|
| 5 |
+
"GPT2LMHeadModel"
|
| 6 |
+
],
|
| 7 |
+
"attn_pdrop": 0.1,
|
| 8 |
+
"bos_token_id": 50256,
|
| 9 |
+
"dtype": "float32",
|
| 10 |
+
"embd_pdrop": 0.1,
|
| 11 |
+
"eos_token_id": 50256,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"layer_norm_epsilon": 1e-05,
|
| 14 |
+
"model_type": "gpt2",
|
| 15 |
+
"n_ctx": 1024,
|
| 16 |
+
"n_embd": 768,
|
| 17 |
+
"n_head": 12,
|
| 18 |
+
"n_inner": null,
|
| 19 |
+
"n_layer": 12,
|
| 20 |
+
"n_positions": 1024,
|
| 21 |
+
"pad_token_id": null,
|
| 22 |
+
"reorder_and_upcast_attn": false,
|
| 23 |
+
"resid_pdrop": 0.1,
|
| 24 |
+
"scale_attn_by_inverse_layer_idx": false,
|
| 25 |
+
"scale_attn_weights": true,
|
| 26 |
+
"summary_activation": null,
|
| 27 |
+
"summary_first_dropout": 0.1,
|
| 28 |
+
"summary_proj_to_labels": true,
|
| 29 |
+
"summary_type": "cls_index",
|
| 30 |
+
"summary_use_proj": true,
|
| 31 |
+
"tie_word_embeddings": true,
|
| 32 |
+
"transformers_version": "5.0.0",
|
| 33 |
+
"use_cache": false,
|
| 34 |
+
"vocab_size": 50257
|
| 35 |
+
}
|
gpt2_from_scratch_12layer/checkpoint-3000/generation_config.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 50256,
|
| 4 |
+
"eos_token_id": 50256,
|
| 5 |
+
"output_attentions": false,
|
| 6 |
+
"output_hidden_states": false,
|
| 7 |
+
"transformers_version": "5.0.0",
|
| 8 |
+
"use_cache": true
|
| 9 |
+
}
|
gpt2_from_scratch_12layer/checkpoint-3000/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8c4776ac0444ad6001151629fdfa49402a4713258dc905b86f078d45300d610f
|
| 3 |
+
size 497774208
|
gpt2_from_scratch_12layer/checkpoint-3000/optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dfaf5b9fae33481d91060504d67e57291ff13d8fe917f28251b971b4e8b8684a
|
| 3 |
+
size 995642298
|
gpt2_from_scratch_12layer/checkpoint-3000/rng_state.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1e10f3460a731b355fd0ba5229f0bad8de79ebe7909166a3ad7f90c89b83dda5
|
| 3 |
+
size 14244
|
gpt2_from_scratch_12layer/checkpoint-3000/scaler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:21aba8ed0f38ed1c04994c10a9ca7e9925e55ef2ed51283c43ff8e2cce78585f
|
| 3 |
+
size 988
|
gpt2_from_scratch_12layer/checkpoint-3000/scheduler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5e7038e46af7868cb4110f4906a05bd0c0cfeb8b51264c714a479c44c4014e81
|
| 3 |
+
size 1064
|
gpt2_from_scratch_12layer/checkpoint-3000/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt2_from_scratch_12layer/checkpoint-3000/tokenizer_config.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"backend": "tokenizers",
|
| 3 |
+
"bos_token": "<s>",
|
| 4 |
+
"eos_token": "</s>",
|
| 5 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 6 |
+
"pad_token": "<pad>",
|
| 7 |
+
"tokenizer_class": "TokenizersBackend",
|
| 8 |
+
"unk_token": "<unk>"
|
| 9 |
+
}
|
gpt2_from_scratch_12layer/checkpoint-3000/trainer_state.json
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.8809925849790764,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 3000,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.02936641949930255,
|
| 14 |
+
"grad_norm": 2.2842259407043457,
|
| 15 |
+
"learning_rate": 2.4750000000000004e-06,
|
| 16 |
+
"loss": 10.393255615234375,
|
| 17 |
+
"step": 100
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.0587328389986051,
|
| 21 |
+
"grad_norm": 1.976091980934143,
|
| 22 |
+
"learning_rate": 4.975000000000001e-06,
|
| 23 |
+
"loss": 9.357327270507813,
|
| 24 |
+
"step": 200
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.08809925849790765,
|
| 28 |
+
"grad_norm": 1.6418145895004272,
|
| 29 |
+
"learning_rate": 7.4750000000000004e-06,
|
| 30 |
+
"loss": 8.744969482421874,
|
| 31 |
+
"step": 300
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.1174656779972102,
|
| 35 |
+
"grad_norm": 1.1453146934509277,
|
| 36 |
+
"learning_rate": 9.975e-06,
|
| 37 |
+
"loss": 8.003826904296876,
|
| 38 |
+
"step": 400
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.14683209749651274,
|
| 42 |
+
"grad_norm": 0.6994723677635193,
|
| 43 |
+
"learning_rate": 1.2475e-05,
|
| 44 |
+
"loss": 7.452492065429688,
|
| 45 |
+
"step": 500
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.1761985169958153,
|
| 49 |
+
"grad_norm": 0.4603072702884674,
|
| 50 |
+
"learning_rate": 1.4975e-05,
|
| 51 |
+
"loss": 7.1382373046875,
|
| 52 |
+
"step": 600
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.20556493649511784,
|
| 56 |
+
"grad_norm": 0.4629450738430023,
|
| 57 |
+
"learning_rate": 1.7475e-05,
|
| 58 |
+
"loss": 6.968035888671875,
|
| 59 |
+
"step": 700
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.2349313559944204,
|
| 63 |
+
"grad_norm": 0.5266813635826111,
|
| 64 |
+
"learning_rate": 1.9975e-05,
|
| 65 |
+
"loss": 6.8181103515625,
|
| 66 |
+
"step": 800
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.2642977754937229,
|
| 70 |
+
"grad_norm": 0.5502268671989441,
|
| 71 |
+
"learning_rate": 2.2475e-05,
|
| 72 |
+
"loss": 6.682680053710937,
|
| 73 |
+
"step": 900
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.2936641949930255,
|
| 77 |
+
"grad_norm": 0.537894606590271,
|
| 78 |
+
"learning_rate": 2.4975e-05,
|
| 79 |
+
"loss": 6.568981323242188,
|
| 80 |
+
"step": 1000
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 0.323030614492328,
|
| 84 |
+
"grad_norm": 0.5135723352432251,
|
| 85 |
+
"learning_rate": 2.7475e-05,
|
| 86 |
+
"loss": 6.471431884765625,
|
| 87 |
+
"step": 1100
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 0.3523970339916306,
|
| 91 |
+
"grad_norm": 0.606870710849762,
|
| 92 |
+
"learning_rate": 2.9975000000000004e-05,
|
| 93 |
+
"loss": 6.3824462890625,
|
| 94 |
+
"step": 1200
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 0.3817634534909331,
|
| 98 |
+
"grad_norm": 0.5291919112205505,
|
| 99 |
+
"learning_rate": 3.2474999999999997e-05,
|
| 100 |
+
"loss": 6.302595825195312,
|
| 101 |
+
"step": 1300
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 0.4111298729902357,
|
| 105 |
+
"grad_norm": 0.6090461015701294,
|
| 106 |
+
"learning_rate": 3.4975e-05,
|
| 107 |
+
"loss": 6.223634033203125,
|
| 108 |
+
"step": 1400
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 0.4404962924895382,
|
| 112 |
+
"grad_norm": 0.5523635149002075,
|
| 113 |
+
"learning_rate": 3.7475e-05,
|
| 114 |
+
"loss": 6.154580688476562,
|
| 115 |
+
"step": 1500
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 0.4698627119888408,
|
| 119 |
+
"grad_norm": 0.6641230583190918,
|
| 120 |
+
"learning_rate": 3.9975e-05,
|
| 121 |
+
"loss": 6.086353759765625,
|
| 122 |
+
"step": 1600
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 0.4992291314881433,
|
| 126 |
+
"grad_norm": 0.6724914908409119,
|
| 127 |
+
"learning_rate": 4.2475e-05,
|
| 128 |
+
"loss": 6.030512084960938,
|
| 129 |
+
"step": 1700
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 0.5285955509874458,
|
| 133 |
+
"grad_norm": 0.5981016755104065,
|
| 134 |
+
"learning_rate": 4.4975e-05,
|
| 135 |
+
"loss": 5.963157348632812,
|
| 136 |
+
"step": 1800
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 0.5579619704867484,
|
| 140 |
+
"grad_norm": 0.676860511302948,
|
| 141 |
+
"learning_rate": 4.7475e-05,
|
| 142 |
+
"loss": 5.894300537109375,
|
| 143 |
+
"step": 1900
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 0.587328389986051,
|
| 147 |
+
"grad_norm": 0.6556357741355896,
|
| 148 |
+
"learning_rate": 4.9975e-05,
|
| 149 |
+
"loss": 5.844266967773438,
|
| 150 |
+
"step": 2000
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 0.6166948094853535,
|
| 154 |
+
"grad_norm": 0.7801370024681091,
|
| 155 |
+
"learning_rate": 4.647937411095306e-05,
|
| 156 |
+
"loss": 5.78113037109375,
|
| 157 |
+
"step": 2100
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"epoch": 0.646061228984656,
|
| 161 |
+
"grad_norm": 0.7802927494049072,
|
| 162 |
+
"learning_rate": 4.292318634423898e-05,
|
| 163 |
+
"loss": 5.719049072265625,
|
| 164 |
+
"step": 2200
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"epoch": 0.6754276484839586,
|
| 168 |
+
"grad_norm": 0.6435455083847046,
|
| 169 |
+
"learning_rate": 3.936699857752489e-05,
|
| 170 |
+
"loss": 5.66756591796875,
|
| 171 |
+
"step": 2300
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 0.7047940679832612,
|
| 175 |
+
"grad_norm": 0.6630441546440125,
|
| 176 |
+
"learning_rate": 3.581081081081081e-05,
|
| 177 |
+
"loss": 5.636476440429687,
|
| 178 |
+
"step": 2400
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 0.7341604874825637,
|
| 182 |
+
"grad_norm": 0.8329909443855286,
|
| 183 |
+
"learning_rate": 3.225462304409673e-05,
|
| 184 |
+
"loss": 5.58530029296875,
|
| 185 |
+
"step": 2500
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"epoch": 0.7635269069818662,
|
| 189 |
+
"grad_norm": 0.74227374792099,
|
| 190 |
+
"learning_rate": 2.8698435277382645e-05,
|
| 191 |
+
"loss": 5.5438079833984375,
|
| 192 |
+
"step": 2600
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"epoch": 0.7928933264811688,
|
| 196 |
+
"grad_norm": 0.6876152157783508,
|
| 197 |
+
"learning_rate": 2.5142247510668564e-05,
|
| 198 |
+
"loss": 5.508399658203125,
|
| 199 |
+
"step": 2700
|
| 200 |
+
},
|
| 201 |
+
{
|
| 202 |
+
"epoch": 0.8222597459804714,
|
| 203 |
+
"grad_norm": 0.6679750084877014,
|
| 204 |
+
"learning_rate": 2.158605974395448e-05,
|
| 205 |
+
"loss": 5.489833374023437,
|
| 206 |
+
"step": 2800
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
"epoch": 0.8516261654797739,
|
| 210 |
+
"grad_norm": 0.7488402724266052,
|
| 211 |
+
"learning_rate": 1.80298719772404e-05,
|
| 212 |
+
"loss": 5.467451171875,
|
| 213 |
+
"step": 2900
|
| 214 |
+
},
|
| 215 |
+
{
|
| 216 |
+
"epoch": 0.8809925849790764,
|
| 217 |
+
"grad_norm": 0.7311998009681702,
|
| 218 |
+
"learning_rate": 1.4473684210526317e-05,
|
| 219 |
+
"loss": 5.443863525390625,
|
| 220 |
+
"step": 3000
|
| 221 |
+
}
|
| 222 |
+
],
|
| 223 |
+
"logging_steps": 100,
|
| 224 |
+
"max_steps": 3406,
|
| 225 |
+
"num_input_tokens_seen": 0,
|
| 226 |
+
"num_train_epochs": 1,
|
| 227 |
+
"save_steps": 1000,
|
| 228 |
+
"stateful_callbacks": {
|
| 229 |
+
"TrainerControl": {
|
| 230 |
+
"args": {
|
| 231 |
+
"should_epoch_stop": false,
|
| 232 |
+
"should_evaluate": false,
|
| 233 |
+
"should_log": false,
|
| 234 |
+
"should_save": true,
|
| 235 |
+
"should_training_stop": false
|
| 236 |
+
},
|
| 237 |
+
"attributes": {}
|
| 238 |
+
}
|
| 239 |
+
},
|
| 240 |
+
"total_flos": 5.0168070144e+16,
|
| 241 |
+
"train_batch_size": 4,
|
| 242 |
+
"trial_name": null,
|
| 243 |
+
"trial_params": null
|
| 244 |
+
}
|
gpt2_from_scratch_12layer/checkpoint-3000/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7365dbdab00faac01ab21cf58f54309413f9c58f1fdc95c4ee7a1e881ad0856d
|
| 3 |
+
size 4728
|
gpt2_from_scratch_12layer/checkpoint-3406/config.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"activation_function": "gelu_new",
|
| 3 |
+
"add_cross_attention": false,
|
| 4 |
+
"architectures": [
|
| 5 |
+
"GPT2LMHeadModel"
|
| 6 |
+
],
|
| 7 |
+
"attn_pdrop": 0.1,
|
| 8 |
+
"bos_token_id": 50256,
|
| 9 |
+
"dtype": "float32",
|
| 10 |
+
"embd_pdrop": 0.1,
|
| 11 |
+
"eos_token_id": 50256,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"layer_norm_epsilon": 1e-05,
|
| 14 |
+
"model_type": "gpt2",
|
| 15 |
+
"n_ctx": 1024,
|
| 16 |
+
"n_embd": 768,
|
| 17 |
+
"n_head": 12,
|
| 18 |
+
"n_inner": null,
|
| 19 |
+
"n_layer": 12,
|
| 20 |
+
"n_positions": 1024,
|
| 21 |
+
"pad_token_id": null,
|
| 22 |
+
"reorder_and_upcast_attn": false,
|
| 23 |
+
"resid_pdrop": 0.1,
|
| 24 |
+
"scale_attn_by_inverse_layer_idx": false,
|
| 25 |
+
"scale_attn_weights": true,
|
| 26 |
+
"summary_activation": null,
|
| 27 |
+
"summary_first_dropout": 0.1,
|
| 28 |
+
"summary_proj_to_labels": true,
|
| 29 |
+
"summary_type": "cls_index",
|
| 30 |
+
"summary_use_proj": true,
|
| 31 |
+
"tie_word_embeddings": true,
|
| 32 |
+
"transformers_version": "5.0.0",
|
| 33 |
+
"use_cache": false,
|
| 34 |
+
"vocab_size": 50257
|
| 35 |
+
}
|
gpt2_from_scratch_12layer/checkpoint-3406/generation_config.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 50256,
|
| 4 |
+
"eos_token_id": 50256,
|
| 5 |
+
"output_attentions": false,
|
| 6 |
+
"output_hidden_states": false,
|
| 7 |
+
"transformers_version": "5.0.0",
|
| 8 |
+
"use_cache": true
|
| 9 |
+
}
|
gpt2_from_scratch_12layer/checkpoint-3406/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:86199e7b994a7b0b267da4c3eda7f844a6fcf158c09ca8a2d64fd642ed4d044f
|
| 3 |
+
size 497774208
|
gpt2_from_scratch_12layer/checkpoint-3406/optimizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a234f38b142d3331a7a53adf952f321a87a1a943ceb3a10c9a99fecda56470ae
|
| 3 |
+
size 995642298
|
gpt2_from_scratch_12layer/checkpoint-3406/rng_state.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cd2adfaeedd991b9ba6834e4ef7b91c840c2ead3fbf06beee6ad92b6087edec7
|
| 3 |
+
size 14244
|
gpt2_from_scratch_12layer/checkpoint-3406/scaler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e410308f67ca645aabf384cead7bdf7525d526a77cf7e6bf1191440bee76dba
|
| 3 |
+
size 988
|
gpt2_from_scratch_12layer/checkpoint-3406/scheduler.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d145063085675d13b94353d3de9c3206e1d55eccf9fe47bdda38c645520ea479
|
| 3 |
+
size 1064
|
gpt2_from_scratch_12layer/checkpoint-3406/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt2_from_scratch_12layer/checkpoint-3406/tokenizer_config.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"backend": "tokenizers",
|
| 3 |
+
"bos_token": "<s>",
|
| 4 |
+
"eos_token": "</s>",
|
| 5 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 6 |
+
"pad_token": "<pad>",
|
| 7 |
+
"tokenizer_class": "TokenizersBackend",
|
| 8 |
+
"unk_token": "<unk>"
|
| 9 |
+
}
|
gpt2_from_scratch_12layer/checkpoint-3406/trainer_state.json
ADDED
|
@@ -0,0 +1,272 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 3406,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.02936641949930255,
|
| 14 |
+
"grad_norm": 2.2842259407043457,
|
| 15 |
+
"learning_rate": 2.4750000000000004e-06,
|
| 16 |
+
"loss": 10.393255615234375,
|
| 17 |
+
"step": 100
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.0587328389986051,
|
| 21 |
+
"grad_norm": 1.976091980934143,
|
| 22 |
+
"learning_rate": 4.975000000000001e-06,
|
| 23 |
+
"loss": 9.357327270507813,
|
| 24 |
+
"step": 200
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.08809925849790765,
|
| 28 |
+
"grad_norm": 1.6418145895004272,
|
| 29 |
+
"learning_rate": 7.4750000000000004e-06,
|
| 30 |
+
"loss": 8.744969482421874,
|
| 31 |
+
"step": 300
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.1174656779972102,
|
| 35 |
+
"grad_norm": 1.1453146934509277,
|
| 36 |
+
"learning_rate": 9.975e-06,
|
| 37 |
+
"loss": 8.003826904296876,
|
| 38 |
+
"step": 400
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.14683209749651274,
|
| 42 |
+
"grad_norm": 0.6994723677635193,
|
| 43 |
+
"learning_rate": 1.2475e-05,
|
| 44 |
+
"loss": 7.452492065429688,
|
| 45 |
+
"step": 500
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.1761985169958153,
|
| 49 |
+
"grad_norm": 0.4603072702884674,
|
| 50 |
+
"learning_rate": 1.4975e-05,
|
| 51 |
+
"loss": 7.1382373046875,
|
| 52 |
+
"step": 600
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.20556493649511784,
|
| 56 |
+
"grad_norm": 0.4629450738430023,
|
| 57 |
+
"learning_rate": 1.7475e-05,
|
| 58 |
+
"loss": 6.968035888671875,
|
| 59 |
+
"step": 700
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.2349313559944204,
|
| 63 |
+
"grad_norm": 0.5266813635826111,
|
| 64 |
+
"learning_rate": 1.9975e-05,
|
| 65 |
+
"loss": 6.8181103515625,
|
| 66 |
+
"step": 800
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.2642977754937229,
|
| 70 |
+
"grad_norm": 0.5502268671989441,
|
| 71 |
+
"learning_rate": 2.2475e-05,
|
| 72 |
+
"loss": 6.682680053710937,
|
| 73 |
+
"step": 900
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.2936641949930255,
|
| 77 |
+
"grad_norm": 0.537894606590271,
|
| 78 |
+
"learning_rate": 2.4975e-05,
|
| 79 |
+
"loss": 6.568981323242188,
|
| 80 |
+
"step": 1000
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 0.323030614492328,
|
| 84 |
+
"grad_norm": 0.5135723352432251,
|
| 85 |
+
"learning_rate": 2.7475e-05,
|
| 86 |
+
"loss": 6.471431884765625,
|
| 87 |
+
"step": 1100
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 0.3523970339916306,
|
| 91 |
+
"grad_norm": 0.606870710849762,
|
| 92 |
+
"learning_rate": 2.9975000000000004e-05,
|
| 93 |
+
"loss": 6.3824462890625,
|
| 94 |
+
"step": 1200
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 0.3817634534909331,
|
| 98 |
+
"grad_norm": 0.5291919112205505,
|
| 99 |
+
"learning_rate": 3.2474999999999997e-05,
|
| 100 |
+
"loss": 6.302595825195312,
|
| 101 |
+
"step": 1300
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 0.4111298729902357,
|
| 105 |
+
"grad_norm": 0.6090461015701294,
|
| 106 |
+
"learning_rate": 3.4975e-05,
|
| 107 |
+
"loss": 6.223634033203125,
|
| 108 |
+
"step": 1400
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 0.4404962924895382,
|
| 112 |
+
"grad_norm": 0.5523635149002075,
|
| 113 |
+
"learning_rate": 3.7475e-05,
|
| 114 |
+
"loss": 6.154580688476562,
|
| 115 |
+
"step": 1500
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 0.4698627119888408,
|
| 119 |
+
"grad_norm": 0.6641230583190918,
|
| 120 |
+
"learning_rate": 3.9975e-05,
|
| 121 |
+
"loss": 6.086353759765625,
|
| 122 |
+
"step": 1600
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 0.4992291314881433,
|
| 126 |
+
"grad_norm": 0.6724914908409119,
|
| 127 |
+
"learning_rate": 4.2475e-05,
|
| 128 |
+
"loss": 6.030512084960938,
|
| 129 |
+
"step": 1700
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 0.5285955509874458,
|
| 133 |
+
"grad_norm": 0.5981016755104065,
|
| 134 |
+
"learning_rate": 4.4975e-05,
|
| 135 |
+
"loss": 5.963157348632812,
|
| 136 |
+
"step": 1800
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 0.5579619704867484,
|
| 140 |
+
"grad_norm": 0.676860511302948,
|
| 141 |
+
"learning_rate": 4.7475e-05,
|
| 142 |
+
"loss": 5.894300537109375,
|
| 143 |
+
"step": 1900
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 0.587328389986051,
|
| 147 |
+
"grad_norm": 0.6556357741355896,
|
| 148 |
+
"learning_rate": 4.9975e-05,
|
| 149 |
+
"loss": 5.844266967773438,
|
| 150 |
+
"step": 2000
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 0.6166948094853535,
|
| 154 |
+
"grad_norm": 0.7801370024681091,
|
| 155 |
+
"learning_rate": 4.647937411095306e-05,
|
| 156 |
+
"loss": 5.78113037109375,
|
| 157 |
+
"step": 2100
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"epoch": 0.646061228984656,
|
| 161 |
+
"grad_norm": 0.7802927494049072,
|
| 162 |
+
"learning_rate": 4.292318634423898e-05,
|
| 163 |
+
"loss": 5.719049072265625,
|
| 164 |
+
"step": 2200
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"epoch": 0.6754276484839586,
|
| 168 |
+
"grad_norm": 0.6435455083847046,
|
| 169 |
+
"learning_rate": 3.936699857752489e-05,
|
| 170 |
+
"loss": 5.66756591796875,
|
| 171 |
+
"step": 2300
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 0.7047940679832612,
|
| 175 |
+
"grad_norm": 0.6630441546440125,
|
| 176 |
+
"learning_rate": 3.581081081081081e-05,
|
| 177 |
+
"loss": 5.636476440429687,
|
| 178 |
+
"step": 2400
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 0.7341604874825637,
|
| 182 |
+
"grad_norm": 0.8329909443855286,
|
| 183 |
+
"learning_rate": 3.225462304409673e-05,
|
| 184 |
+
"loss": 5.58530029296875,
|
| 185 |
+
"step": 2500
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"epoch": 0.7635269069818662,
|
| 189 |
+
"grad_norm": 0.74227374792099,
|
| 190 |
+
"learning_rate": 2.8698435277382645e-05,
|
| 191 |
+
"loss": 5.5438079833984375,
|
| 192 |
+
"step": 2600
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"epoch": 0.7928933264811688,
|
| 196 |
+
"grad_norm": 0.6876152157783508,
|
| 197 |
+
"learning_rate": 2.5142247510668564e-05,
|
| 198 |
+
"loss": 5.508399658203125,
|
| 199 |
+
"step": 2700
|
| 200 |
+
},
|
| 201 |
+
{
|
| 202 |
+
"epoch": 0.8222597459804714,
|
| 203 |
+
"grad_norm": 0.6679750084877014,
|
| 204 |
+
"learning_rate": 2.158605974395448e-05,
|
| 205 |
+
"loss": 5.489833374023437,
|
| 206 |
+
"step": 2800
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
"epoch": 0.8516261654797739,
|
| 210 |
+
"grad_norm": 0.7488402724266052,
|
| 211 |
+
"learning_rate": 1.80298719772404e-05,
|
| 212 |
+
"loss": 5.467451171875,
|
| 213 |
+
"step": 2900
|
| 214 |
+
},
|
| 215 |
+
{
|
| 216 |
+
"epoch": 0.8809925849790764,
|
| 217 |
+
"grad_norm": 0.7311998009681702,
|
| 218 |
+
"learning_rate": 1.4473684210526317e-05,
|
| 219 |
+
"loss": 5.443863525390625,
|
| 220 |
+
"step": 3000
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
"epoch": 0.9103590044783789,
|
| 224 |
+
"grad_norm": 0.6423781514167786,
|
| 225 |
+
"learning_rate": 1.0917496443812234e-05,
|
| 226 |
+
"loss": 5.427085571289062,
|
| 227 |
+
"step": 3100
|
| 228 |
+
},
|
| 229 |
+
{
|
| 230 |
+
"epoch": 0.9397254239776816,
|
| 231 |
+
"grad_norm": 0.6591918468475342,
|
| 232 |
+
"learning_rate": 7.361308677098151e-06,
|
| 233 |
+
"loss": 5.414056396484375,
|
| 234 |
+
"step": 3200
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"epoch": 0.9690918434769841,
|
| 238 |
+
"grad_norm": 0.6228283643722534,
|
| 239 |
+
"learning_rate": 3.8051209103840685e-06,
|
| 240 |
+
"loss": 5.39971435546875,
|
| 241 |
+
"step": 3300
|
| 242 |
+
},
|
| 243 |
+
{
|
| 244 |
+
"epoch": 0.9984582629762866,
|
| 245 |
+
"grad_norm": 0.6124479174613953,
|
| 246 |
+
"learning_rate": 2.4893314366998576e-07,
|
| 247 |
+
"loss": 5.410068969726563,
|
| 248 |
+
"step": 3400
|
| 249 |
+
}
|
| 250 |
+
],
|
| 251 |
+
"logging_steps": 100,
|
| 252 |
+
"max_steps": 3406,
|
| 253 |
+
"num_input_tokens_seen": 0,
|
| 254 |
+
"num_train_epochs": 1,
|
| 255 |
+
"save_steps": 1000,
|
| 256 |
+
"stateful_callbacks": {
|
| 257 |
+
"TrainerControl": {
|
| 258 |
+
"args": {
|
| 259 |
+
"should_epoch_stop": false,
|
| 260 |
+
"should_evaluate": false,
|
| 261 |
+
"should_log": false,
|
| 262 |
+
"should_save": true,
|
| 263 |
+
"should_training_stop": true
|
| 264 |
+
},
|
| 265 |
+
"attributes": {}
|
| 266 |
+
}
|
| 267 |
+
},
|
| 268 |
+
"total_flos": 5.6944940285952e+16,
|
| 269 |
+
"train_batch_size": 4,
|
| 270 |
+
"trial_name": null,
|
| 271 |
+
"trial_params": null
|
| 272 |
+
}
|
gpt2_from_scratch_12layer/checkpoint-3406/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7365dbdab00faac01ab21cf58f54309413f9c58f1fdc95c4ee7a1e881ad0856d
|
| 3 |
+
size 4728
|
gpt2_from_scratch_12layer/config.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"activation_function": "gelu_new",
|
| 3 |
+
"add_cross_attention": false,
|
| 4 |
+
"architectures": [
|
| 5 |
+
"GPT2LMHeadModel"
|
| 6 |
+
],
|
| 7 |
+
"attn_pdrop": 0.1,
|
| 8 |
+
"bos_token_id": 50256,
|
| 9 |
+
"dtype": "float32",
|
| 10 |
+
"embd_pdrop": 0.1,
|
| 11 |
+
"eos_token_id": 50256,
|
| 12 |
+
"initializer_range": 0.02,
|
| 13 |
+
"layer_norm_epsilon": 1e-05,
|
| 14 |
+
"model_type": "gpt2",
|
| 15 |
+
"n_ctx": 1024,
|
| 16 |
+
"n_embd": 768,
|
| 17 |
+
"n_head": 12,
|
| 18 |
+
"n_inner": null,
|
| 19 |
+
"n_layer": 12,
|
| 20 |
+
"n_positions": 1024,
|
| 21 |
+
"pad_token_id": null,
|
| 22 |
+
"reorder_and_upcast_attn": false,
|
| 23 |
+
"resid_pdrop": 0.1,
|
| 24 |
+
"scale_attn_by_inverse_layer_idx": false,
|
| 25 |
+
"scale_attn_weights": true,
|
| 26 |
+
"summary_activation": null,
|
| 27 |
+
"summary_first_dropout": 0.1,
|
| 28 |
+
"summary_proj_to_labels": true,
|
| 29 |
+
"summary_type": "cls_index",
|
| 30 |
+
"summary_use_proj": true,
|
| 31 |
+
"tie_word_embeddings": true,
|
| 32 |
+
"transformers_version": "5.0.0",
|
| 33 |
+
"use_cache": false,
|
| 34 |
+
"vocab_size": 50257
|
| 35 |
+
}
|
gpt2_from_scratch_12layer/generation_config.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 50256,
|
| 4 |
+
"eos_token_id": 50256,
|
| 5 |
+
"output_attentions": false,
|
| 6 |
+
"output_hidden_states": false,
|
| 7 |
+
"transformers_version": "5.0.0",
|
| 8 |
+
"use_cache": true
|
| 9 |
+
}
|
gpt2_from_scratch_12layer/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:86199e7b994a7b0b267da4c3eda7f844a6fcf158c09ca8a2d64fd642ed4d044f
|
| 3 |
+
size 497774208
|
gpt2_from_scratch_12layer/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
gpt2_from_scratch_12layer/tokenizer_config.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"backend": "tokenizers",
|
| 3 |
+
"bos_token": "<s>",
|
| 4 |
+
"eos_token": "</s>",
|
| 5 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 6 |
+
"pad_token": "<pad>",
|
| 7 |
+
"tokenizer_class": "TokenizersBackend",
|
| 8 |
+
"unk_token": "<unk>"
|
| 9 |
+
}
|
gpt2_from_scratch_12layer/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7365dbdab00faac01ab21cf58f54309413f9c58f1fdc95c4ee7a1e881ad0856d
|
| 3 |
+
size 4728
|