meekre36 committed on Feb 7

Commit

3f281f9

verified ·

1 Parent(s): 078ed13

Upload 50 files

Browse files

Files changed (50) hide show

gpt2_from_scratch_12layer/checkpoint-1000/config.json +35 -0
gpt2_from_scratch_12layer/checkpoint-1000/generation_config.json +9 -0
gpt2_from_scratch_12layer/checkpoint-1000/model.safetensors +3 -0
gpt2_from_scratch_12layer/checkpoint-1000/optimizer.pt +3 -0
gpt2_from_scratch_12layer/checkpoint-1000/rng_state.pth +3 -0
gpt2_from_scratch_12layer/checkpoint-1000/scaler.pt +3 -0
gpt2_from_scratch_12layer/checkpoint-1000/scheduler.pt +3 -0
gpt2_from_scratch_12layer/checkpoint-1000/tokenizer.json +0 -0
gpt2_from_scratch_12layer/checkpoint-1000/tokenizer_config.json +9 -0
gpt2_from_scratch_12layer/checkpoint-1000/trainer_state.json +104 -0
gpt2_from_scratch_12layer/checkpoint-1000/training_args.bin +3 -0
gpt2_from_scratch_12layer/checkpoint-2000/config.json +35 -0
gpt2_from_scratch_12layer/checkpoint-2000/generation_config.json +9 -0
gpt2_from_scratch_12layer/checkpoint-2000/model.safetensors +3 -0
gpt2_from_scratch_12layer/checkpoint-2000/optimizer.pt +3 -0
gpt2_from_scratch_12layer/checkpoint-2000/rng_state.pth +3 -0
gpt2_from_scratch_12layer/checkpoint-2000/scaler.pt +3 -0
gpt2_from_scratch_12layer/checkpoint-2000/scheduler.pt +3 -0
gpt2_from_scratch_12layer/checkpoint-2000/tokenizer.json +0 -0
gpt2_from_scratch_12layer/checkpoint-2000/tokenizer_config.json +9 -0
gpt2_from_scratch_12layer/checkpoint-2000/trainer_state.json +174 -0
gpt2_from_scratch_12layer/checkpoint-2000/training_args.bin +3 -0
gpt2_from_scratch_12layer/checkpoint-3000/config.json +35 -0
gpt2_from_scratch_12layer/checkpoint-3000/generation_config.json +9 -0
gpt2_from_scratch_12layer/checkpoint-3000/model.safetensors +3 -0
gpt2_from_scratch_12layer/checkpoint-3000/optimizer.pt +3 -0
gpt2_from_scratch_12layer/checkpoint-3000/rng_state.pth +3 -0
gpt2_from_scratch_12layer/checkpoint-3000/scaler.pt +3 -0
gpt2_from_scratch_12layer/checkpoint-3000/scheduler.pt +3 -0
gpt2_from_scratch_12layer/checkpoint-3000/tokenizer.json +0 -0
gpt2_from_scratch_12layer/checkpoint-3000/tokenizer_config.json +9 -0
gpt2_from_scratch_12layer/checkpoint-3000/trainer_state.json +244 -0
gpt2_from_scratch_12layer/checkpoint-3000/training_args.bin +3 -0
gpt2_from_scratch_12layer/checkpoint-3406/config.json +35 -0
gpt2_from_scratch_12layer/checkpoint-3406/generation_config.json +9 -0
gpt2_from_scratch_12layer/checkpoint-3406/model.safetensors +3 -0
gpt2_from_scratch_12layer/checkpoint-3406/optimizer.pt +3 -0
gpt2_from_scratch_12layer/checkpoint-3406/rng_state.pth +3 -0
gpt2_from_scratch_12layer/checkpoint-3406/scaler.pt +3 -0
gpt2_from_scratch_12layer/checkpoint-3406/scheduler.pt +3 -0
gpt2_from_scratch_12layer/checkpoint-3406/tokenizer.json +0 -0
gpt2_from_scratch_12layer/checkpoint-3406/tokenizer_config.json +9 -0
gpt2_from_scratch_12layer/checkpoint-3406/trainer_state.json +272 -0
gpt2_from_scratch_12layer/checkpoint-3406/training_args.bin +3 -0
gpt2_from_scratch_12layer/config.json +35 -0
gpt2_from_scratch_12layer/generation_config.json +9 -0
gpt2_from_scratch_12layer/model.safetensors +3 -0
gpt2_from_scratch_12layer/tokenizer.json +0 -0
gpt2_from_scratch_12layer/tokenizer_config.json +9 -0
gpt2_from_scratch_12layer/training_args.bin +3 -0

gpt2_from_scratch_12layer/checkpoint-1000/config.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+  "activation_function": "gelu_new",
+  "add_cross_attention": false,
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "dtype": "float32",
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 12,
+  "n_positions": 1024,
+  "pad_token_id": null,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.0.0",
+  "use_cache": false,
+  "vocab_size": 50257
+}

gpt2_from_scratch_12layer/checkpoint-1000/generation_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "output_attentions": false,
+  "output_hidden_states": false,
+  "transformers_version": "5.0.0",
+  "use_cache": true
+}

gpt2_from_scratch_12layer/checkpoint-1000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b23254f47a6b7d6396d5cd5009f7b4e97c808f10c54cc13a522fc80a40a6f914
+size 497774208

gpt2_from_scratch_12layer/checkpoint-1000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c1d31359d42cea3697b2d87c673ef4d4a81a0834d8d6163b209575a6d00bac41
+size 995642298

gpt2_from_scratch_12layer/checkpoint-1000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:28325bd1f1d721c530c7ba38b64e73cb2cf1fdad7c3357d638f67a67744c8645
+size 14244

gpt2_from_scratch_12layer/checkpoint-1000/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9d8fdcd0311eba9854fff738038ed4c1a269832665b4d88ba4e4e3d02a1a7e0e
+size 988

gpt2_from_scratch_12layer/checkpoint-1000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:04a74a909d336b7124436e4cb0278258b381fc72bf5b206e1c024e4444ff4f32
+size 1064

gpt2_from_scratch_12layer/checkpoint-1000/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

gpt2_from_scratch_12layer/checkpoint-1000/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "backend": "tokenizers",
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "tokenizer_class": "TokenizersBackend",
+  "unk_token": "<unk>"
+}

gpt2_from_scratch_12layer/checkpoint-1000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,104 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.2936641949930255,
+  "eval_steps": 500,
+  "global_step": 1000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.02936641949930255,
+      "grad_norm": 2.2842259407043457,
+      "learning_rate": 2.4750000000000004e-06,
+      "loss": 10.393255615234375,
+      "step": 100
+    },
+    {
+      "epoch": 0.0587328389986051,
+      "grad_norm": 1.976091980934143,
+      "learning_rate": 4.975000000000001e-06,
+      "loss": 9.357327270507813,
+      "step": 200
+    },
+    {
+      "epoch": 0.08809925849790765,
+      "grad_norm": 1.6418145895004272,
+      "learning_rate": 7.4750000000000004e-06,
+      "loss": 8.744969482421874,
+      "step": 300
+    },
+    {
+      "epoch": 0.1174656779972102,
+      "grad_norm": 1.1453146934509277,
+      "learning_rate": 9.975e-06,
+      "loss": 8.003826904296876,
+      "step": 400
+    },
+    {
+      "epoch": 0.14683209749651274,
+      "grad_norm": 0.6994723677635193,
+      "learning_rate": 1.2475e-05,
+      "loss": 7.452492065429688,
+      "step": 500
+    },
+    {
+      "epoch": 0.1761985169958153,
+      "grad_norm": 0.4603072702884674,
+      "learning_rate": 1.4975e-05,
+      "loss": 7.1382373046875,
+      "step": 600
+    },
+    {
+      "epoch": 0.20556493649511784,
+      "grad_norm": 0.4629450738430023,
+      "learning_rate": 1.7475e-05,
+      "loss": 6.968035888671875,
+      "step": 700
+    },
+    {
+      "epoch": 0.2349313559944204,
+      "grad_norm": 0.5266813635826111,
+      "learning_rate": 1.9975e-05,
+      "loss": 6.8181103515625,
+      "step": 800
+    },
+    {
+      "epoch": 0.2642977754937229,
+      "grad_norm": 0.5502268671989441,
+      "learning_rate": 2.2475e-05,
+      "loss": 6.682680053710937,
+      "step": 900
+    },
+    {
+      "epoch": 0.2936641949930255,
+      "grad_norm": 0.537894606590271,
+      "learning_rate": 2.4975e-05,
+      "loss": 6.568981323242188,
+      "step": 1000
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 3406,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 1000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.6722690048e+16,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

gpt2_from_scratch_12layer/checkpoint-1000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7365dbdab00faac01ab21cf58f54309413f9c58f1fdc95c4ee7a1e881ad0856d
+size 4728

gpt2_from_scratch_12layer/checkpoint-2000/config.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+  "activation_function": "gelu_new",
+  "add_cross_attention": false,
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "dtype": "float32",
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 12,
+  "n_positions": 1024,
+  "pad_token_id": null,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.0.0",
+  "use_cache": false,
+  "vocab_size": 50257
+}

gpt2_from_scratch_12layer/checkpoint-2000/generation_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "output_attentions": false,
+  "output_hidden_states": false,
+  "transformers_version": "5.0.0",
+  "use_cache": true
+}

gpt2_from_scratch_12layer/checkpoint-2000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:043f01835a7fc728da107643ad158db644191333dbde9b03bb14fc00283f9960
+size 497774208

gpt2_from_scratch_12layer/checkpoint-2000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e237c1420cfb71c0b92f02f0a00096093dd8b9b8d0c31f9e977a185ce40b82f3
+size 995642298

gpt2_from_scratch_12layer/checkpoint-2000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f95a2c63a0eaf9d82cb93bfc10ea04f73b732c5d5ce79e4bff972ef3f9449c92
+size 14244

gpt2_from_scratch_12layer/checkpoint-2000/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3c50a9cebe5d66d453d25b140738bff479749ac03e0a43597d8776bc22f6ed0c
+size 988

gpt2_from_scratch_12layer/checkpoint-2000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f3ea9d13baff2282d300ceb3c3984a3388d1450303ffc8640c73967fa3325903
+size 1064

gpt2_from_scratch_12layer/checkpoint-2000/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

gpt2_from_scratch_12layer/checkpoint-2000/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "backend": "tokenizers",
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "tokenizer_class": "TokenizersBackend",
+  "unk_token": "<unk>"
+}

gpt2_from_scratch_12layer/checkpoint-2000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,174 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.587328389986051,
+  "eval_steps": 500,
+  "global_step": 2000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.02936641949930255,
+      "grad_norm": 2.2842259407043457,
+      "learning_rate": 2.4750000000000004e-06,
+      "loss": 10.393255615234375,
+      "step": 100
+    },
+    {
+      "epoch": 0.0587328389986051,
+      "grad_norm": 1.976091980934143,
+      "learning_rate": 4.975000000000001e-06,
+      "loss": 9.357327270507813,
+      "step": 200
+    },
+    {
+      "epoch": 0.08809925849790765,
+      "grad_norm": 1.6418145895004272,
+      "learning_rate": 7.4750000000000004e-06,
+      "loss": 8.744969482421874,
+      "step": 300
+    },
+    {
+      "epoch": 0.1174656779972102,
+      "grad_norm": 1.1453146934509277,
+      "learning_rate": 9.975e-06,
+      "loss": 8.003826904296876,
+      "step": 400
+    },
+    {
+      "epoch": 0.14683209749651274,
+      "grad_norm": 0.6994723677635193,
+      "learning_rate": 1.2475e-05,
+      "loss": 7.452492065429688,
+      "step": 500
+    },
+    {
+      "epoch": 0.1761985169958153,
+      "grad_norm": 0.4603072702884674,
+      "learning_rate": 1.4975e-05,
+      "loss": 7.1382373046875,
+      "step": 600
+    },
+    {
+      "epoch": 0.20556493649511784,
+      "grad_norm": 0.4629450738430023,
+      "learning_rate": 1.7475e-05,
+      "loss": 6.968035888671875,
+      "step": 700
+    },
+    {
+      "epoch": 0.2349313559944204,
+      "grad_norm": 0.5266813635826111,
+      "learning_rate": 1.9975e-05,
+      "loss": 6.8181103515625,
+      "step": 800
+    },
+    {
+      "epoch": 0.2642977754937229,
+      "grad_norm": 0.5502268671989441,
+      "learning_rate": 2.2475e-05,
+      "loss": 6.682680053710937,
+      "step": 900
+    },
+    {
+      "epoch": 0.2936641949930255,
+      "grad_norm": 0.537894606590271,
+      "learning_rate": 2.4975e-05,
+      "loss": 6.568981323242188,
+      "step": 1000
+    },
+    {
+      "epoch": 0.323030614492328,
+      "grad_norm": 0.5135723352432251,
+      "learning_rate": 2.7475e-05,
+      "loss": 6.471431884765625,
+      "step": 1100
+    },
+    {
+      "epoch": 0.3523970339916306,
+      "grad_norm": 0.606870710849762,
+      "learning_rate": 2.9975000000000004e-05,
+      "loss": 6.3824462890625,
+      "step": 1200
+    },
+    {
+      "epoch": 0.3817634534909331,
+      "grad_norm": 0.5291919112205505,
+      "learning_rate": 3.2474999999999997e-05,
+      "loss": 6.302595825195312,
+      "step": 1300
+    },
+    {
+      "epoch": 0.4111298729902357,
+      "grad_norm": 0.6090461015701294,
+      "learning_rate": 3.4975e-05,
+      "loss": 6.223634033203125,
+      "step": 1400
+    },
+    {
+      "epoch": 0.4404962924895382,
+      "grad_norm": 0.5523635149002075,
+      "learning_rate": 3.7475e-05,
+      "loss": 6.154580688476562,
+      "step": 1500
+    },
+    {
+      "epoch": 0.4698627119888408,
+      "grad_norm": 0.6641230583190918,
+      "learning_rate": 3.9975e-05,
+      "loss": 6.086353759765625,
+      "step": 1600
+    },
+    {
+      "epoch": 0.4992291314881433,
+      "grad_norm": 0.6724914908409119,
+      "learning_rate": 4.2475e-05,
+      "loss": 6.030512084960938,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5285955509874458,
+      "grad_norm": 0.5981016755104065,
+      "learning_rate": 4.4975e-05,
+      "loss": 5.963157348632812,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5579619704867484,
+      "grad_norm": 0.676860511302948,
+      "learning_rate": 4.7475e-05,
+      "loss": 5.894300537109375,
+      "step": 1900
+    },
+    {
+      "epoch": 0.587328389986051,
+      "grad_norm": 0.6556357741355896,
+      "learning_rate": 4.9975e-05,
+      "loss": 5.844266967773438,
+      "step": 2000
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 3406,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 1000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.3445380096e+16,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

gpt2_from_scratch_12layer/checkpoint-2000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7365dbdab00faac01ab21cf58f54309413f9c58f1fdc95c4ee7a1e881ad0856d
+size 4728

gpt2_from_scratch_12layer/checkpoint-3000/config.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+  "activation_function": "gelu_new",
+  "add_cross_attention": false,
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "dtype": "float32",
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 12,
+  "n_positions": 1024,
+  "pad_token_id": null,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.0.0",
+  "use_cache": false,
+  "vocab_size": 50257
+}

gpt2_from_scratch_12layer/checkpoint-3000/generation_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "output_attentions": false,
+  "output_hidden_states": false,
+  "transformers_version": "5.0.0",
+  "use_cache": true
+}

gpt2_from_scratch_12layer/checkpoint-3000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8c4776ac0444ad6001151629fdfa49402a4713258dc905b86f078d45300d610f
+size 497774208

gpt2_from_scratch_12layer/checkpoint-3000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dfaf5b9fae33481d91060504d67e57291ff13d8fe917f28251b971b4e8b8684a
+size 995642298

gpt2_from_scratch_12layer/checkpoint-3000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1e10f3460a731b355fd0ba5229f0bad8de79ebe7909166a3ad7f90c89b83dda5
+size 14244

gpt2_from_scratch_12layer/checkpoint-3000/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:21aba8ed0f38ed1c04994c10a9ca7e9925e55ef2ed51283c43ff8e2cce78585f
+size 988

gpt2_from_scratch_12layer/checkpoint-3000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5e7038e46af7868cb4110f4906a05bd0c0cfeb8b51264c714a479c44c4014e81
+size 1064

gpt2_from_scratch_12layer/checkpoint-3000/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

gpt2_from_scratch_12layer/checkpoint-3000/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "backend": "tokenizers",
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "tokenizer_class": "TokenizersBackend",
+  "unk_token": "<unk>"
+}

gpt2_from_scratch_12layer/checkpoint-3000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,244 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.8809925849790764,
+  "eval_steps": 500,
+  "global_step": 3000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.02936641949930255,
+      "grad_norm": 2.2842259407043457,
+      "learning_rate": 2.4750000000000004e-06,
+      "loss": 10.393255615234375,
+      "step": 100
+    },
+    {
+      "epoch": 0.0587328389986051,
+      "grad_norm": 1.976091980934143,
+      "learning_rate": 4.975000000000001e-06,
+      "loss": 9.357327270507813,
+      "step": 200
+    },
+    {
+      "epoch": 0.08809925849790765,
+      "grad_norm": 1.6418145895004272,
+      "learning_rate": 7.4750000000000004e-06,
+      "loss": 8.744969482421874,
+      "step": 300
+    },
+    {
+      "epoch": 0.1174656779972102,
+      "grad_norm": 1.1453146934509277,
+      "learning_rate": 9.975e-06,
+      "loss": 8.003826904296876,
+      "step": 400
+    },
+    {
+      "epoch": 0.14683209749651274,
+      "grad_norm": 0.6994723677635193,
+      "learning_rate": 1.2475e-05,
+      "loss": 7.452492065429688,
+      "step": 500
+    },
+    {
+      "epoch": 0.1761985169958153,
+      "grad_norm": 0.4603072702884674,
+      "learning_rate": 1.4975e-05,
+      "loss": 7.1382373046875,
+      "step": 600
+    },
+    {
+      "epoch": 0.20556493649511784,
+      "grad_norm": 0.4629450738430023,
+      "learning_rate": 1.7475e-05,
+      "loss": 6.968035888671875,
+      "step": 700
+    },
+    {
+      "epoch": 0.2349313559944204,
+      "grad_norm": 0.5266813635826111,
+      "learning_rate": 1.9975e-05,
+      "loss": 6.8181103515625,
+      "step": 800
+    },
+    {
+      "epoch": 0.2642977754937229,
+      "grad_norm": 0.5502268671989441,
+      "learning_rate": 2.2475e-05,
+      "loss": 6.682680053710937,
+      "step": 900
+    },
+    {
+      "epoch": 0.2936641949930255,
+      "grad_norm": 0.537894606590271,
+      "learning_rate": 2.4975e-05,
+      "loss": 6.568981323242188,
+      "step": 1000
+    },
+    {
+      "epoch": 0.323030614492328,
+      "grad_norm": 0.5135723352432251,
+      "learning_rate": 2.7475e-05,
+      "loss": 6.471431884765625,
+      "step": 1100
+    },
+    {
+      "epoch": 0.3523970339916306,
+      "grad_norm": 0.606870710849762,
+      "learning_rate": 2.9975000000000004e-05,
+      "loss": 6.3824462890625,
+      "step": 1200
+    },
+    {
+      "epoch": 0.3817634534909331,
+      "grad_norm": 0.5291919112205505,
+      "learning_rate": 3.2474999999999997e-05,
+      "loss": 6.302595825195312,
+      "step": 1300
+    },
+    {
+      "epoch": 0.4111298729902357,
+      "grad_norm": 0.6090461015701294,
+      "learning_rate": 3.4975e-05,
+      "loss": 6.223634033203125,
+      "step": 1400
+    },
+    {
+      "epoch": 0.4404962924895382,
+      "grad_norm": 0.5523635149002075,
+      "learning_rate": 3.7475e-05,
+      "loss": 6.154580688476562,
+      "step": 1500
+    },
+    {
+      "epoch": 0.4698627119888408,
+      "grad_norm": 0.6641230583190918,
+      "learning_rate": 3.9975e-05,
+      "loss": 6.086353759765625,
+      "step": 1600
+    },
+    {
+      "epoch": 0.4992291314881433,
+      "grad_norm": 0.6724914908409119,
+      "learning_rate": 4.2475e-05,
+      "loss": 6.030512084960938,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5285955509874458,
+      "grad_norm": 0.5981016755104065,
+      "learning_rate": 4.4975e-05,
+      "loss": 5.963157348632812,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5579619704867484,
+      "grad_norm": 0.676860511302948,
+      "learning_rate": 4.7475e-05,
+      "loss": 5.894300537109375,
+      "step": 1900
+    },
+    {
+      "epoch": 0.587328389986051,
+      "grad_norm": 0.6556357741355896,
+      "learning_rate": 4.9975e-05,
+      "loss": 5.844266967773438,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6166948094853535,
+      "grad_norm": 0.7801370024681091,
+      "learning_rate": 4.647937411095306e-05,
+      "loss": 5.78113037109375,
+      "step": 2100
+    },
+    {
+      "epoch": 0.646061228984656,
+      "grad_norm": 0.7802927494049072,
+      "learning_rate": 4.292318634423898e-05,
+      "loss": 5.719049072265625,
+      "step": 2200
+    },
+    {
+      "epoch": 0.6754276484839586,
+      "grad_norm": 0.6435455083847046,
+      "learning_rate": 3.936699857752489e-05,
+      "loss": 5.66756591796875,
+      "step": 2300
+    },
+    {
+      "epoch": 0.7047940679832612,
+      "grad_norm": 0.6630441546440125,
+      "learning_rate": 3.581081081081081e-05,
+      "loss": 5.636476440429687,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7341604874825637,
+      "grad_norm": 0.8329909443855286,
+      "learning_rate": 3.225462304409673e-05,
+      "loss": 5.58530029296875,
+      "step": 2500
+    },
+    {
+      "epoch": 0.7635269069818662,
+      "grad_norm": 0.74227374792099,
+      "learning_rate": 2.8698435277382645e-05,
+      "loss": 5.5438079833984375,
+      "step": 2600
+    },
+    {
+      "epoch": 0.7928933264811688,
+      "grad_norm": 0.6876152157783508,
+      "learning_rate": 2.5142247510668564e-05,
+      "loss": 5.508399658203125,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8222597459804714,
+      "grad_norm": 0.6679750084877014,
+      "learning_rate": 2.158605974395448e-05,
+      "loss": 5.489833374023437,
+      "step": 2800
+    },
+    {
+      "epoch": 0.8516261654797739,
+      "grad_norm": 0.7488402724266052,
+      "learning_rate": 1.80298719772404e-05,
+      "loss": 5.467451171875,
+      "step": 2900
+    },
+    {
+      "epoch": 0.8809925849790764,
+      "grad_norm": 0.7311998009681702,
+      "learning_rate": 1.4473684210526317e-05,
+      "loss": 5.443863525390625,
+      "step": 3000
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 3406,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 1000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.0168070144e+16,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

gpt2_from_scratch_12layer/checkpoint-3000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7365dbdab00faac01ab21cf58f54309413f9c58f1fdc95c4ee7a1e881ad0856d
+size 4728

gpt2_from_scratch_12layer/checkpoint-3406/config.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+  "activation_function": "gelu_new",
+  "add_cross_attention": false,
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "dtype": "float32",
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 12,
+  "n_positions": 1024,
+  "pad_token_id": null,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.0.0",
+  "use_cache": false,
+  "vocab_size": 50257
+}

gpt2_from_scratch_12layer/checkpoint-3406/generation_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "output_attentions": false,
+  "output_hidden_states": false,
+  "transformers_version": "5.0.0",
+  "use_cache": true
+}

gpt2_from_scratch_12layer/checkpoint-3406/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:86199e7b994a7b0b267da4c3eda7f844a6fcf158c09ca8a2d64fd642ed4d044f
+size 497774208

gpt2_from_scratch_12layer/checkpoint-3406/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a234f38b142d3331a7a53adf952f321a87a1a943ceb3a10c9a99fecda56470ae
+size 995642298

gpt2_from_scratch_12layer/checkpoint-3406/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cd2adfaeedd991b9ba6834e4ef7b91c840c2ead3fbf06beee6ad92b6087edec7
+size 14244

gpt2_from_scratch_12layer/checkpoint-3406/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e410308f67ca645aabf384cead7bdf7525d526a77cf7e6bf1191440bee76dba
+size 988

gpt2_from_scratch_12layer/checkpoint-3406/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d145063085675d13b94353d3de9c3206e1d55eccf9fe47bdda38c645520ea479
+size 1064

gpt2_from_scratch_12layer/checkpoint-3406/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

gpt2_from_scratch_12layer/checkpoint-3406/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "backend": "tokenizers",
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "tokenizer_class": "TokenizersBackend",
+  "unk_token": "<unk>"
+}

gpt2_from_scratch_12layer/checkpoint-3406/trainer_state.json ADDED Viewed

	@@ -0,0 +1,272 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 3406,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.02936641949930255,
+      "grad_norm": 2.2842259407043457,
+      "learning_rate": 2.4750000000000004e-06,
+      "loss": 10.393255615234375,
+      "step": 100
+    },
+    {
+      "epoch": 0.0587328389986051,
+      "grad_norm": 1.976091980934143,
+      "learning_rate": 4.975000000000001e-06,
+      "loss": 9.357327270507813,
+      "step": 200
+    },
+    {
+      "epoch": 0.08809925849790765,
+      "grad_norm": 1.6418145895004272,
+      "learning_rate": 7.4750000000000004e-06,
+      "loss": 8.744969482421874,
+      "step": 300
+    },
+    {
+      "epoch": 0.1174656779972102,
+      "grad_norm": 1.1453146934509277,
+      "learning_rate": 9.975e-06,
+      "loss": 8.003826904296876,
+      "step": 400
+    },
+    {
+      "epoch": 0.14683209749651274,
+      "grad_norm": 0.6994723677635193,
+      "learning_rate": 1.2475e-05,
+      "loss": 7.452492065429688,
+      "step": 500
+    },
+    {
+      "epoch": 0.1761985169958153,
+      "grad_norm": 0.4603072702884674,
+      "learning_rate": 1.4975e-05,
+      "loss": 7.1382373046875,
+      "step": 600
+    },
+    {
+      "epoch": 0.20556493649511784,
+      "grad_norm": 0.4629450738430023,
+      "learning_rate": 1.7475e-05,
+      "loss": 6.968035888671875,
+      "step": 700
+    },
+    {
+      "epoch": 0.2349313559944204,
+      "grad_norm": 0.5266813635826111,
+      "learning_rate": 1.9975e-05,
+      "loss": 6.8181103515625,
+      "step": 800
+    },
+    {
+      "epoch": 0.2642977754937229,
+      "grad_norm": 0.5502268671989441,
+      "learning_rate": 2.2475e-05,
+      "loss": 6.682680053710937,
+      "step": 900
+    },
+    {
+      "epoch": 0.2936641949930255,
+      "grad_norm": 0.537894606590271,
+      "learning_rate": 2.4975e-05,
+      "loss": 6.568981323242188,
+      "step": 1000
+    },
+    {
+      "epoch": 0.323030614492328,
+      "grad_norm": 0.5135723352432251,
+      "learning_rate": 2.7475e-05,
+      "loss": 6.471431884765625,
+      "step": 1100
+    },
+    {
+      "epoch": 0.3523970339916306,
+      "grad_norm": 0.606870710849762,
+      "learning_rate": 2.9975000000000004e-05,
+      "loss": 6.3824462890625,
+      "step": 1200
+    },
+    {
+      "epoch": 0.3817634534909331,
+      "grad_norm": 0.5291919112205505,
+      "learning_rate": 3.2474999999999997e-05,
+      "loss": 6.302595825195312,
+      "step": 1300
+    },
+    {
+      "epoch": 0.4111298729902357,
+      "grad_norm": 0.6090461015701294,
+      "learning_rate": 3.4975e-05,
+      "loss": 6.223634033203125,
+      "step": 1400
+    },
+    {
+      "epoch": 0.4404962924895382,
+      "grad_norm": 0.5523635149002075,
+      "learning_rate": 3.7475e-05,
+      "loss": 6.154580688476562,
+      "step": 1500
+    },
+    {
+      "epoch": 0.4698627119888408,
+      "grad_norm": 0.6641230583190918,
+      "learning_rate": 3.9975e-05,
+      "loss": 6.086353759765625,
+      "step": 1600
+    },
+    {
+      "epoch": 0.4992291314881433,
+      "grad_norm": 0.6724914908409119,
+      "learning_rate": 4.2475e-05,
+      "loss": 6.030512084960938,
+      "step": 1700
+    },
+    {
+      "epoch": 0.5285955509874458,
+      "grad_norm": 0.5981016755104065,
+      "learning_rate": 4.4975e-05,
+      "loss": 5.963157348632812,
+      "step": 1800
+    },
+    {
+      "epoch": 0.5579619704867484,
+      "grad_norm": 0.676860511302948,
+      "learning_rate": 4.7475e-05,
+      "loss": 5.894300537109375,
+      "step": 1900
+    },
+    {
+      "epoch": 0.587328389986051,
+      "grad_norm": 0.6556357741355896,
+      "learning_rate": 4.9975e-05,
+      "loss": 5.844266967773438,
+      "step": 2000
+    },
+    {
+      "epoch": 0.6166948094853535,
+      "grad_norm": 0.7801370024681091,
+      "learning_rate": 4.647937411095306e-05,
+      "loss": 5.78113037109375,
+      "step": 2100
+    },
+    {
+      "epoch": 0.646061228984656,
+      "grad_norm": 0.7802927494049072,
+      "learning_rate": 4.292318634423898e-05,
+      "loss": 5.719049072265625,
+      "step": 2200
+    },
+    {
+      "epoch": 0.6754276484839586,
+      "grad_norm": 0.6435455083847046,
+      "learning_rate": 3.936699857752489e-05,
+      "loss": 5.66756591796875,
+      "step": 2300
+    },
+    {
+      "epoch": 0.7047940679832612,
+      "grad_norm": 0.6630441546440125,
+      "learning_rate": 3.581081081081081e-05,
+      "loss": 5.636476440429687,
+      "step": 2400
+    },
+    {
+      "epoch": 0.7341604874825637,
+      "grad_norm": 0.8329909443855286,
+      "learning_rate": 3.225462304409673e-05,
+      "loss": 5.58530029296875,
+      "step": 2500
+    },
+    {
+      "epoch": 0.7635269069818662,
+      "grad_norm": 0.74227374792099,
+      "learning_rate": 2.8698435277382645e-05,
+      "loss": 5.5438079833984375,
+      "step": 2600
+    },
+    {
+      "epoch": 0.7928933264811688,
+      "grad_norm": 0.6876152157783508,
+      "learning_rate": 2.5142247510668564e-05,
+      "loss": 5.508399658203125,
+      "step": 2700
+    },
+    {
+      "epoch": 0.8222597459804714,
+      "grad_norm": 0.6679750084877014,
+      "learning_rate": 2.158605974395448e-05,
+      "loss": 5.489833374023437,
+      "step": 2800
+    },
+    {
+      "epoch": 0.8516261654797739,
+      "grad_norm": 0.7488402724266052,
+      "learning_rate": 1.80298719772404e-05,
+      "loss": 5.467451171875,
+      "step": 2900
+    },
+    {
+      "epoch": 0.8809925849790764,
+      "grad_norm": 0.7311998009681702,
+      "learning_rate": 1.4473684210526317e-05,
+      "loss": 5.443863525390625,
+      "step": 3000
+    },
+    {
+      "epoch": 0.9103590044783789,
+      "grad_norm": 0.6423781514167786,
+      "learning_rate": 1.0917496443812234e-05,
+      "loss": 5.427085571289062,
+      "step": 3100
+    },
+    {
+      "epoch": 0.9397254239776816,
+      "grad_norm": 0.6591918468475342,
+      "learning_rate": 7.361308677098151e-06,
+      "loss": 5.414056396484375,
+      "step": 3200
+    },
+    {
+      "epoch": 0.9690918434769841,
+      "grad_norm": 0.6228283643722534,
+      "learning_rate": 3.8051209103840685e-06,
+      "loss": 5.39971435546875,
+      "step": 3300
+    },
+    {
+      "epoch": 0.9984582629762866,
+      "grad_norm": 0.6124479174613953,
+      "learning_rate": 2.4893314366998576e-07,
+      "loss": 5.410068969726563,
+      "step": 3400
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 3406,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 1000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.6944940285952e+16,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}

gpt2_from_scratch_12layer/checkpoint-3406/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7365dbdab00faac01ab21cf58f54309413f9c58f1fdc95c4ee7a1e881ad0856d
+size 4728

gpt2_from_scratch_12layer/config.json ADDED Viewed

	@@ -0,0 +1,35 @@

+{
+  "activation_function": "gelu_new",
+  "add_cross_attention": false,
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "dtype": "float32",
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 12,
+  "n_positions": 1024,
+  "pad_token_id": null,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.0.0",
+  "use_cache": false,
+  "vocab_size": 50257
+}

gpt2_from_scratch_12layer/generation_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "output_attentions": false,
+  "output_hidden_states": false,
+  "transformers_version": "5.0.0",
+  "use_cache": true
+}

gpt2_from_scratch_12layer/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:86199e7b994a7b0b267da4c3eda7f844a6fcf158c09ca8a2d64fd642ed4d044f
+size 497774208

gpt2_from_scratch_12layer/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

gpt2_from_scratch_12layer/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "backend": "tokenizers",
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "tokenizer_class": "TokenizersBackend",
+  "unk_token": "<unk>"
+}

gpt2_from_scratch_12layer/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7365dbdab00faac01ab21cf58f54309413f9c58f1fdc95c4ee7a1e881ad0856d
+size 4728