QuantaSparkLabs commited on 6 days ago

Commit

2243c4f

verified ·

1 Parent(s): 8c23b4b

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

checkpoint-100/config.json +34 -0
checkpoint-100/generation_config.json +10 -0
checkpoint-100/model.safetensors +3 -0
checkpoint-100/optimizer.pt +3 -0
checkpoint-100/rng_state.pth +3 -0
checkpoint-100/scheduler.pt +3 -0
checkpoint-100/trainer_state.json +104 -0
checkpoint-100/training_args.bin +3 -0
checkpoint-1000/config.json +34 -0
checkpoint-1000/generation_config.json +10 -0
checkpoint-1000/model.safetensors +3 -0
checkpoint-1000/optimizer.pt +3 -0
checkpoint-1000/rng_state.pth +3 -0
checkpoint-1000/scheduler.pt +3 -0
checkpoint-1000/trainer_state.json +734 -0
checkpoint-1000/training_args.bin +3 -0
checkpoint-1050/config.json +34 -0
checkpoint-1050/generation_config.json +10 -0
checkpoint-1050/model.safetensors +3 -0
checkpoint-1050/optimizer.pt +3 -0
checkpoint-1050/rng_state.pth +3 -0
checkpoint-1050/scheduler.pt +3 -0
checkpoint-1050/trainer_state.json +769 -0
checkpoint-1050/training_args.bin +3 -0
checkpoint-1100/config.json +34 -0
checkpoint-1100/generation_config.json +10 -0
checkpoint-1100/model.safetensors +3 -0
checkpoint-1100/optimizer.pt +3 -0
checkpoint-1100/rng_state.pth +3 -0
checkpoint-1100/scheduler.pt +3 -0
checkpoint-1100/trainer_state.json +804 -0
checkpoint-1100/training_args.bin +3 -0
checkpoint-1150/config.json +34 -0
checkpoint-1150/generation_config.json +10 -0
checkpoint-1150/model.safetensors +3 -0
checkpoint-1150/optimizer.pt +3 -0
checkpoint-1150/rng_state.pth +3 -0
checkpoint-1150/scheduler.pt +3 -0
checkpoint-1150/trainer_state.json +839 -0
checkpoint-1150/training_args.bin +3 -0
checkpoint-1200/config.json +34 -0
checkpoint-1200/generation_config.json +10 -0
checkpoint-1200/model.safetensors +3 -0
checkpoint-1200/optimizer.pt +3 -0
checkpoint-1200/rng_state.pth +3 -0
checkpoint-1200/scheduler.pt +3 -0
checkpoint-1200/trainer_state.json +874 -0
checkpoint-1200/training_args.bin +3 -0
checkpoint-1250/config.json +34 -0
checkpoint-1250/generation_config.json +10 -0

checkpoint-100/config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "activation_function": "gelu",
+  "add_cross_attention": false,
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.0,
+  "bos_token_id": 1,
+  "dtype": "float32",
+  "embd_pdrop": 0.0,
+  "eos_token_id": 1,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_embd": 1,
+  "n_head": 1,
+  "n_inner": 1,
+  "n_layer": 1,
+  "n_positions": 1,
+  "pad_token_id": 0,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.0,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.12.0",
+  "use_cache": false,
+  "vocab_size": 2
+}

checkpoint-100/generation_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 1,
+  "output_attentions": false,
+  "output_hidden_states": false,
+  "pad_token_id": 0,
+  "transformers_version": "5.12.0",
+  "use_cache": true
+}

checkpoint-100/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:880c2ec6c05c1e38595824792a1d3850592746c746ee2caa95bee8cb0e8ec77d
+size 1452

checkpoint-100/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0e17dc36b91710af38919a82d28733c78849a9938e64d50b056fde953e08af55
+size 13823

checkpoint-100/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:656c952bc98f1ba6483f4b602ab79d3a7eb64d231d7b2b6ae517f06e7e137155
+size 14455

checkpoint-100/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2a487ad8594fd184a1dbb5f7128f07682e4d70038d880a5d45dd29b502807a0b
+size 1465

checkpoint-100/trainer_state.json ADDED Viewed

	@@ -0,0 +1,104 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 12.5,
+  "eval_steps": 500,
+  "global_step": 100,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009989987484355445,
+      "loss": 0.0,
+      "step": 10
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009964956195244054,
+      "loss": 0.0,
+      "step": 20
+    },
+    {
+      "epoch": 3.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009939924906132666,
+      "loss": 0.0,
+      "step": 30
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009914893617021277,
+      "loss": 0.0,
+      "step": 40
+    },
+    {
+      "epoch": 6.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009889862327909888,
+      "loss": 0.0,
+      "step": 50
+    },
+    {
+      "epoch": 7.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009864831038798498,
+      "loss": 0.0,
+      "step": 60
+    },
+    {
+      "epoch": 8.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009839799749687109,
+      "loss": 0.0,
+      "step": 70
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00981476846057572,
+      "loss": 0.0,
+      "step": 80
+    },
+    {
+      "epoch": 11.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00978973717146433,
+      "loss": 0.0,
+      "step": 90
+    },
+    {
+      "epoch": 12.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009764705882352941,
+      "loss": 0.0,
+      "step": 100
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 4000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 500,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 675648.0,
+  "train_batch_size": 64,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-100/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c6208b173b902d75ab9814341e70009f66ecc0ecd70d024038fcaaf650010173
+size 5137

checkpoint-1000/config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "activation_function": "gelu",
+  "add_cross_attention": false,
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.0,
+  "bos_token_id": 1,
+  "dtype": "float32",
+  "embd_pdrop": 0.0,
+  "eos_token_id": 1,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_embd": 1,
+  "n_head": 1,
+  "n_inner": 1,
+  "n_layer": 1,
+  "n_positions": 1,
+  "pad_token_id": 0,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.0,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.12.0",
+  "use_cache": false,
+  "vocab_size": 2
+}

checkpoint-1000/generation_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 1,
+  "output_attentions": false,
+  "output_hidden_states": false,
+  "pad_token_id": 0,
+  "transformers_version": "5.12.0",
+  "use_cache": true
+}

checkpoint-1000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:880c2ec6c05c1e38595824792a1d3850592746c746ee2caa95bee8cb0e8ec77d
+size 1452

checkpoint-1000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4683cfa458233e00c198c54038d450bfd06ca52f719705e01fc34a4845b539a2
+size 13823

checkpoint-1000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c92683043c9a8610fa78e10e63d70e47ebd8152c60d1cab4d893b74a45bb5db4
+size 14455

checkpoint-1000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:433f845f661e9be47ccaee189de347cf46b20ad6176b2cfd945b4c290cad9fc8
+size 1465

checkpoint-1000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,734 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 125.0,
+  "eval_steps": 500,
+  "global_step": 1000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009989987484355445,
+      "loss": 0.0,
+      "step": 10
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009964956195244054,
+      "loss": 0.0,
+      "step": 20
+    },
+    {
+      "epoch": 3.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009939924906132666,
+      "loss": 0.0,
+      "step": 30
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009914893617021277,
+      "loss": 0.0,
+      "step": 40
+    },
+    {
+      "epoch": 6.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009889862327909888,
+      "loss": 0.0,
+      "step": 50
+    },
+    {
+      "epoch": 7.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009864831038798498,
+      "loss": 0.0,
+      "step": 60
+    },
+    {
+      "epoch": 8.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009839799749687109,
+      "loss": 0.0,
+      "step": 70
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00981476846057572,
+      "loss": 0.0,
+      "step": 80
+    },
+    {
+      "epoch": 11.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00978973717146433,
+      "loss": 0.0,
+      "step": 90
+    },
+    {
+      "epoch": 12.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009764705882352941,
+      "loss": 0.0,
+      "step": 100
+    },
+    {
+      "epoch": 13.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009739674593241552,
+      "loss": 0.0,
+      "step": 110
+    },
+    {
+      "epoch": 15.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009714643304130162,
+      "loss": 0.0,
+      "step": 120
+    },
+    {
+      "epoch": 16.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009689612015018775,
+      "loss": 0.0,
+      "step": 130
+    },
+    {
+      "epoch": 17.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009664580725907385,
+      "loss": 0.0,
+      "step": 140
+    },
+    {
+      "epoch": 18.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009639549436795996,
+      "loss": 0.0,
+      "step": 150
+    },
+    {
+      "epoch": 20.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009614518147684605,
+      "loss": 0.0,
+      "step": 160
+    },
+    {
+      "epoch": 21.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009589486858573217,
+      "loss": 0.0,
+      "step": 170
+    },
+    {
+      "epoch": 22.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009564455569461828,
+      "loss": 0.0,
+      "step": 180
+    },
+    {
+      "epoch": 23.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009539424280350439,
+      "loss": 0.0,
+      "step": 190
+    },
+    {
+      "epoch": 25.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00951439299123905,
+      "loss": 0.0,
+      "step": 200
+    },
+    {
+      "epoch": 26.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00948936170212766,
+      "loss": 0.0,
+      "step": 210
+    },
+    {
+      "epoch": 27.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00946433041301627,
+      "loss": 0.0,
+      "step": 220
+    },
+    {
+      "epoch": 28.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009439299123904881,
+      "loss": 0.0,
+      "step": 230
+    },
+    {
+      "epoch": 30.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009414267834793492,
+      "loss": 0.0,
+      "step": 240
+    },
+    {
+      "epoch": 31.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009389236545682102,
+      "loss": 0.0,
+      "step": 250
+    },
+    {
+      "epoch": 32.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009364205256570713,
+      "loss": 0.0,
+      "step": 260
+    },
+    {
+      "epoch": 33.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009339173967459325,
+      "loss": 0.0,
+      "step": 270
+    },
+    {
+      "epoch": 35.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009314142678347936,
+      "loss": 0.0,
+      "step": 280
+    },
+    {
+      "epoch": 36.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009289111389236547,
+      "loss": 0.0,
+      "step": 290
+    },
+    {
+      "epoch": 37.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009264080100125156,
+      "loss": 0.0,
+      "step": 300
+    },
+    {
+      "epoch": 38.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009239048811013768,
+      "loss": 0.0,
+      "step": 310
+    },
+    {
+      "epoch": 40.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009214017521902379,
+      "loss": 0.0,
+      "step": 320
+    },
+    {
+      "epoch": 41.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00918898623279099,
+      "loss": 0.0,
+      "step": 330
+    },
+    {
+      "epoch": 42.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0091639549436796,
+      "loss": 0.0,
+      "step": 340
+    },
+    {
+      "epoch": 43.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00913892365456821,
+      "loss": 0.0,
+      "step": 350
+    },
+    {
+      "epoch": 45.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009113892365456821,
+      "loss": 0.0,
+      "step": 360
+    },
+    {
+      "epoch": 46.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009088861076345432,
+      "loss": 0.0,
+      "step": 370
+    },
+    {
+      "epoch": 47.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009063829787234043,
+      "loss": 0.0,
+      "step": 380
+    },
+    {
+      "epoch": 48.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009038798498122653,
+      "loss": 0.0,
+      "step": 390
+    },
+    {
+      "epoch": 50.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009013767209011264,
+      "loss": 0.0,
+      "step": 400
+    },
+    {
+      "epoch": 51.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008988735919899874,
+      "loss": 0.0,
+      "step": 410
+    },
+    {
+      "epoch": 52.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008963704630788487,
+      "loss": 0.0,
+      "step": 420
+    },
+    {
+      "epoch": 53.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008938673341677096,
+      "loss": 0.0,
+      "step": 430
+    },
+    {
+      "epoch": 55.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008913642052565706,
+      "loss": 0.0,
+      "step": 440
+    },
+    {
+      "epoch": 56.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008888610763454317,
+      "loss": 0.0,
+      "step": 450
+    },
+    {
+      "epoch": 57.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00886357947434293,
+      "loss": 0.0,
+      "step": 460
+    },
+    {
+      "epoch": 58.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00883854818523154,
+      "loss": 0.0,
+      "step": 470
+    },
+    {
+      "epoch": 60.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00881351689612015,
+      "loss": 0.0,
+      "step": 480
+    },
+    {
+      "epoch": 61.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008788485607008761,
+      "loss": 0.0,
+      "step": 490
+    },
+    {
+      "epoch": 62.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008763454317897372,
+      "loss": 0.0,
+      "step": 500
+    },
+    {
+      "epoch": 63.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008738423028785983,
+      "loss": 0.0,
+      "step": 510
+    },
+    {
+      "epoch": 65.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008713391739674593,
+      "loss": 0.0,
+      "step": 520
+    },
+    {
+      "epoch": 66.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008688360450563204,
+      "loss": 0.0,
+      "step": 530
+    },
+    {
+      "epoch": 67.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008663329161451815,
+      "loss": 0.0,
+      "step": 540
+    },
+    {
+      "epoch": 68.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008638297872340425,
+      "loss": 0.0,
+      "step": 550
+    },
+    {
+      "epoch": 70.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008613266583229038,
+      "loss": 0.0,
+      "step": 560
+    },
+    {
+      "epoch": 71.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008588235294117647,
+      "loss": 0.0,
+      "step": 570
+    },
+    {
+      "epoch": 72.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008563204005006257,
+      "loss": 0.0,
+      "step": 580
+    },
+    {
+      "epoch": 73.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008538172715894868,
+      "loss": 0.0,
+      "step": 590
+    },
+    {
+      "epoch": 75.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00851314142678348,
+      "loss": 0.0,
+      "step": 600
+    },
+    {
+      "epoch": 76.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00848811013767209,
+      "loss": 0.0,
+      "step": 610
+    },
+    {
+      "epoch": 77.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008463078848560701,
+      "loss": 0.0,
+      "step": 620
+    },
+    {
+      "epoch": 78.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008438047559449312,
+      "loss": 0.0,
+      "step": 630
+    },
+    {
+      "epoch": 80.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008413016270337923,
+      "loss": 0.0,
+      "step": 640
+    },
+    {
+      "epoch": 81.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008387984981226533,
+      "loss": 0.0,
+      "step": 650
+    },
+    {
+      "epoch": 82.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008362953692115144,
+      "loss": 0.0,
+      "step": 660
+    },
+    {
+      "epoch": 83.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008337922403003755,
+      "loss": 0.0,
+      "step": 670
+    },
+    {
+      "epoch": 85.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008312891113892365,
+      "loss": 0.0,
+      "step": 680
+    },
+    {
+      "epoch": 86.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008287859824780976,
+      "loss": 0.0,
+      "step": 690
+    },
+    {
+      "epoch": 87.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008262828535669588,
+      "loss": 0.0,
+      "step": 700
+    },
+    {
+      "epoch": 88.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008237797246558197,
+      "loss": 0.0,
+      "step": 710
+    },
+    {
+      "epoch": 90.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008212765957446808,
+      "loss": 0.0,
+      "step": 720
+    },
+    {
+      "epoch": 91.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008187734668335419,
+      "loss": 0.0,
+      "step": 730
+    },
+    {
+      "epoch": 92.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008162703379224031,
+      "loss": 0.0,
+      "step": 740
+    },
+    {
+      "epoch": 93.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008137672090112642,
+      "loss": 0.0,
+      "step": 750
+    },
+    {
+      "epoch": 95.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008112640801001252,
+      "loss": 0.0,
+      "step": 760
+    },
+    {
+      "epoch": 96.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008087609511889863,
+      "loss": 0.0,
+      "step": 770
+    },
+    {
+      "epoch": 97.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008062578222778474,
+      "loss": 0.0,
+      "step": 780
+    },
+    {
+      "epoch": 98.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008037546933667084,
+      "loss": 0.0,
+      "step": 790
+    },
+    {
+      "epoch": 100.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008012515644555695,
+      "loss": 0.0,
+      "step": 800
+    },
+    {
+      "epoch": 101.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007987484355444305,
+      "loss": 0.0,
+      "step": 810
+    },
+    {
+      "epoch": 102.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007962453066332916,
+      "loss": 0.0,
+      "step": 820
+    },
+    {
+      "epoch": 103.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007937421777221527,
+      "loss": 0.0,
+      "step": 830
+    },
+    {
+      "epoch": 105.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007912390488110137,
+      "loss": 0.0,
+      "step": 840
+    },
+    {
+      "epoch": 106.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007887359198998748,
+      "loss": 0.0,
+      "step": 850
+    },
+    {
+      "epoch": 107.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007862327909887359,
+      "loss": 0.0,
+      "step": 860
+    },
+    {
+      "epoch": 108.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00783729662077597,
+      "loss": 0.0,
+      "step": 870
+    },
+    {
+      "epoch": 110.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007812265331664581,
+      "loss": 0.0,
+      "step": 880
+    },
+    {
+      "epoch": 111.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0077872340425531915,
+      "loss": 0.0,
+      "step": 890
+    },
+    {
+      "epoch": 112.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007762202753441803,
+      "loss": 0.0,
+      "step": 900
+    },
+    {
+      "epoch": 113.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007737171464330414,
+      "loss": 0.0,
+      "step": 910
+    },
+    {
+      "epoch": 115.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0077121401752190235,
+      "loss": 0.0,
+      "step": 920
+    },
+    {
+      "epoch": 116.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007687108886107634,
+      "loss": 0.0,
+      "step": 930
+    },
+    {
+      "epoch": 117.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007662077596996246,
+      "loss": 0.0,
+      "step": 940
+    },
+    {
+      "epoch": 118.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007637046307884856,
+      "loss": 0.0,
+      "step": 950
+    },
+    {
+      "epoch": 120.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007612015018773467,
+      "loss": 0.0,
+      "step": 960
+    },
+    {
+      "epoch": 121.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007586983729662078,
+      "loss": 0.0,
+      "step": 970
+    },
+    {
+      "epoch": 122.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007561952440550689,
+      "loss": 0.0,
+      "step": 980
+    },
+    {
+      "epoch": 123.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007536921151439299,
+      "loss": 0.0,
+      "step": 990
+    },
+    {
+      "epoch": 125.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0075118898623279095,
+      "loss": 0.0,
+      "step": 1000
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 4000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 500,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 6750000.0,
+  "train_batch_size": 64,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-1000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c6208b173b902d75ab9814341e70009f66ecc0ecd70d024038fcaaf650010173
+size 5137

checkpoint-1050/config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "activation_function": "gelu",
+  "add_cross_attention": false,
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.0,
+  "bos_token_id": 1,
+  "dtype": "float32",
+  "embd_pdrop": 0.0,
+  "eos_token_id": 1,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_embd": 1,
+  "n_head": 1,
+  "n_inner": 1,
+  "n_layer": 1,
+  "n_positions": 1,
+  "pad_token_id": 0,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.0,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.12.0",
+  "use_cache": false,
+  "vocab_size": 2
+}

checkpoint-1050/generation_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 1,
+  "output_attentions": false,
+  "output_hidden_states": false,
+  "pad_token_id": 0,
+  "transformers_version": "5.12.0",
+  "use_cache": true
+}

checkpoint-1050/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:880c2ec6c05c1e38595824792a1d3850592746c746ee2caa95bee8cb0e8ec77d
+size 1452

checkpoint-1050/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0587ee5489e3230570b30b2c6399ad8da6204af076e1b7662ccdb717d4f61c53
+size 13823

checkpoint-1050/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:72dedc896a3c9245030a09cd03b9a14e0fdca07e3a751912053a18a37a5e6782
+size 14455

checkpoint-1050/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:82fa2536fa054f9bf5776f9ff6684daf9790146183edb054be6762da536dacb2
+size 1465

checkpoint-1050/trainer_state.json ADDED Viewed

	@@ -0,0 +1,769 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 131.25,
+  "eval_steps": 500,
+  "global_step": 1050,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009989987484355445,
+      "loss": 0.0,
+      "step": 10
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009964956195244054,
+      "loss": 0.0,
+      "step": 20
+    },
+    {
+      "epoch": 3.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009939924906132666,
+      "loss": 0.0,
+      "step": 30
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009914893617021277,
+      "loss": 0.0,
+      "step": 40
+    },
+    {
+      "epoch": 6.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009889862327909888,
+      "loss": 0.0,
+      "step": 50
+    },
+    {
+      "epoch": 7.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009864831038798498,
+      "loss": 0.0,
+      "step": 60
+    },
+    {
+      "epoch": 8.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009839799749687109,
+      "loss": 0.0,
+      "step": 70
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00981476846057572,
+      "loss": 0.0,
+      "step": 80
+    },
+    {
+      "epoch": 11.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00978973717146433,
+      "loss": 0.0,
+      "step": 90
+    },
+    {
+      "epoch": 12.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009764705882352941,
+      "loss": 0.0,
+      "step": 100
+    },
+    {
+      "epoch": 13.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009739674593241552,
+      "loss": 0.0,
+      "step": 110
+    },
+    {
+      "epoch": 15.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009714643304130162,
+      "loss": 0.0,
+      "step": 120
+    },
+    {
+      "epoch": 16.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009689612015018775,
+      "loss": 0.0,
+      "step": 130
+    },
+    {
+      "epoch": 17.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009664580725907385,
+      "loss": 0.0,
+      "step": 140
+    },
+    {
+      "epoch": 18.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009639549436795996,
+      "loss": 0.0,
+      "step": 150
+    },
+    {
+      "epoch": 20.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009614518147684605,
+      "loss": 0.0,
+      "step": 160
+    },
+    {
+      "epoch": 21.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009589486858573217,
+      "loss": 0.0,
+      "step": 170
+    },
+    {
+      "epoch": 22.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009564455569461828,
+      "loss": 0.0,
+      "step": 180
+    },
+    {
+      "epoch": 23.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009539424280350439,
+      "loss": 0.0,
+      "step": 190
+    },
+    {
+      "epoch": 25.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00951439299123905,
+      "loss": 0.0,
+      "step": 200
+    },
+    {
+      "epoch": 26.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00948936170212766,
+      "loss": 0.0,
+      "step": 210
+    },
+    {
+      "epoch": 27.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00946433041301627,
+      "loss": 0.0,
+      "step": 220
+    },
+    {
+      "epoch": 28.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009439299123904881,
+      "loss": 0.0,
+      "step": 230
+    },
+    {
+      "epoch": 30.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009414267834793492,
+      "loss": 0.0,
+      "step": 240
+    },
+    {
+      "epoch": 31.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009389236545682102,
+      "loss": 0.0,
+      "step": 250
+    },
+    {
+      "epoch": 32.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009364205256570713,
+      "loss": 0.0,
+      "step": 260
+    },
+    {
+      "epoch": 33.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009339173967459325,
+      "loss": 0.0,
+      "step": 270
+    },
+    {
+      "epoch": 35.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009314142678347936,
+      "loss": 0.0,
+      "step": 280
+    },
+    {
+      "epoch": 36.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009289111389236547,
+      "loss": 0.0,
+      "step": 290
+    },
+    {
+      "epoch": 37.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009264080100125156,
+      "loss": 0.0,
+      "step": 300
+    },
+    {
+      "epoch": 38.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009239048811013768,
+      "loss": 0.0,
+      "step": 310
+    },
+    {
+      "epoch": 40.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009214017521902379,
+      "loss": 0.0,
+      "step": 320
+    },
+    {
+      "epoch": 41.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00918898623279099,
+      "loss": 0.0,
+      "step": 330
+    },
+    {
+      "epoch": 42.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0091639549436796,
+      "loss": 0.0,
+      "step": 340
+    },
+    {
+      "epoch": 43.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00913892365456821,
+      "loss": 0.0,
+      "step": 350
+    },
+    {
+      "epoch": 45.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009113892365456821,
+      "loss": 0.0,
+      "step": 360
+    },
+    {
+      "epoch": 46.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009088861076345432,
+      "loss": 0.0,
+      "step": 370
+    },
+    {
+      "epoch": 47.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009063829787234043,
+      "loss": 0.0,
+      "step": 380
+    },
+    {
+      "epoch": 48.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009038798498122653,
+      "loss": 0.0,
+      "step": 390
+    },
+    {
+      "epoch": 50.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009013767209011264,
+      "loss": 0.0,
+      "step": 400
+    },
+    {
+      "epoch": 51.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008988735919899874,
+      "loss": 0.0,
+      "step": 410
+    },
+    {
+      "epoch": 52.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008963704630788487,
+      "loss": 0.0,
+      "step": 420
+    },
+    {
+      "epoch": 53.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008938673341677096,
+      "loss": 0.0,
+      "step": 430
+    },
+    {
+      "epoch": 55.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008913642052565706,
+      "loss": 0.0,
+      "step": 440
+    },
+    {
+      "epoch": 56.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008888610763454317,
+      "loss": 0.0,
+      "step": 450
+    },
+    {
+      "epoch": 57.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00886357947434293,
+      "loss": 0.0,
+      "step": 460
+    },
+    {
+      "epoch": 58.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00883854818523154,
+      "loss": 0.0,
+      "step": 470
+    },
+    {
+      "epoch": 60.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00881351689612015,
+      "loss": 0.0,
+      "step": 480
+    },
+    {
+      "epoch": 61.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008788485607008761,
+      "loss": 0.0,
+      "step": 490
+    },
+    {
+      "epoch": 62.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008763454317897372,
+      "loss": 0.0,
+      "step": 500
+    },
+    {
+      "epoch": 63.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008738423028785983,
+      "loss": 0.0,
+      "step": 510
+    },
+    {
+      "epoch": 65.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008713391739674593,
+      "loss": 0.0,
+      "step": 520
+    },
+    {
+      "epoch": 66.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008688360450563204,
+      "loss": 0.0,
+      "step": 530
+    },
+    {
+      "epoch": 67.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008663329161451815,
+      "loss": 0.0,
+      "step": 540
+    },
+    {
+      "epoch": 68.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008638297872340425,
+      "loss": 0.0,
+      "step": 550
+    },
+    {
+      "epoch": 70.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008613266583229038,
+      "loss": 0.0,
+      "step": 560
+    },
+    {
+      "epoch": 71.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008588235294117647,
+      "loss": 0.0,
+      "step": 570
+    },
+    {
+      "epoch": 72.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008563204005006257,
+      "loss": 0.0,
+      "step": 580
+    },
+    {
+      "epoch": 73.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008538172715894868,
+      "loss": 0.0,
+      "step": 590
+    },
+    {
+      "epoch": 75.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00851314142678348,
+      "loss": 0.0,
+      "step": 600
+    },
+    {
+      "epoch": 76.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00848811013767209,
+      "loss": 0.0,
+      "step": 610
+    },
+    {
+      "epoch": 77.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008463078848560701,
+      "loss": 0.0,
+      "step": 620
+    },
+    {
+      "epoch": 78.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008438047559449312,
+      "loss": 0.0,
+      "step": 630
+    },
+    {
+      "epoch": 80.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008413016270337923,
+      "loss": 0.0,
+      "step": 640
+    },
+    {
+      "epoch": 81.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008387984981226533,
+      "loss": 0.0,
+      "step": 650
+    },
+    {
+      "epoch": 82.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008362953692115144,
+      "loss": 0.0,
+      "step": 660
+    },
+    {
+      "epoch": 83.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008337922403003755,
+      "loss": 0.0,
+      "step": 670
+    },
+    {
+      "epoch": 85.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008312891113892365,
+      "loss": 0.0,
+      "step": 680
+    },
+    {
+      "epoch": 86.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008287859824780976,
+      "loss": 0.0,
+      "step": 690
+    },
+    {
+      "epoch": 87.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008262828535669588,
+      "loss": 0.0,
+      "step": 700
+    },
+    {
+      "epoch": 88.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008237797246558197,
+      "loss": 0.0,
+      "step": 710
+    },
+    {
+      "epoch": 90.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008212765957446808,
+      "loss": 0.0,
+      "step": 720
+    },
+    {
+      "epoch": 91.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008187734668335419,
+      "loss": 0.0,
+      "step": 730
+    },
+    {
+      "epoch": 92.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008162703379224031,
+      "loss": 0.0,
+      "step": 740
+    },
+    {
+      "epoch": 93.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008137672090112642,
+      "loss": 0.0,
+      "step": 750
+    },
+    {
+      "epoch": 95.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008112640801001252,
+      "loss": 0.0,
+      "step": 760
+    },
+    {
+      "epoch": 96.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008087609511889863,
+      "loss": 0.0,
+      "step": 770
+    },
+    {
+      "epoch": 97.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008062578222778474,
+      "loss": 0.0,
+      "step": 780
+    },
+    {
+      "epoch": 98.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008037546933667084,
+      "loss": 0.0,
+      "step": 790
+    },
+    {
+      "epoch": 100.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008012515644555695,
+      "loss": 0.0,
+      "step": 800
+    },
+    {
+      "epoch": 101.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007987484355444305,
+      "loss": 0.0,
+      "step": 810
+    },
+    {
+      "epoch": 102.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007962453066332916,
+      "loss": 0.0,
+      "step": 820
+    },
+    {
+      "epoch": 103.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007937421777221527,
+      "loss": 0.0,
+      "step": 830
+    },
+    {
+      "epoch": 105.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007912390488110137,
+      "loss": 0.0,
+      "step": 840
+    },
+    {
+      "epoch": 106.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007887359198998748,
+      "loss": 0.0,
+      "step": 850
+    },
+    {
+      "epoch": 107.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007862327909887359,
+      "loss": 0.0,
+      "step": 860
+    },
+    {
+      "epoch": 108.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00783729662077597,
+      "loss": 0.0,
+      "step": 870
+    },
+    {
+      "epoch": 110.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007812265331664581,
+      "loss": 0.0,
+      "step": 880
+    },
+    {
+      "epoch": 111.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0077872340425531915,
+      "loss": 0.0,
+      "step": 890
+    },
+    {
+      "epoch": 112.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007762202753441803,
+      "loss": 0.0,
+      "step": 900
+    },
+    {
+      "epoch": 113.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007737171464330414,
+      "loss": 0.0,
+      "step": 910
+    },
+    {
+      "epoch": 115.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0077121401752190235,
+      "loss": 0.0,
+      "step": 920
+    },
+    {
+      "epoch": 116.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007687108886107634,
+      "loss": 0.0,
+      "step": 930
+    },
+    {
+      "epoch": 117.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007662077596996246,
+      "loss": 0.0,
+      "step": 940
+    },
+    {
+      "epoch": 118.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007637046307884856,
+      "loss": 0.0,
+      "step": 950
+    },
+    {
+      "epoch": 120.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007612015018773467,
+      "loss": 0.0,
+      "step": 960
+    },
+    {
+      "epoch": 121.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007586983729662078,
+      "loss": 0.0,
+      "step": 970
+    },
+    {
+      "epoch": 122.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007561952440550689,
+      "loss": 0.0,
+      "step": 980
+    },
+    {
+      "epoch": 123.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007536921151439299,
+      "loss": 0.0,
+      "step": 990
+    },
+    {
+      "epoch": 125.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0075118898623279095,
+      "loss": 0.0,
+      "step": 1000
+    },
+    {
+      "epoch": 126.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007486858573216521,
+      "loss": 0.0,
+      "step": 1010
+    },
+    {
+      "epoch": 127.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007461827284105132,
+      "loss": 0.0,
+      "step": 1020
+    },
+    {
+      "epoch": 128.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007436795994993742,
+      "loss": 0.0,
+      "step": 1030
+    },
+    {
+      "epoch": 130.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007411764705882354,
+      "loss": 0.0,
+      "step": 1040
+    },
+    {
+      "epoch": 131.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0073867334167709645,
+      "loss": 0.0,
+      "step": 1050
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 4000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 500,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7087824.0,
+  "train_batch_size": 64,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-1050/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c6208b173b902d75ab9814341e70009f66ecc0ecd70d024038fcaaf650010173
+size 5137

checkpoint-1100/config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "activation_function": "gelu",
+  "add_cross_attention": false,
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.0,
+  "bos_token_id": 1,
+  "dtype": "float32",
+  "embd_pdrop": 0.0,
+  "eos_token_id": 1,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_embd": 1,
+  "n_head": 1,
+  "n_inner": 1,
+  "n_layer": 1,
+  "n_positions": 1,
+  "pad_token_id": 0,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.0,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.12.0",
+  "use_cache": false,
+  "vocab_size": 2
+}

checkpoint-1100/generation_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 1,
+  "output_attentions": false,
+  "output_hidden_states": false,
+  "pad_token_id": 0,
+  "transformers_version": "5.12.0",
+  "use_cache": true
+}

checkpoint-1100/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:880c2ec6c05c1e38595824792a1d3850592746c746ee2caa95bee8cb0e8ec77d
+size 1452

checkpoint-1100/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cecfcf884ab0004c59872c46381bfe51f9a073a67de02523a5acf9692d5ba866
+size 13823

checkpoint-1100/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5fc2de446fe61209ed367ff94c7ffb565e5e69564436f15adf49d20829abf178
+size 14455

checkpoint-1100/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cb7541fe30b75a402f49ee32b1224b2f0f71b7e0c5a8d479bb8a263674caa2b4
+size 1465

checkpoint-1100/trainer_state.json ADDED Viewed

	@@ -0,0 +1,804 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 137.5,
+  "eval_steps": 500,
+  "global_step": 1100,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009989987484355445,
+      "loss": 0.0,
+      "step": 10
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009964956195244054,
+      "loss": 0.0,
+      "step": 20
+    },
+    {
+      "epoch": 3.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009939924906132666,
+      "loss": 0.0,
+      "step": 30
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009914893617021277,
+      "loss": 0.0,
+      "step": 40
+    },
+    {
+      "epoch": 6.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009889862327909888,
+      "loss": 0.0,
+      "step": 50
+    },
+    {
+      "epoch": 7.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009864831038798498,
+      "loss": 0.0,
+      "step": 60
+    },
+    {
+      "epoch": 8.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009839799749687109,
+      "loss": 0.0,
+      "step": 70
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00981476846057572,
+      "loss": 0.0,
+      "step": 80
+    },
+    {
+      "epoch": 11.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00978973717146433,
+      "loss": 0.0,
+      "step": 90
+    },
+    {
+      "epoch": 12.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009764705882352941,
+      "loss": 0.0,
+      "step": 100
+    },
+    {
+      "epoch": 13.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009739674593241552,
+      "loss": 0.0,
+      "step": 110
+    },
+    {
+      "epoch": 15.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009714643304130162,
+      "loss": 0.0,
+      "step": 120
+    },
+    {
+      "epoch": 16.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009689612015018775,
+      "loss": 0.0,
+      "step": 130
+    },
+    {
+      "epoch": 17.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009664580725907385,
+      "loss": 0.0,
+      "step": 140
+    },
+    {
+      "epoch": 18.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009639549436795996,
+      "loss": 0.0,
+      "step": 150
+    },
+    {
+      "epoch": 20.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009614518147684605,
+      "loss": 0.0,
+      "step": 160
+    },
+    {
+      "epoch": 21.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009589486858573217,
+      "loss": 0.0,
+      "step": 170
+    },
+    {
+      "epoch": 22.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009564455569461828,
+      "loss": 0.0,
+      "step": 180
+    },
+    {
+      "epoch": 23.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009539424280350439,
+      "loss": 0.0,
+      "step": 190
+    },
+    {
+      "epoch": 25.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00951439299123905,
+      "loss": 0.0,
+      "step": 200
+    },
+    {
+      "epoch": 26.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00948936170212766,
+      "loss": 0.0,
+      "step": 210
+    },
+    {
+      "epoch": 27.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00946433041301627,
+      "loss": 0.0,
+      "step": 220
+    },
+    {
+      "epoch": 28.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009439299123904881,
+      "loss": 0.0,
+      "step": 230
+    },
+    {
+      "epoch": 30.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009414267834793492,
+      "loss": 0.0,
+      "step": 240
+    },
+    {
+      "epoch": 31.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009389236545682102,
+      "loss": 0.0,
+      "step": 250
+    },
+    {
+      "epoch": 32.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009364205256570713,
+      "loss": 0.0,
+      "step": 260
+    },
+    {
+      "epoch": 33.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009339173967459325,
+      "loss": 0.0,
+      "step": 270
+    },
+    {
+      "epoch": 35.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009314142678347936,
+      "loss": 0.0,
+      "step": 280
+    },
+    {
+      "epoch": 36.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009289111389236547,
+      "loss": 0.0,
+      "step": 290
+    },
+    {
+      "epoch": 37.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009264080100125156,
+      "loss": 0.0,
+      "step": 300
+    },
+    {
+      "epoch": 38.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009239048811013768,
+      "loss": 0.0,
+      "step": 310
+    },
+    {
+      "epoch": 40.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009214017521902379,
+      "loss": 0.0,
+      "step": 320
+    },
+    {
+      "epoch": 41.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00918898623279099,
+      "loss": 0.0,
+      "step": 330
+    },
+    {
+      "epoch": 42.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0091639549436796,
+      "loss": 0.0,
+      "step": 340
+    },
+    {
+      "epoch": 43.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00913892365456821,
+      "loss": 0.0,
+      "step": 350
+    },
+    {
+      "epoch": 45.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009113892365456821,
+      "loss": 0.0,
+      "step": 360
+    },
+    {
+      "epoch": 46.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009088861076345432,
+      "loss": 0.0,
+      "step": 370
+    },
+    {
+      "epoch": 47.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009063829787234043,
+      "loss": 0.0,
+      "step": 380
+    },
+    {
+      "epoch": 48.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009038798498122653,
+      "loss": 0.0,
+      "step": 390
+    },
+    {
+      "epoch": 50.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009013767209011264,
+      "loss": 0.0,
+      "step": 400
+    },
+    {
+      "epoch": 51.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008988735919899874,
+      "loss": 0.0,
+      "step": 410
+    },
+    {
+      "epoch": 52.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008963704630788487,
+      "loss": 0.0,
+      "step": 420
+    },
+    {
+      "epoch": 53.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008938673341677096,
+      "loss": 0.0,
+      "step": 430
+    },
+    {
+      "epoch": 55.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008913642052565706,
+      "loss": 0.0,
+      "step": 440
+    },
+    {
+      "epoch": 56.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008888610763454317,
+      "loss": 0.0,
+      "step": 450
+    },
+    {
+      "epoch": 57.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00886357947434293,
+      "loss": 0.0,
+      "step": 460
+    },
+    {
+      "epoch": 58.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00883854818523154,
+      "loss": 0.0,
+      "step": 470
+    },
+    {
+      "epoch": 60.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00881351689612015,
+      "loss": 0.0,
+      "step": 480
+    },
+    {
+      "epoch": 61.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008788485607008761,
+      "loss": 0.0,
+      "step": 490
+    },
+    {
+      "epoch": 62.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008763454317897372,
+      "loss": 0.0,
+      "step": 500
+    },
+    {
+      "epoch": 63.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008738423028785983,
+      "loss": 0.0,
+      "step": 510
+    },
+    {
+      "epoch": 65.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008713391739674593,
+      "loss": 0.0,
+      "step": 520
+    },
+    {
+      "epoch": 66.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008688360450563204,
+      "loss": 0.0,
+      "step": 530
+    },
+    {
+      "epoch": 67.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008663329161451815,
+      "loss": 0.0,
+      "step": 540
+    },
+    {
+      "epoch": 68.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008638297872340425,
+      "loss": 0.0,
+      "step": 550
+    },
+    {
+      "epoch": 70.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008613266583229038,
+      "loss": 0.0,
+      "step": 560
+    },
+    {
+      "epoch": 71.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008588235294117647,
+      "loss": 0.0,
+      "step": 570
+    },
+    {
+      "epoch": 72.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008563204005006257,
+      "loss": 0.0,
+      "step": 580
+    },
+    {
+      "epoch": 73.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008538172715894868,
+      "loss": 0.0,
+      "step": 590
+    },
+    {
+      "epoch": 75.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00851314142678348,
+      "loss": 0.0,
+      "step": 600
+    },
+    {
+      "epoch": 76.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00848811013767209,
+      "loss": 0.0,
+      "step": 610
+    },
+    {
+      "epoch": 77.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008463078848560701,
+      "loss": 0.0,
+      "step": 620
+    },
+    {
+      "epoch": 78.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008438047559449312,
+      "loss": 0.0,
+      "step": 630
+    },
+    {
+      "epoch": 80.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008413016270337923,
+      "loss": 0.0,
+      "step": 640
+    },
+    {
+      "epoch": 81.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008387984981226533,
+      "loss": 0.0,
+      "step": 650
+    },
+    {
+      "epoch": 82.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008362953692115144,
+      "loss": 0.0,
+      "step": 660
+    },
+    {
+      "epoch": 83.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008337922403003755,
+      "loss": 0.0,
+      "step": 670
+    },
+    {
+      "epoch": 85.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008312891113892365,
+      "loss": 0.0,
+      "step": 680
+    },
+    {
+      "epoch": 86.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008287859824780976,
+      "loss": 0.0,
+      "step": 690
+    },
+    {
+      "epoch": 87.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008262828535669588,
+      "loss": 0.0,
+      "step": 700
+    },
+    {
+      "epoch": 88.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008237797246558197,
+      "loss": 0.0,
+      "step": 710
+    },
+    {
+      "epoch": 90.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008212765957446808,
+      "loss": 0.0,
+      "step": 720
+    },
+    {
+      "epoch": 91.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008187734668335419,
+      "loss": 0.0,
+      "step": 730
+    },
+    {
+      "epoch": 92.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008162703379224031,
+      "loss": 0.0,
+      "step": 740
+    },
+    {
+      "epoch": 93.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008137672090112642,
+      "loss": 0.0,
+      "step": 750
+    },
+    {
+      "epoch": 95.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008112640801001252,
+      "loss": 0.0,
+      "step": 760
+    },
+    {
+      "epoch": 96.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008087609511889863,
+      "loss": 0.0,
+      "step": 770
+    },
+    {
+      "epoch": 97.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008062578222778474,
+      "loss": 0.0,
+      "step": 780
+    },
+    {
+      "epoch": 98.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008037546933667084,
+      "loss": 0.0,
+      "step": 790
+    },
+    {
+      "epoch": 100.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008012515644555695,
+      "loss": 0.0,
+      "step": 800
+    },
+    {
+      "epoch": 101.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007987484355444305,
+      "loss": 0.0,
+      "step": 810
+    },
+    {
+      "epoch": 102.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007962453066332916,
+      "loss": 0.0,
+      "step": 820
+    },
+    {
+      "epoch": 103.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007937421777221527,
+      "loss": 0.0,
+      "step": 830
+    },
+    {
+      "epoch": 105.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007912390488110137,
+      "loss": 0.0,
+      "step": 840
+    },
+    {
+      "epoch": 106.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007887359198998748,
+      "loss": 0.0,
+      "step": 850
+    },
+    {
+      "epoch": 107.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007862327909887359,
+      "loss": 0.0,
+      "step": 860
+    },
+    {
+      "epoch": 108.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00783729662077597,
+      "loss": 0.0,
+      "step": 870
+    },
+    {
+      "epoch": 110.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007812265331664581,
+      "loss": 0.0,
+      "step": 880
+    },
+    {
+      "epoch": 111.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0077872340425531915,
+      "loss": 0.0,
+      "step": 890
+    },
+    {
+      "epoch": 112.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007762202753441803,
+      "loss": 0.0,
+      "step": 900
+    },
+    {
+      "epoch": 113.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007737171464330414,
+      "loss": 0.0,
+      "step": 910
+    },
+    {
+      "epoch": 115.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0077121401752190235,
+      "loss": 0.0,
+      "step": 920
+    },
+    {
+      "epoch": 116.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007687108886107634,
+      "loss": 0.0,
+      "step": 930
+    },
+    {
+      "epoch": 117.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007662077596996246,
+      "loss": 0.0,
+      "step": 940
+    },
+    {
+      "epoch": 118.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007637046307884856,
+      "loss": 0.0,
+      "step": 950
+    },
+    {
+      "epoch": 120.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007612015018773467,
+      "loss": 0.0,
+      "step": 960
+    },
+    {
+      "epoch": 121.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007586983729662078,
+      "loss": 0.0,
+      "step": 970
+    },
+    {
+      "epoch": 122.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007561952440550689,
+      "loss": 0.0,
+      "step": 980
+    },
+    {
+      "epoch": 123.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007536921151439299,
+      "loss": 0.0,
+      "step": 990
+    },
+    {
+      "epoch": 125.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0075118898623279095,
+      "loss": 0.0,
+      "step": 1000
+    },
+    {
+      "epoch": 126.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007486858573216521,
+      "loss": 0.0,
+      "step": 1010
+    },
+    {
+      "epoch": 127.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007461827284105132,
+      "loss": 0.0,
+      "step": 1020
+    },
+    {
+      "epoch": 128.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007436795994993742,
+      "loss": 0.0,
+      "step": 1030
+    },
+    {
+      "epoch": 130.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007411764705882354,
+      "loss": 0.0,
+      "step": 1040
+    },
+    {
+      "epoch": 131.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0073867334167709645,
+      "loss": 0.0,
+      "step": 1050
+    },
+    {
+      "epoch": 132.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007361702127659574,
+      "loss": 0.0,
+      "step": 1060
+    },
+    {
+      "epoch": 133.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007336670838548185,
+      "loss": 0.0,
+      "step": 1070
+    },
+    {
+      "epoch": 135.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007311639549436796,
+      "loss": 0.0,
+      "step": 1080
+    },
+    {
+      "epoch": 136.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007286608260325407,
+      "loss": 0.0,
+      "step": 1090
+    },
+    {
+      "epoch": 137.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007261576971214018,
+      "loss": 0.0,
+      "step": 1100
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 4000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 500,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7425648.0,
+  "train_batch_size": 64,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-1100/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c6208b173b902d75ab9814341e70009f66ecc0ecd70d024038fcaaf650010173
+size 5137

checkpoint-1150/config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "activation_function": "gelu",
+  "add_cross_attention": false,
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.0,
+  "bos_token_id": 1,
+  "dtype": "float32",
+  "embd_pdrop": 0.0,
+  "eos_token_id": 1,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_embd": 1,
+  "n_head": 1,
+  "n_inner": 1,
+  "n_layer": 1,
+  "n_positions": 1,
+  "pad_token_id": 0,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.0,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.12.0",
+  "use_cache": false,
+  "vocab_size": 2
+}

checkpoint-1150/generation_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 1,
+  "output_attentions": false,
+  "output_hidden_states": false,
+  "pad_token_id": 0,
+  "transformers_version": "5.12.0",
+  "use_cache": true
+}

checkpoint-1150/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:880c2ec6c05c1e38595824792a1d3850592746c746ee2caa95bee8cb0e8ec77d
+size 1452

checkpoint-1150/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4c6f07608caca2628e4e7d4f07ad180212b4eccace16f6c14e1ba4b9c0752cd6
+size 13823

checkpoint-1150/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:adc2989b41697bc86300e49c8243db2e3b464f9053a52911e9b2ba6e76a2eee9
+size 14455

checkpoint-1150/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5d053bc3befa0c54cb32d83b5ef57ea57d883adf0189ecfb06b92af8c072919c
+size 1465

checkpoint-1150/trainer_state.json ADDED Viewed

	@@ -0,0 +1,839 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 143.75,
+  "eval_steps": 500,
+  "global_step": 1150,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009989987484355445,
+      "loss": 0.0,
+      "step": 10
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009964956195244054,
+      "loss": 0.0,
+      "step": 20
+    },
+    {
+      "epoch": 3.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009939924906132666,
+      "loss": 0.0,
+      "step": 30
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009914893617021277,
+      "loss": 0.0,
+      "step": 40
+    },
+    {
+      "epoch": 6.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009889862327909888,
+      "loss": 0.0,
+      "step": 50
+    },
+    {
+      "epoch": 7.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009864831038798498,
+      "loss": 0.0,
+      "step": 60
+    },
+    {
+      "epoch": 8.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009839799749687109,
+      "loss": 0.0,
+      "step": 70
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00981476846057572,
+      "loss": 0.0,
+      "step": 80
+    },
+    {
+      "epoch": 11.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00978973717146433,
+      "loss": 0.0,
+      "step": 90
+    },
+    {
+      "epoch": 12.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009764705882352941,
+      "loss": 0.0,
+      "step": 100
+    },
+    {
+      "epoch": 13.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009739674593241552,
+      "loss": 0.0,
+      "step": 110
+    },
+    {
+      "epoch": 15.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009714643304130162,
+      "loss": 0.0,
+      "step": 120
+    },
+    {
+      "epoch": 16.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009689612015018775,
+      "loss": 0.0,
+      "step": 130
+    },
+    {
+      "epoch": 17.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009664580725907385,
+      "loss": 0.0,
+      "step": 140
+    },
+    {
+      "epoch": 18.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009639549436795996,
+      "loss": 0.0,
+      "step": 150
+    },
+    {
+      "epoch": 20.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009614518147684605,
+      "loss": 0.0,
+      "step": 160
+    },
+    {
+      "epoch": 21.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009589486858573217,
+      "loss": 0.0,
+      "step": 170
+    },
+    {
+      "epoch": 22.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009564455569461828,
+      "loss": 0.0,
+      "step": 180
+    },
+    {
+      "epoch": 23.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009539424280350439,
+      "loss": 0.0,
+      "step": 190
+    },
+    {
+      "epoch": 25.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00951439299123905,
+      "loss": 0.0,
+      "step": 200
+    },
+    {
+      "epoch": 26.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00948936170212766,
+      "loss": 0.0,
+      "step": 210
+    },
+    {
+      "epoch": 27.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00946433041301627,
+      "loss": 0.0,
+      "step": 220
+    },
+    {
+      "epoch": 28.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009439299123904881,
+      "loss": 0.0,
+      "step": 230
+    },
+    {
+      "epoch": 30.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009414267834793492,
+      "loss": 0.0,
+      "step": 240
+    },
+    {
+      "epoch": 31.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009389236545682102,
+      "loss": 0.0,
+      "step": 250
+    },
+    {
+      "epoch": 32.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009364205256570713,
+      "loss": 0.0,
+      "step": 260
+    },
+    {
+      "epoch": 33.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009339173967459325,
+      "loss": 0.0,
+      "step": 270
+    },
+    {
+      "epoch": 35.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009314142678347936,
+      "loss": 0.0,
+      "step": 280
+    },
+    {
+      "epoch": 36.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009289111389236547,
+      "loss": 0.0,
+      "step": 290
+    },
+    {
+      "epoch": 37.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009264080100125156,
+      "loss": 0.0,
+      "step": 300
+    },
+    {
+      "epoch": 38.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009239048811013768,
+      "loss": 0.0,
+      "step": 310
+    },
+    {
+      "epoch": 40.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009214017521902379,
+      "loss": 0.0,
+      "step": 320
+    },
+    {
+      "epoch": 41.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00918898623279099,
+      "loss": 0.0,
+      "step": 330
+    },
+    {
+      "epoch": 42.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0091639549436796,
+      "loss": 0.0,
+      "step": 340
+    },
+    {
+      "epoch": 43.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00913892365456821,
+      "loss": 0.0,
+      "step": 350
+    },
+    {
+      "epoch": 45.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009113892365456821,
+      "loss": 0.0,
+      "step": 360
+    },
+    {
+      "epoch": 46.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009088861076345432,
+      "loss": 0.0,
+      "step": 370
+    },
+    {
+      "epoch": 47.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009063829787234043,
+      "loss": 0.0,
+      "step": 380
+    },
+    {
+      "epoch": 48.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009038798498122653,
+      "loss": 0.0,
+      "step": 390
+    },
+    {
+      "epoch": 50.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009013767209011264,
+      "loss": 0.0,
+      "step": 400
+    },
+    {
+      "epoch": 51.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008988735919899874,
+      "loss": 0.0,
+      "step": 410
+    },
+    {
+      "epoch": 52.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008963704630788487,
+      "loss": 0.0,
+      "step": 420
+    },
+    {
+      "epoch": 53.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008938673341677096,
+      "loss": 0.0,
+      "step": 430
+    },
+    {
+      "epoch": 55.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008913642052565706,
+      "loss": 0.0,
+      "step": 440
+    },
+    {
+      "epoch": 56.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008888610763454317,
+      "loss": 0.0,
+      "step": 450
+    },
+    {
+      "epoch": 57.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00886357947434293,
+      "loss": 0.0,
+      "step": 460
+    },
+    {
+      "epoch": 58.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00883854818523154,
+      "loss": 0.0,
+      "step": 470
+    },
+    {
+      "epoch": 60.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00881351689612015,
+      "loss": 0.0,
+      "step": 480
+    },
+    {
+      "epoch": 61.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008788485607008761,
+      "loss": 0.0,
+      "step": 490
+    },
+    {
+      "epoch": 62.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008763454317897372,
+      "loss": 0.0,
+      "step": 500
+    },
+    {
+      "epoch": 63.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008738423028785983,
+      "loss": 0.0,
+      "step": 510
+    },
+    {
+      "epoch": 65.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008713391739674593,
+      "loss": 0.0,
+      "step": 520
+    },
+    {
+      "epoch": 66.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008688360450563204,
+      "loss": 0.0,
+      "step": 530
+    },
+    {
+      "epoch": 67.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008663329161451815,
+      "loss": 0.0,
+      "step": 540
+    },
+    {
+      "epoch": 68.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008638297872340425,
+      "loss": 0.0,
+      "step": 550
+    },
+    {
+      "epoch": 70.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008613266583229038,
+      "loss": 0.0,
+      "step": 560
+    },
+    {
+      "epoch": 71.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008588235294117647,
+      "loss": 0.0,
+      "step": 570
+    },
+    {
+      "epoch": 72.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008563204005006257,
+      "loss": 0.0,
+      "step": 580
+    },
+    {
+      "epoch": 73.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008538172715894868,
+      "loss": 0.0,
+      "step": 590
+    },
+    {
+      "epoch": 75.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00851314142678348,
+      "loss": 0.0,
+      "step": 600
+    },
+    {
+      "epoch": 76.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00848811013767209,
+      "loss": 0.0,
+      "step": 610
+    },
+    {
+      "epoch": 77.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008463078848560701,
+      "loss": 0.0,
+      "step": 620
+    },
+    {
+      "epoch": 78.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008438047559449312,
+      "loss": 0.0,
+      "step": 630
+    },
+    {
+      "epoch": 80.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008413016270337923,
+      "loss": 0.0,
+      "step": 640
+    },
+    {
+      "epoch": 81.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008387984981226533,
+      "loss": 0.0,
+      "step": 650
+    },
+    {
+      "epoch": 82.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008362953692115144,
+      "loss": 0.0,
+      "step": 660
+    },
+    {
+      "epoch": 83.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008337922403003755,
+      "loss": 0.0,
+      "step": 670
+    },
+    {
+      "epoch": 85.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008312891113892365,
+      "loss": 0.0,
+      "step": 680
+    },
+    {
+      "epoch": 86.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008287859824780976,
+      "loss": 0.0,
+      "step": 690
+    },
+    {
+      "epoch": 87.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008262828535669588,
+      "loss": 0.0,
+      "step": 700
+    },
+    {
+      "epoch": 88.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008237797246558197,
+      "loss": 0.0,
+      "step": 710
+    },
+    {
+      "epoch": 90.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008212765957446808,
+      "loss": 0.0,
+      "step": 720
+    },
+    {
+      "epoch": 91.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008187734668335419,
+      "loss": 0.0,
+      "step": 730
+    },
+    {
+      "epoch": 92.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008162703379224031,
+      "loss": 0.0,
+      "step": 740
+    },
+    {
+      "epoch": 93.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008137672090112642,
+      "loss": 0.0,
+      "step": 750
+    },
+    {
+      "epoch": 95.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008112640801001252,
+      "loss": 0.0,
+      "step": 760
+    },
+    {
+      "epoch": 96.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008087609511889863,
+      "loss": 0.0,
+      "step": 770
+    },
+    {
+      "epoch": 97.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008062578222778474,
+      "loss": 0.0,
+      "step": 780
+    },
+    {
+      "epoch": 98.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008037546933667084,
+      "loss": 0.0,
+      "step": 790
+    },
+    {
+      "epoch": 100.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008012515644555695,
+      "loss": 0.0,
+      "step": 800
+    },
+    {
+      "epoch": 101.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007987484355444305,
+      "loss": 0.0,
+      "step": 810
+    },
+    {
+      "epoch": 102.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007962453066332916,
+      "loss": 0.0,
+      "step": 820
+    },
+    {
+      "epoch": 103.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007937421777221527,
+      "loss": 0.0,
+      "step": 830
+    },
+    {
+      "epoch": 105.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007912390488110137,
+      "loss": 0.0,
+      "step": 840
+    },
+    {
+      "epoch": 106.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007887359198998748,
+      "loss": 0.0,
+      "step": 850
+    },
+    {
+      "epoch": 107.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007862327909887359,
+      "loss": 0.0,
+      "step": 860
+    },
+    {
+      "epoch": 108.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00783729662077597,
+      "loss": 0.0,
+      "step": 870
+    },
+    {
+      "epoch": 110.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007812265331664581,
+      "loss": 0.0,
+      "step": 880
+    },
+    {
+      "epoch": 111.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0077872340425531915,
+      "loss": 0.0,
+      "step": 890
+    },
+    {
+      "epoch": 112.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007762202753441803,
+      "loss": 0.0,
+      "step": 900
+    },
+    {
+      "epoch": 113.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007737171464330414,
+      "loss": 0.0,
+      "step": 910
+    },
+    {
+      "epoch": 115.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0077121401752190235,
+      "loss": 0.0,
+      "step": 920
+    },
+    {
+      "epoch": 116.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007687108886107634,
+      "loss": 0.0,
+      "step": 930
+    },
+    {
+      "epoch": 117.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007662077596996246,
+      "loss": 0.0,
+      "step": 940
+    },
+    {
+      "epoch": 118.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007637046307884856,
+      "loss": 0.0,
+      "step": 950
+    },
+    {
+      "epoch": 120.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007612015018773467,
+      "loss": 0.0,
+      "step": 960
+    },
+    {
+      "epoch": 121.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007586983729662078,
+      "loss": 0.0,
+      "step": 970
+    },
+    {
+      "epoch": 122.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007561952440550689,
+      "loss": 0.0,
+      "step": 980
+    },
+    {
+      "epoch": 123.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007536921151439299,
+      "loss": 0.0,
+      "step": 990
+    },
+    {
+      "epoch": 125.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0075118898623279095,
+      "loss": 0.0,
+      "step": 1000
+    },
+    {
+      "epoch": 126.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007486858573216521,
+      "loss": 0.0,
+      "step": 1010
+    },
+    {
+      "epoch": 127.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007461827284105132,
+      "loss": 0.0,
+      "step": 1020
+    },
+    {
+      "epoch": 128.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007436795994993742,
+      "loss": 0.0,
+      "step": 1030
+    },
+    {
+      "epoch": 130.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007411764705882354,
+      "loss": 0.0,
+      "step": 1040
+    },
+    {
+      "epoch": 131.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0073867334167709645,
+      "loss": 0.0,
+      "step": 1050
+    },
+    {
+      "epoch": 132.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007361702127659574,
+      "loss": 0.0,
+      "step": 1060
+    },
+    {
+      "epoch": 133.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007336670838548185,
+      "loss": 0.0,
+      "step": 1070
+    },
+    {
+      "epoch": 135.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007311639549436796,
+      "loss": 0.0,
+      "step": 1080
+    },
+    {
+      "epoch": 136.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007286608260325407,
+      "loss": 0.0,
+      "step": 1090
+    },
+    {
+      "epoch": 137.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007261576971214018,
+      "loss": 0.0,
+      "step": 1100
+    },
+    {
+      "epoch": 138.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007236545682102628,
+      "loss": 0.0,
+      "step": 1110
+    },
+    {
+      "epoch": 140.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00721151439299124,
+      "loss": 0.0,
+      "step": 1120
+    },
+    {
+      "epoch": 141.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00718648310387985,
+      "loss": 0.0,
+      "step": 1130
+    },
+    {
+      "epoch": 142.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00716145181476846,
+      "loss": 0.0,
+      "step": 1140
+    },
+    {
+      "epoch": 143.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007136420525657071,
+      "loss": 0.0,
+      "step": 1150
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 4000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 500,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7763472.0,
+  "train_batch_size": 64,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-1150/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c6208b173b902d75ab9814341e70009f66ecc0ecd70d024038fcaaf650010173
+size 5137

checkpoint-1200/config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "activation_function": "gelu",
+  "add_cross_attention": false,
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.0,
+  "bos_token_id": 1,
+  "dtype": "float32",
+  "embd_pdrop": 0.0,
+  "eos_token_id": 1,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_embd": 1,
+  "n_head": 1,
+  "n_inner": 1,
+  "n_layer": 1,
+  "n_positions": 1,
+  "pad_token_id": 0,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.0,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.12.0",
+  "use_cache": false,
+  "vocab_size": 2
+}

checkpoint-1200/generation_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 1,
+  "output_attentions": false,
+  "output_hidden_states": false,
+  "pad_token_id": 0,
+  "transformers_version": "5.12.0",
+  "use_cache": true
+}

checkpoint-1200/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:880c2ec6c05c1e38595824792a1d3850592746c746ee2caa95bee8cb0e8ec77d
+size 1452

checkpoint-1200/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3975bc7a21f5a4fd5059daec4f18dc973217774caa4f4484f1069b0e3cf0034e
+size 13823

checkpoint-1200/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5992258b0e831b29c37beeefed96237fb5573ddb793970294c8b6f2dc3098fd8
+size 14455

checkpoint-1200/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:822b71f69a7890e9b8716f7f8e72637712c24b0c354b6f54e6d112260cd7264c
+size 1465

checkpoint-1200/trainer_state.json ADDED Viewed

	@@ -0,0 +1,874 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 150.0,
+  "eval_steps": 500,
+  "global_step": 1200,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009989987484355445,
+      "loss": 0.0,
+      "step": 10
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009964956195244054,
+      "loss": 0.0,
+      "step": 20
+    },
+    {
+      "epoch": 3.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009939924906132666,
+      "loss": 0.0,
+      "step": 30
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009914893617021277,
+      "loss": 0.0,
+      "step": 40
+    },
+    {
+      "epoch": 6.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009889862327909888,
+      "loss": 0.0,
+      "step": 50
+    },
+    {
+      "epoch": 7.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009864831038798498,
+      "loss": 0.0,
+      "step": 60
+    },
+    {
+      "epoch": 8.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009839799749687109,
+      "loss": 0.0,
+      "step": 70
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00981476846057572,
+      "loss": 0.0,
+      "step": 80
+    },
+    {
+      "epoch": 11.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00978973717146433,
+      "loss": 0.0,
+      "step": 90
+    },
+    {
+      "epoch": 12.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009764705882352941,
+      "loss": 0.0,
+      "step": 100
+    },
+    {
+      "epoch": 13.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009739674593241552,
+      "loss": 0.0,
+      "step": 110
+    },
+    {
+      "epoch": 15.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009714643304130162,
+      "loss": 0.0,
+      "step": 120
+    },
+    {
+      "epoch": 16.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009689612015018775,
+      "loss": 0.0,
+      "step": 130
+    },
+    {
+      "epoch": 17.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009664580725907385,
+      "loss": 0.0,
+      "step": 140
+    },
+    {
+      "epoch": 18.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009639549436795996,
+      "loss": 0.0,
+      "step": 150
+    },
+    {
+      "epoch": 20.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009614518147684605,
+      "loss": 0.0,
+      "step": 160
+    },
+    {
+      "epoch": 21.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009589486858573217,
+      "loss": 0.0,
+      "step": 170
+    },
+    {
+      "epoch": 22.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009564455569461828,
+      "loss": 0.0,
+      "step": 180
+    },
+    {
+      "epoch": 23.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009539424280350439,
+      "loss": 0.0,
+      "step": 190
+    },
+    {
+      "epoch": 25.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00951439299123905,
+      "loss": 0.0,
+      "step": 200
+    },
+    {
+      "epoch": 26.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00948936170212766,
+      "loss": 0.0,
+      "step": 210
+    },
+    {
+      "epoch": 27.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00946433041301627,
+      "loss": 0.0,
+      "step": 220
+    },
+    {
+      "epoch": 28.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009439299123904881,
+      "loss": 0.0,
+      "step": 230
+    },
+    {
+      "epoch": 30.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009414267834793492,
+      "loss": 0.0,
+      "step": 240
+    },
+    {
+      "epoch": 31.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009389236545682102,
+      "loss": 0.0,
+      "step": 250
+    },
+    {
+      "epoch": 32.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009364205256570713,
+      "loss": 0.0,
+      "step": 260
+    },
+    {
+      "epoch": 33.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009339173967459325,
+      "loss": 0.0,
+      "step": 270
+    },
+    {
+      "epoch": 35.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009314142678347936,
+      "loss": 0.0,
+      "step": 280
+    },
+    {
+      "epoch": 36.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009289111389236547,
+      "loss": 0.0,
+      "step": 290
+    },
+    {
+      "epoch": 37.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009264080100125156,
+      "loss": 0.0,
+      "step": 300
+    },
+    {
+      "epoch": 38.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009239048811013768,
+      "loss": 0.0,
+      "step": 310
+    },
+    {
+      "epoch": 40.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009214017521902379,
+      "loss": 0.0,
+      "step": 320
+    },
+    {
+      "epoch": 41.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00918898623279099,
+      "loss": 0.0,
+      "step": 330
+    },
+    {
+      "epoch": 42.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0091639549436796,
+      "loss": 0.0,
+      "step": 340
+    },
+    {
+      "epoch": 43.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00913892365456821,
+      "loss": 0.0,
+      "step": 350
+    },
+    {
+      "epoch": 45.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009113892365456821,
+      "loss": 0.0,
+      "step": 360
+    },
+    {
+      "epoch": 46.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009088861076345432,
+      "loss": 0.0,
+      "step": 370
+    },
+    {
+      "epoch": 47.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009063829787234043,
+      "loss": 0.0,
+      "step": 380
+    },
+    {
+      "epoch": 48.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009038798498122653,
+      "loss": 0.0,
+      "step": 390
+    },
+    {
+      "epoch": 50.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.009013767209011264,
+      "loss": 0.0,
+      "step": 400
+    },
+    {
+      "epoch": 51.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008988735919899874,
+      "loss": 0.0,
+      "step": 410
+    },
+    {
+      "epoch": 52.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008963704630788487,
+      "loss": 0.0,
+      "step": 420
+    },
+    {
+      "epoch": 53.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008938673341677096,
+      "loss": 0.0,
+      "step": 430
+    },
+    {
+      "epoch": 55.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008913642052565706,
+      "loss": 0.0,
+      "step": 440
+    },
+    {
+      "epoch": 56.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008888610763454317,
+      "loss": 0.0,
+      "step": 450
+    },
+    {
+      "epoch": 57.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00886357947434293,
+      "loss": 0.0,
+      "step": 460
+    },
+    {
+      "epoch": 58.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00883854818523154,
+      "loss": 0.0,
+      "step": 470
+    },
+    {
+      "epoch": 60.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00881351689612015,
+      "loss": 0.0,
+      "step": 480
+    },
+    {
+      "epoch": 61.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008788485607008761,
+      "loss": 0.0,
+      "step": 490
+    },
+    {
+      "epoch": 62.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008763454317897372,
+      "loss": 0.0,
+      "step": 500
+    },
+    {
+      "epoch": 63.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008738423028785983,
+      "loss": 0.0,
+      "step": 510
+    },
+    {
+      "epoch": 65.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008713391739674593,
+      "loss": 0.0,
+      "step": 520
+    },
+    {
+      "epoch": 66.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008688360450563204,
+      "loss": 0.0,
+      "step": 530
+    },
+    {
+      "epoch": 67.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008663329161451815,
+      "loss": 0.0,
+      "step": 540
+    },
+    {
+      "epoch": 68.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008638297872340425,
+      "loss": 0.0,
+      "step": 550
+    },
+    {
+      "epoch": 70.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008613266583229038,
+      "loss": 0.0,
+      "step": 560
+    },
+    {
+      "epoch": 71.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008588235294117647,
+      "loss": 0.0,
+      "step": 570
+    },
+    {
+      "epoch": 72.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008563204005006257,
+      "loss": 0.0,
+      "step": 580
+    },
+    {
+      "epoch": 73.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008538172715894868,
+      "loss": 0.0,
+      "step": 590
+    },
+    {
+      "epoch": 75.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00851314142678348,
+      "loss": 0.0,
+      "step": 600
+    },
+    {
+      "epoch": 76.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00848811013767209,
+      "loss": 0.0,
+      "step": 610
+    },
+    {
+      "epoch": 77.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008463078848560701,
+      "loss": 0.0,
+      "step": 620
+    },
+    {
+      "epoch": 78.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008438047559449312,
+      "loss": 0.0,
+      "step": 630
+    },
+    {
+      "epoch": 80.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008413016270337923,
+      "loss": 0.0,
+      "step": 640
+    },
+    {
+      "epoch": 81.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008387984981226533,
+      "loss": 0.0,
+      "step": 650
+    },
+    {
+      "epoch": 82.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008362953692115144,
+      "loss": 0.0,
+      "step": 660
+    },
+    {
+      "epoch": 83.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008337922403003755,
+      "loss": 0.0,
+      "step": 670
+    },
+    {
+      "epoch": 85.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008312891113892365,
+      "loss": 0.0,
+      "step": 680
+    },
+    {
+      "epoch": 86.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008287859824780976,
+      "loss": 0.0,
+      "step": 690
+    },
+    {
+      "epoch": 87.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008262828535669588,
+      "loss": 0.0,
+      "step": 700
+    },
+    {
+      "epoch": 88.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008237797246558197,
+      "loss": 0.0,
+      "step": 710
+    },
+    {
+      "epoch": 90.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008212765957446808,
+      "loss": 0.0,
+      "step": 720
+    },
+    {
+      "epoch": 91.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008187734668335419,
+      "loss": 0.0,
+      "step": 730
+    },
+    {
+      "epoch": 92.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008162703379224031,
+      "loss": 0.0,
+      "step": 740
+    },
+    {
+      "epoch": 93.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008137672090112642,
+      "loss": 0.0,
+      "step": 750
+    },
+    {
+      "epoch": 95.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008112640801001252,
+      "loss": 0.0,
+      "step": 760
+    },
+    {
+      "epoch": 96.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008087609511889863,
+      "loss": 0.0,
+      "step": 770
+    },
+    {
+      "epoch": 97.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008062578222778474,
+      "loss": 0.0,
+      "step": 780
+    },
+    {
+      "epoch": 98.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008037546933667084,
+      "loss": 0.0,
+      "step": 790
+    },
+    {
+      "epoch": 100.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.008012515644555695,
+      "loss": 0.0,
+      "step": 800
+    },
+    {
+      "epoch": 101.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007987484355444305,
+      "loss": 0.0,
+      "step": 810
+    },
+    {
+      "epoch": 102.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007962453066332916,
+      "loss": 0.0,
+      "step": 820
+    },
+    {
+      "epoch": 103.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007937421777221527,
+      "loss": 0.0,
+      "step": 830
+    },
+    {
+      "epoch": 105.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007912390488110137,
+      "loss": 0.0,
+      "step": 840
+    },
+    {
+      "epoch": 106.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007887359198998748,
+      "loss": 0.0,
+      "step": 850
+    },
+    {
+      "epoch": 107.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007862327909887359,
+      "loss": 0.0,
+      "step": 860
+    },
+    {
+      "epoch": 108.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00783729662077597,
+      "loss": 0.0,
+      "step": 870
+    },
+    {
+      "epoch": 110.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007812265331664581,
+      "loss": 0.0,
+      "step": 880
+    },
+    {
+      "epoch": 111.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0077872340425531915,
+      "loss": 0.0,
+      "step": 890
+    },
+    {
+      "epoch": 112.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007762202753441803,
+      "loss": 0.0,
+      "step": 900
+    },
+    {
+      "epoch": 113.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007737171464330414,
+      "loss": 0.0,
+      "step": 910
+    },
+    {
+      "epoch": 115.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0077121401752190235,
+      "loss": 0.0,
+      "step": 920
+    },
+    {
+      "epoch": 116.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007687108886107634,
+      "loss": 0.0,
+      "step": 930
+    },
+    {
+      "epoch": 117.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007662077596996246,
+      "loss": 0.0,
+      "step": 940
+    },
+    {
+      "epoch": 118.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007637046307884856,
+      "loss": 0.0,
+      "step": 950
+    },
+    {
+      "epoch": 120.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007612015018773467,
+      "loss": 0.0,
+      "step": 960
+    },
+    {
+      "epoch": 121.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007586983729662078,
+      "loss": 0.0,
+      "step": 970
+    },
+    {
+      "epoch": 122.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007561952440550689,
+      "loss": 0.0,
+      "step": 980
+    },
+    {
+      "epoch": 123.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007536921151439299,
+      "loss": 0.0,
+      "step": 990
+    },
+    {
+      "epoch": 125.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0075118898623279095,
+      "loss": 0.0,
+      "step": 1000
+    },
+    {
+      "epoch": 126.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007486858573216521,
+      "loss": 0.0,
+      "step": 1010
+    },
+    {
+      "epoch": 127.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007461827284105132,
+      "loss": 0.0,
+      "step": 1020
+    },
+    {
+      "epoch": 128.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007436795994993742,
+      "loss": 0.0,
+      "step": 1030
+    },
+    {
+      "epoch": 130.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007411764705882354,
+      "loss": 0.0,
+      "step": 1040
+    },
+    {
+      "epoch": 131.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.0073867334167709645,
+      "loss": 0.0,
+      "step": 1050
+    },
+    {
+      "epoch": 132.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007361702127659574,
+      "loss": 0.0,
+      "step": 1060
+    },
+    {
+      "epoch": 133.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007336670838548185,
+      "loss": 0.0,
+      "step": 1070
+    },
+    {
+      "epoch": 135.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007311639549436796,
+      "loss": 0.0,
+      "step": 1080
+    },
+    {
+      "epoch": 136.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007286608260325407,
+      "loss": 0.0,
+      "step": 1090
+    },
+    {
+      "epoch": 137.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007261576971214018,
+      "loss": 0.0,
+      "step": 1100
+    },
+    {
+      "epoch": 138.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007236545682102628,
+      "loss": 0.0,
+      "step": 1110
+    },
+    {
+      "epoch": 140.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00721151439299124,
+      "loss": 0.0,
+      "step": 1120
+    },
+    {
+      "epoch": 141.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00718648310387985,
+      "loss": 0.0,
+      "step": 1130
+    },
+    {
+      "epoch": 142.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.00716145181476846,
+      "loss": 0.0,
+      "step": 1140
+    },
+    {
+      "epoch": 143.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007136420525657071,
+      "loss": 0.0,
+      "step": 1150
+    },
+    {
+      "epoch": 145.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007111389236545682,
+      "loss": 0.0,
+      "step": 1160
+    },
+    {
+      "epoch": 146.25,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007086357947434293,
+      "loss": 0.0,
+      "step": 1170
+    },
+    {
+      "epoch": 147.5,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007061326658322904,
+      "loss": 0.0,
+      "step": 1180
+    },
+    {
+      "epoch": 148.75,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007036295369211515,
+      "loss": 0.0,
+      "step": 1190
+    },
+    {
+      "epoch": 150.0,
+      "grad_norm": 0.0,
+      "learning_rate": 0.007011264080100125,
+      "loss": 0.0,
+      "step": 1200
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 4000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 500,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8100000.0,
+  "train_batch_size": 64,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-1200/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c6208b173b902d75ab9814341e70009f66ecc0ecd70d024038fcaaf650010173
+size 5137

checkpoint-1250/config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "activation_function": "gelu",
+  "add_cross_attention": false,
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.0,
+  "bos_token_id": 1,
+  "dtype": "float32",
+  "embd_pdrop": 0.0,
+  "eos_token_id": 1,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_embd": 1,
+  "n_head": 1,
+  "n_inner": 1,
+  "n_layer": 1,
+  "n_positions": 1,
+  "pad_token_id": 0,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.0,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.12.0",
+  "use_cache": false,
+  "vocab_size": 2
+}

checkpoint-1250/generation_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 1,
+  "output_attentions": false,
+  "output_hidden_states": false,
+  "pad_token_id": 0,
+  "transformers_version": "5.12.0",
+  "use_cache": true
+}