liangc40 committed on May 28, 2025

Commit

ca7286e

verified ·

1 Parent(s): 5233916

Upload folder using huggingface_hub

Browse files

Files changed (20) hide show

checkpoint-282/config.json +41 -0
checkpoint-282/generation_config.json +6 -0
checkpoint-282/model.safetensors +3 -0
checkpoint-282/optimizer.pt +3 -0
checkpoint-282/rng_state.pth +3 -0
checkpoint-282/scaler.pt +3 -0
checkpoint-282/scheduler.pt +3 -0
checkpoint-282/special_tokens_map.json +7 -0
checkpoint-282/tokenizer.json +0 -0
checkpoint-282/tokenizer_config.json +58 -0
checkpoint-282/trainer_state.json +230 -0
checkpoint-282/training_args.bin +3 -0
checkpoint-282/vocab.txt +0 -0
config.json +41 -0
generation_config.json +6 -0
model.safetensors +3 -0
special_tokens_map.json +7 -0
tokenizer.json +0 -0
tokenizer_config.json +58 -0
vocab.txt +0 -0

checkpoint-282/config.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "gradient_checkpointing": false,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 12,
+  "n_positions": 1024,
+  "output_past": true,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 120
+    }
+  },
+  "tokenizer_class": "BertTokenizer",
+  "torch_dtype": "float32",
+  "transformers_version": "4.52.2",
+  "use_cache": true,
+  "vocab_size": 22557
+}

checkpoint-282/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "transformers_version": "4.52.2"
+}

checkpoint-282/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:09510b232645240c2ad0d7e03da8b7664e7fefb179efc7af2c98279daba1e737
+size 412679808

checkpoint-282/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8b7a49406660605bedb67139e0739c7528a4c1c82f4cdcddc5bb40b29ed20f36
+size 825453498

checkpoint-282/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3837a2b51525e412b7700c9c872dacba4f277c1a3292053e3a1ccd8b5d06bcc4
+size 14244

checkpoint-282/scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:82fbb8a41699527f8e897a6a4653fd200a409dcfcca603338998c5883c75ff74
+size 988

checkpoint-282/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9ad6dd2b5727fe8a6f4108f69903c823118467fa11ae53cc14acdc6b5095985b
+size 1064

checkpoint-282/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

checkpoint-282/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-282/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

checkpoint-282/trainer_state.json ADDED Viewed

	@@ -0,0 +1,230 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 282,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.10638297872340426,
+      "grad_norm": 2.73763108253479,
+      "learning_rate": 4.840425531914894e-05,
+      "loss": 7.0593,
+      "step": 10
+    },
+    {
+      "epoch": 0.2127659574468085,
+      "grad_norm": 4.184208869934082,
+      "learning_rate": 4.663120567375887e-05,
+      "loss": 4.3509,
+      "step": 20
+    },
+    {
+      "epoch": 0.3191489361702128,
+      "grad_norm": 2.815650224685669,
+      "learning_rate": 4.48581560283688e-05,
+      "loss": 2.3531,
+      "step": 30
+    },
+    {
+      "epoch": 0.425531914893617,
+      "grad_norm": 1.4420971870422363,
+      "learning_rate": 4.3085106382978725e-05,
+      "loss": 1.3577,
+      "step": 40
+    },
+    {
+      "epoch": 0.5319148936170213,
+      "grad_norm": 1.3123074769973755,
+      "learning_rate": 4.1312056737588654e-05,
+      "loss": 0.8151,
+      "step": 50
+    },
+    {
+      "epoch": 0.6382978723404256,
+      "grad_norm": 1.1796232461929321,
+      "learning_rate": 3.953900709219858e-05,
+      "loss": 0.7291,
+      "step": 60
+    },
+    {
+      "epoch": 0.7446808510638298,
+      "grad_norm": 1.0259984731674194,
+      "learning_rate": 3.776595744680852e-05,
+      "loss": 0.5471,
+      "step": 70
+    },
+    {
+      "epoch": 0.851063829787234,
+      "grad_norm": 0.7393821477890015,
+      "learning_rate": 3.599290780141844e-05,
+      "loss": 0.5002,
+      "step": 80
+    },
+    {
+      "epoch": 0.9574468085106383,
+      "grad_norm": 1.040688157081604,
+      "learning_rate": 3.4219858156028374e-05,
+      "loss": 0.4176,
+      "step": 90
+    },
+    {
+      "epoch": 1.0638297872340425,
+      "grad_norm": 0.8522056341171265,
+      "learning_rate": 3.2446808510638296e-05,
+      "loss": 0.4897,
+      "step": 100
+    },
+    {
+      "epoch": 1.1702127659574468,
+      "grad_norm": 0.6385951638221741,
+      "learning_rate": 3.067375886524823e-05,
+      "loss": 0.3598,
+      "step": 110
+    },
+    {
+      "epoch": 1.2765957446808511,
+      "grad_norm": 0.5381748676300049,
+      "learning_rate": 2.8900709219858156e-05,
+      "loss": 0.332,
+      "step": 120
+    },
+    {
+      "epoch": 1.3829787234042552,
+      "grad_norm": 0.697250247001648,
+      "learning_rate": 2.7127659574468084e-05,
+      "loss": 0.4078,
+      "step": 130
+    },
+    {
+      "epoch": 1.4893617021276595,
+      "grad_norm": 0.7715345025062561,
+      "learning_rate": 2.5354609929078016e-05,
+      "loss": 0.3357,
+      "step": 140
+    },
+    {
+      "epoch": 1.5957446808510638,
+      "grad_norm": 0.5877780914306641,
+      "learning_rate": 2.3581560283687945e-05,
+      "loss": 0.3374,
+      "step": 150
+    },
+    {
+      "epoch": 1.702127659574468,
+      "grad_norm": 1.4718451499938965,
+      "learning_rate": 2.1808510638297873e-05,
+      "loss": 0.4327,
+      "step": 160
+    },
+    {
+      "epoch": 1.8085106382978724,
+      "grad_norm": 0.5314385890960693,
+      "learning_rate": 2.0035460992907805e-05,
+      "loss": 0.357,
+      "step": 170
+    },
+    {
+      "epoch": 1.9148936170212765,
+      "grad_norm": 0.5589902400970459,
+      "learning_rate": 1.8262411347517733e-05,
+      "loss": 0.3687,
+      "step": 180
+    },
+    {
+      "epoch": 2.021276595744681,
+      "grad_norm": 0.5115184187889099,
+      "learning_rate": 1.6489361702127658e-05,
+      "loss": 0.2445,
+      "step": 190
+    },
+    {
+      "epoch": 2.127659574468085,
+      "grad_norm": 0.5463274121284485,
+      "learning_rate": 1.4716312056737588e-05,
+      "loss": 0.2548,
+      "step": 200
+    },
+    {
+      "epoch": 2.2340425531914896,
+      "grad_norm": 0.5499547123908997,
+      "learning_rate": 1.2943262411347517e-05,
+      "loss": 0.3375,
+      "step": 210
+    },
+    {
+      "epoch": 2.3404255319148937,
+      "grad_norm": 0.49347832798957825,
+      "learning_rate": 1.1170212765957447e-05,
+      "loss": 0.3534,
+      "step": 220
+    },
+    {
+      "epoch": 2.4468085106382977,
+      "grad_norm": 0.5885722041130066,
+      "learning_rate": 9.397163120567375e-06,
+      "loss": 0.2803,
+      "step": 230
+    },
+    {
+      "epoch": 2.5531914893617023,
+      "grad_norm": 0.5481957197189331,
+      "learning_rate": 7.6241134751773054e-06,
+      "loss": 0.3067,
+      "step": 240
+    },
+    {
+      "epoch": 2.6595744680851063,
+      "grad_norm": 0.49966493248939514,
+      "learning_rate": 5.851063829787235e-06,
+      "loss": 0.317,
+      "step": 250
+    },
+    {
+      "epoch": 2.7659574468085104,
+      "grad_norm": 0.7681831121444702,
+      "learning_rate": 4.078014184397164e-06,
+      "loss": 0.3071,
+      "step": 260
+    },
+    {
+      "epoch": 2.872340425531915,
+      "grad_norm": 0.8694835901260376,
+      "learning_rate": 2.304964539007092e-06,
+      "loss": 0.2728,
+      "step": 270
+    },
+    {
+      "epoch": 2.978723404255319,
+      "grad_norm": 0.5909697413444519,
+      "learning_rate": 5.319148936170213e-07,
+      "loss": 0.3545,
+      "step": 280
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 282,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 585555443712000.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-282/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9d5a1110da641557af387452a47e512af163f4302d8ce372d6f380832ead79fd
+size 5304

checkpoint-282/vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

config.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "gradient_checkpointing": false,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 12,
+  "n_positions": 1024,
+  "output_past": true,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 120
+    }
+  },
+  "tokenizer_class": "BertTokenizer",
+  "torch_dtype": "float32",
+  "transformers_version": "4.52.2",
+  "use_cache": true,
+  "vocab_size": 22557
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "transformers_version": "4.52.2"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:09510b232645240c2ad0d7e03da8b7664e7fefb179efc7af2c98279daba1e737
+size 412679808

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff