daibeiya committed on Oct 11, 2025

Commit

4d46bcb

verified ·

1 Parent(s): adddc7c

model upload

Browse files

Files changed (39) hide show

contextlm_gpt2_base/README.md +81 -0
contextlm_gpt2_base/config.json +43 -0
contextlm_gpt2_base/generation_config.json +6 -0
contextlm_gpt2_base/model.safetensors +3 -0
contextlm_gpt2_base/special_tokens_map.json +5 -0
contextlm_gpt2_base/tokenizer.json +0 -0
contextlm_gpt2_base/tokenizer_config.json +20 -0
contextlm_gpt2_base/trainer_state.json +0 -0
contextlm_gpt2_base/vocab.json +0 -0
contextlm_gpt2_large/README.md +81 -0
contextlm_gpt2_large/config.json +43 -0
contextlm_gpt2_large/generation_config.json +6 -0
contextlm_gpt2_large/model.safetensors +3 -0
contextlm_gpt2_large/special_tokens_map.json +5 -0
contextlm_gpt2_large/tokenizer.json +0 -0
contextlm_gpt2_large/tokenizer_config.json +20 -0
contextlm_gpt2_large/trainer_state.json +0 -0
contextlm_gpt2_large/vocab.json +0 -0
contextlm_gpt2_med/README.md +81 -0
contextlm_gpt2_med/config.json +45 -0
contextlm_gpt2_med/generation_config.json +6 -0
contextlm_gpt2_med/model.safetensors +3 -0
contextlm_gpt2_med/special_tokens_map.json +5 -0
contextlm_gpt2_med/tokenizer.json +0 -0
contextlm_gpt2_med/tokenizer_config.json +20 -0
contextlm_gpt2_med/trainer_state.json +0 -0
contextlm_gpt2_med/vocab.json +0 -0
contextlm_gpt2_xl/README.md +81 -0
contextlm_gpt2_xl/config.json +44 -0
contextlm_gpt2_xl/generation_config.json +6 -0
contextlm_gpt2_xl/model-00001-of-00002.safetensors +3 -0
contextlm_gpt2_xl/model-00002-of-00002.safetensors +3 -0
contextlm_gpt2_xl/model.safetensors.index.json +612 -0
contextlm_gpt2_xl/special_tokens_map.json +5 -0
contextlm_gpt2_xl/tokenizer.json +0 -0
contextlm_gpt2_xl/tokenizer_config.json +20 -0
contextlm_gpt2_xl/trainer_state.json +0 -0
contextlm_gpt2_xl/training_args.bin +3 -0
contextlm_gpt2_xl/vocab.json +0 -0

contextlm_gpt2_base/README.md ADDED Viewed

	@@ -0,0 +1,81 @@

+---
+library_name: transformers
+base_model: /fs-computility/plm/linzhouhan/daibeiya/models/gpt2
+tags:
+- generated_from_trainer
+datasets:
+- openwebtext
+model-index:
+- name: gpt2_base_contextlm_l0212_add_lnnorm_wodetach_v2_lr_bf16_lr1e-3
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# gpt2_base_contextlm_l0212_add_lnnorm_wodetach_v2_lr_bf16_lr1e-3
+This model is a fine-tuned version of [gpt2](https://huggingface.co/openai-community/gpt2) (loaded from the local path `/fs-computility/plm/linzhouhan/daibeiya/models/gpt2`) on the openwebtext dataset.
+It achieves the following results on the evaluation set:
+- Loss: 3.0300
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.001
+- train_batch_size: 16
+- eval_batch_size: 8
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 16
+- gradient_accumulation_steps: 2
+- total_train_batch_size: 512
+- total_eval_batch_size: 128
+- optimizer: adamw_torch with betas=(0.9, 0.95) and epsilon=1e-08 (no additional optimizer arguments)
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.05
+- num_epochs: 1.0
+### Training results
+| Training Loss | Epoch  | Step  | Validation Loss |
+|:-------------:|:------:|:-----:|:---------------:|
+| 3.9616        | 0.0580 | 1000  | 3.8911          |
+| 3.5512        | 0.1160 | 2000  | 3.4861          |
+| 3.4279        | 0.1741 | 3000  | 3.3560          |
+| 3.3471        | 0.2321 | 4000  | 3.2811          |
+| 3.2957        | 0.2901 | 5000  | 3.2321          |
+| 3.2677        | 0.3481 | 6000  | 3.1945          |
+| 3.225         | 0.4062 | 7000  | 3.1653          |
+| 3.2051        | 0.4642 | 8000  | 3.1390          |
+| 3.1816        | 0.5222 | 9000  | 3.1161          |
+| 3.1583        | 0.5802 | 10000 | 3.0971          |
+| 3.1464        | 0.6383 | 11000 | 3.0794          |
+| 3.1365        | 0.6963 | 12000 | 3.0645          |
+| 3.1256        | 0.7543 | 13000 | 3.0509          |
+| 3.1073        | 0.8123 | 14000 | 3.0417          |
+| 3.108         | 0.8703 | 15000 | 3.0349          |
+| 3.098         | 0.9284 | 16000 | 3.0312          |
+| 3.092         | 0.9864 | 17000 | 3.0301          |
+### Framework versions
+- Transformers 4.51.3
+- Pytorch 2.3.0+cu121
+- Datasets 4.0.0
+- Tokenizers 0.21.4

contextlm_gpt2_base/config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "TokenGPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "context_lm_layers": 2,
+  "embd_pdrop": 0.1,
+  "encoder_layers": 0,
+  "eos_token_id": 50256,
+  "hlm_n_embd": 768,
+  "hlm_n_head": 12,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "loss_type": "",
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 768,
+  "n_head": 12,
+  "n_inner": null,
+  "n_layer": 12,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 50
+    }
+  },
+  "torch_dtype": "float32",
+  "use_context_lm": true,
+  "vocab_size": 50257,
+  "w_size": 4
+}

contextlm_gpt2_base/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "transformers_version": "4.51.3"
+}

contextlm_gpt2_base/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1c916b3802836617a1c1adf9fee6a96918451676a6c74c50d71d341fbf8715fd
+size 708869264

contextlm_gpt2_base/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "unk_token": "<|endoftext|>"
+}

contextlm_gpt2_base/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

contextlm_gpt2_base/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "50256": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 1024,
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}

contextlm_gpt2_base/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

contextlm_gpt2_base/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

contextlm_gpt2_large/README.md ADDED Viewed

	@@ -0,0 +1,81 @@

+---
+library_name: transformers
+base_model: /fs-computility/plm/linzhouhan/daibeiya/models/gpt2-large
+tags:
+- generated_from_trainer
+datasets:
+- openwebtext
+model-index:
+- name: gpt2_large_contextlm_l0236_add_lnnorm_lr_bf16_lr6e-4
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# gpt2_large_contextlm_l0236_add_lnnorm_lr_bf16_lr6e-4
+This model is a fine-tuned version of [gpt2-large](https://huggingface.co/openai-community/gpt2-large) (loaded from the local path `/fs-computility/plm/linzhouhan/daibeiya/models/gpt2-large`) on the openwebtext dataset.
+It achieves the following results on the evaluation set:
+- Loss: 2.7357
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0006
+- train_batch_size: 4
+- eval_batch_size: 8
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 64
+- gradient_accumulation_steps: 2
+- total_train_batch_size: 512
+- total_eval_batch_size: 512
+- optimizer: adamw_torch with betas=(0.9, 0.95) and epsilon=1e-08 (no additional optimizer arguments)
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.05
+- num_epochs: 1.0
+### Training results
+| Training Loss | Epoch  | Step  | Validation Loss |
+|:-------------:|:------:|:-----:|:---------------:|
+| 3.8376        | 0.0580 | 1000  | 3.7978          |
+| 3.3595        | 0.1160 | 2000  | 3.3179          |
+| 3.207         | 0.1741 | 3000  | 3.1580          |
+| 3.0999        | 0.2321 | 4000  | 3.0631          |
+| 3.0333        | 0.2901 | 5000  | 2.9981          |
+| 2.9958        | 0.3481 | 6000  | 2.9496          |
+| 2.9443        | 0.4062 | 7000  | 2.9102          |
+| 2.9097        | 0.4642 | 8000  | 2.8760          |
+| 2.879         | 0.5222 | 9000  | 2.8451          |
+| 2.8506        | 0.5802 | 10000 | 2.8198          |
+| 2.831         | 0.6383 | 11000 | 2.7969          |
+| 2.8156        | 0.6963 | 12000 | 2.7781          |
+| 2.799         | 0.7543 | 13000 | 2.7616          |
+| 2.7802        | 0.8123 | 14000 | 2.7494          |
+| 2.7785        | 0.8703 | 15000 | 2.7414          |
+| 2.7706        | 0.9284 | 16000 | 2.7370          |
+| 2.7665        | 0.9864 | 17000 | 2.7357          |
+### Framework versions
+- Transformers 4.51.3
+- Pytorch 2.3.0+cu121
+- Datasets 4.0.0
+- Tokenizers 0.21.4

contextlm_gpt2_large/config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "TokenGPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "context_lm_layers": 2,
+  "embd_pdrop": 0.1,
+  "encoder_layers": 0,
+  "eos_token_id": 50256,
+  "hlm_n_embd": 1280,
+  "hlm_n_head": 20,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "loss_type": "",
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 1280,
+  "n_head": 20,
+  "n_inner": null,
+  "n_layer": 36,
+  "n_positions": 1024,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 50
+    }
+  },
+  "torch_dtype": "float32",
+  "use_context_lm": true,
+  "vocab_size": 50257,
+  "w_size": 4
+}

contextlm_gpt2_large/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "transformers_version": "4.51.3"
+}

contextlm_gpt2_large/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ace6b1c7f74a072624b7e6a60c078b0fd32f7e285fd5740dccfac2215afb2ee4
+size 3510903960

contextlm_gpt2_large/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "unk_token": "<|endoftext|>"
+}

contextlm_gpt2_large/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

contextlm_gpt2_large/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "50256": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 1024,
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}

contextlm_gpt2_large/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

contextlm_gpt2_large/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

contextlm_gpt2_med/README.md ADDED Viewed

	@@ -0,0 +1,81 @@

+---
+library_name: transformers
+base_model: /fs-computility/plm/linzhouhan/daibeiya/models/gpt2-medium
+tags:
+- generated_from_trainer
+datasets:
+- openwebtext
+model-index:
+- name: gpt2_med_contextlm_l0224_add_lnsnorm_lr_bf16_lr8e-4
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# gpt2_med_contextlm_l0224_add_lnsnorm_lr_bf16_lr8e-4
+This model is a fine-tuned version of [gpt2-medium](https://huggingface.co/openai-community/gpt2-medium) (loaded from the local path `/fs-computility/plm/linzhouhan/daibeiya/models/gpt2-medium`) on the openwebtext dataset.
+It achieves the following results on the evaluation set:
+- Loss: 2.8359
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0008
+- train_batch_size: 8
+- eval_batch_size: 8
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 32
+- gradient_accumulation_steps: 2
+- total_train_batch_size: 512
+- total_eval_batch_size: 256
+- optimizer: adamw_torch with betas=(0.9, 0.95) and epsilon=1e-08 (no additional optimizer arguments)
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.05
+- num_epochs: 1.0
+### Training results
+| Training Loss | Epoch  | Step  | Validation Loss |
+|:-------------:|:------:|:-----:|:---------------:|
+| 3.8765        | 0.0580 | 1000  | 3.8193          |
+| 3.421         | 0.1160 | 2000  | 3.3707          |
+| 3.279         | 0.1741 | 3000  | 3.2197          |
+| 3.1821        | 0.2321 | 4000  | 3.1376          |
+| 3.1191        | 0.2901 | 5000  | 3.0776          |
+| 3.0841        | 0.3481 | 6000  | 3.0317          |
+| 3.0417        | 0.4062 | 7000  | 2.9948          |
+| 3.0114        | 0.4642 | 8000  | 2.9659          |
+| 2.9864        | 0.5222 | 9000  | 2.9386          |
+| 2.9603        | 0.5802 | 10000 | 2.9137          |
+| 2.9408        | 0.6383 | 11000 | 2.8931          |
+| 2.926         | 0.6963 | 12000 | 2.8762          |
+| 2.9121        | 0.7543 | 13000 | 2.8603          |
+| 2.8928        | 0.8123 | 14000 | 2.8493          |
+| 2.8938        | 0.8703 | 15000 | 2.8413          |
+| 2.8893        | 0.9284 | 16000 | 2.8372          |
+| 2.8822        | 0.9864 | 17000 | 2.8359          |
+### Framework versions
+- Transformers 4.51.3
+- Pytorch 2.3.0+cu121
+- Datasets 4.0.0
+- Tokenizers 0.21.4

contextlm_gpt2_med/config.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "TokenGPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "context_lm_layers": 2,
+  "embd_pdrop": 0.1,
+  "encoder_layers": 0,
+  "eos_token_id": 50256,
+  "hlm_n_embd": 1024,
+  "hlm_n_head": 16,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "loss_type": "",
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 1024,
+  "n_head": 16,
+  "n_inner": null,
+  "n_layer": 24,
+  "n_positions": 1024,
+  "n_special": 0,
+  "predict_special_tokens": true,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 50
+    }
+  },
+  "torch_dtype": "float32",
+  "use_context_lm": true,
+  "vocab_size": 50257,
+  "w_size": 4
+}

contextlm_gpt2_med/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "transformers_version": "4.51.3"
+}

contextlm_gpt2_med/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:38870d5befca0c861aaccc44c93c3f73eec904b3193d6d7282e555ed9bdf930b
+size 1725948040

contextlm_gpt2_med/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "unk_token": "<|endoftext|>"
+}

contextlm_gpt2_med/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

contextlm_gpt2_med/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "50256": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 1024,
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}

contextlm_gpt2_med/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

contextlm_gpt2_med/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

contextlm_gpt2_xl/README.md ADDED Viewed

	@@ -0,0 +1,81 @@

+---
+library_name: transformers
+base_model: /fs-computility/plm/linzhouhan/daibeiya/models/gpt2-xl
+tags:
+- generated_from_trainer
+datasets:
+- openwebtext
+model-index:
+- name: gpt2_xl_contextlm_l0248_lnsnorm_add_lr_bf16_lr4e-4
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# gpt2_xl_contextlm_l0248_lnsnorm_add_lr_bf16_lr4e-4
+This model is a fine-tuned version of [gpt2-xl](https://huggingface.co/openai-community/gpt2-xl) (loaded from the local path `/fs-computility/plm/linzhouhan/daibeiya/models/gpt2-xl`) on the openwebtext dataset.
+It achieves the following results on the evaluation set:
+- Loss: 2.6815
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0004
+- train_batch_size: 2
+- eval_batch_size: 8
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 64
+- gradient_accumulation_steps: 4
+- total_train_batch_size: 512
+- total_eval_batch_size: 512
+- optimizer: adamw_torch with betas=(0.9, 0.95) and epsilon=1e-08 (no additional optimizer arguments)
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.05
+- num_epochs: 1.0
+### Training results
+| Training Loss | Epoch  | Step  | Validation Loss |
+|:-------------:|:------:|:-----:|:---------------:|
+| 3.886         | 0.0580 | 1000  | 3.8423          |
+| 3.3459        | 0.1160 | 2000  | 3.3027          |
+| 3.1809        | 0.1741 | 3000  | 3.1339          |
+| 3.0677        | 0.2321 | 4000  | 3.0360          |
+| 2.9938        | 0.2901 | 5000  | 2.9655          |
+| 2.954         | 0.3481 | 6000  | 2.9108          |
+| 2.8931        | 0.4062 | 7000  | 2.8683          |
+| 2.8584        | 0.4642 | 8000  | 2.8317          |
+| 2.8266        | 0.5222 | 9000  | 2.7985          |
+| 2.7939        | 0.5802 | 10000 | 2.7710          |
+| 2.7718        | 0.6383 | 11000 | 2.7471          |
+| 2.7557        | 0.6963 | 12000 | 2.7263          |
+| 2.7418        | 0.7543 | 13000 | 2.7088          |
+| 2.7167        | 0.8123 | 14000 | 2.6962          |
+| 2.7161        | 0.8704 | 15000 | 2.6872          |
+| 2.7094        | 0.9284 | 16000 | 2.6828          |
+| 2.7068        | 0.9864 | 17000 | 2.6815          |
+### Framework versions
+- Transformers 4.51.3
+- Pytorch 2.3.0+cu121
+- Datasets 4.0.0
+- Tokenizers 0.21.4

contextlm_gpt2_xl/config.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "activation_function": "gelu_new",
+  "architectures": [
+    "TokenGPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "context_lm_layers": 2,
+  "embd_pdrop": 0.1,
+  "encoder_layers": 0,
+  "eos_token_id": 50256,
+  "hlm_n_embd": 1600,
+  "hlm_n_head": 25,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "loss_type": "",
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 1600,
+  "n_head": 25,
+  "n_inner": null,
+  "n_layer": 48,
+  "n_positions": 1024,
+  "output_past": true,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 50
+    }
+  },
+  "torch_dtype": "float32",
+  "use_context_lm": true,
+  "vocab_size": 50257,
+  "w_size": 4
+}

contextlm_gpt2_xl/generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "transformers_version": "4.51.3"
+}

contextlm_gpt2_xl/model-00001-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:050d8631c04df02c1e144e4fc623797908fb6a2a675d726164d0405e3acf6118
+size 4959894384

contextlm_gpt2_xl/model-00002-of-00002.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4dd54c343114d53d15138ef50a72e82f927754f251a04f2af6c92f48b4f622ae
+size 1838185032

contextlm_gpt2_xl/model.safetensors.index.json ADDED Viewed

	@@ -0,0 +1,612 @@

+{
+  "metadata": {
+    "total_size": 6798016000
+  },
+  "weight_map": {
+    "context_decoder.0.attn.c_attn.bias": "model-00002-of-00002.safetensors",
+    "context_decoder.0.attn.c_attn.weight": "model-00002-of-00002.safetensors",
+    "context_decoder.0.attn.c_proj.bias": "model-00002-of-00002.safetensors",
+    "context_decoder.0.attn.c_proj.weight": "model-00002-of-00002.safetensors",
+    "context_decoder.0.ln_1.bias": "model-00002-of-00002.safetensors",
+    "context_decoder.0.ln_1.weight": "model-00002-of-00002.safetensors",
+    "context_decoder.0.ln_2.bias": "model-00002-of-00002.safetensors",
+    "context_decoder.0.ln_2.weight": "model-00002-of-00002.safetensors",
+    "context_decoder.0.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
+    "context_decoder.0.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
+    "context_decoder.0.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
+    "context_decoder.0.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
+    "context_decoder.1.attn.c_attn.bias": "model-00002-of-00002.safetensors",
+    "context_decoder.1.attn.c_attn.weight": "model-00002-of-00002.safetensors",
+    "context_decoder.1.attn.c_proj.bias": "model-00002-of-00002.safetensors",
+    "context_decoder.1.attn.c_proj.weight": "model-00002-of-00002.safetensors",
+    "context_decoder.1.ln_1.bias": "model-00002-of-00002.safetensors",
+    "context_decoder.1.ln_1.weight": "model-00002-of-00002.safetensors",
+    "context_decoder.1.ln_2.bias": "model-00002-of-00002.safetensors",
+    "context_decoder.1.ln_2.weight": "model-00002-of-00002.safetensors",
+    "context_decoder.1.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
+    "context_decoder.1.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
+    "context_decoder.1.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
+    "context_decoder.1.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
+    "lm_head.weight": "model-00002-of-00002.safetensors",
+    "ln_f.bias": "model-00001-of-00002.safetensors",
+    "ln_f.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.0.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.0.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.0.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.0.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.0.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.0.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.0.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.0.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.0.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.0.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.0.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.0.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.1.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.1.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.1.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.1.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.1.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.1.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.1.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.1.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.1.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.1.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.1.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.1.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.10.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.10.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.10.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.10.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.10.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.10.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.10.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.10.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.10.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.10.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.10.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.10.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.11.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.11.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.11.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.11.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.11.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.11.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.11.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.11.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.11.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.11.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.11.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.11.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.12.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.12.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.12.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.12.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.12.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.12.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.12.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.12.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.12.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.12.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.12.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.12.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.13.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.13.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.13.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.13.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.13.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.13.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.13.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.13.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.13.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.13.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.13.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.13.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.14.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.14.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.14.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.14.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.14.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.14.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.14.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.14.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.14.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.14.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.14.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.14.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.15.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.15.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.15.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.15.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.15.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.15.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.15.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.15.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.15.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.15.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.15.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.15.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.16.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.16.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.16.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.16.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.16.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.16.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.16.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.16.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.16.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.16.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.16.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.16.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.17.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.17.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.17.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.17.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.17.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.17.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.17.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.17.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.17.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.17.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.17.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.17.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.18.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.18.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.18.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.18.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.18.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.18.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.18.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.18.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.18.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.18.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.18.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.18.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.19.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.19.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.19.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.19.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.19.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.19.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.19.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.19.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.19.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.19.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.19.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.19.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.2.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.2.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.2.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.2.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.2.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.2.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.2.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.2.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.2.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.2.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.2.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.2.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.20.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.20.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.20.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.20.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.20.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.20.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.20.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.20.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.20.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.20.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.20.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.20.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.21.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.21.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.21.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.21.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.21.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.21.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.21.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.21.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.21.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.21.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.21.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.21.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.22.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.22.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.22.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.22.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.22.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.22.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.22.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.22.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.22.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.22.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.22.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.22.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.23.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.23.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.23.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.23.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.23.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.23.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.23.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.23.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.23.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.23.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.23.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.23.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.24.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.24.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.24.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.24.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.24.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.24.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.24.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.24.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.24.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.24.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.24.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.24.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.25.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.25.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.25.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.25.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.25.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.25.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.25.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.25.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.25.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.25.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.25.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.25.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.26.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.26.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.26.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.26.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.26.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.26.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.26.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.26.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.26.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.26.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.26.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.26.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.27.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.27.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.27.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.27.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.27.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.27.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.27.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.27.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.27.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.27.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.27.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.27.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.28.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.28.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.28.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.28.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.28.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.28.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.28.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.28.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.28.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.28.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.28.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.28.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.29.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.29.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.29.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.29.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.29.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.29.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.29.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.29.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.29.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.29.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.29.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.29.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.3.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.3.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.3.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.3.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.3.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.3.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.3.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.3.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.3.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.3.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.3.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.3.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.30.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.30.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.30.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.30.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.30.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.30.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.30.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.30.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.30.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.30.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.30.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.30.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.31.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.31.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.31.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.31.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.31.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.31.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.31.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.31.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.31.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.31.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.31.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.31.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.32.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.32.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.32.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.32.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.32.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.32.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.32.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.32.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.32.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.32.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.32.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.32.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.33.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.33.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.33.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.33.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.33.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.33.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.33.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.33.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.33.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.33.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.33.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.33.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.34.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.34.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.34.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.34.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.34.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.34.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.34.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.34.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.34.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.34.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.34.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.34.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.35.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.35.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.35.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.35.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.35.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.35.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.35.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.35.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.35.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.35.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.35.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.35.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.36.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.36.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.36.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.36.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.36.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.36.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.36.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.36.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.36.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.36.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.36.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.36.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.37.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.37.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.37.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.37.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.37.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.37.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.37.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.37.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.37.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.37.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.37.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.37.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.38.attn.c_attn.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.38.attn.c_attn.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.38.attn.c_proj.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.38.attn.c_proj.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.38.ln_1.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.38.ln_1.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.38.ln_2.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.38.ln_2.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.38.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.38.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.38.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.38.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.39.attn.c_attn.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.39.attn.c_attn.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.39.attn.c_proj.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.39.attn.c_proj.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.39.ln_1.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.39.ln_1.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.39.ln_2.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.39.ln_2.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.39.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.39.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.39.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.39.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.4.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.4.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.4.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.4.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.4.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.4.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.4.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.4.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.4.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.4.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.4.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.4.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.40.attn.c_attn.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.40.attn.c_attn.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.40.attn.c_proj.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.40.attn.c_proj.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.40.ln_1.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.40.ln_1.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.40.ln_2.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.40.ln_2.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.40.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.40.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.40.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.40.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.41.attn.c_attn.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.41.attn.c_attn.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.41.attn.c_proj.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.41.attn.c_proj.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.41.ln_1.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.41.ln_1.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.41.ln_2.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.41.ln_2.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.41.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.41.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.41.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.41.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.42.attn.c_attn.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.42.attn.c_attn.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.42.attn.c_proj.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.42.attn.c_proj.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.42.ln_1.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.42.ln_1.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.42.ln_2.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.42.ln_2.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.42.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.42.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.42.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.42.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.43.attn.c_attn.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.43.attn.c_attn.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.43.attn.c_proj.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.43.attn.c_proj.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.43.ln_1.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.43.ln_1.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.43.ln_2.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.43.ln_2.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.43.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.43.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.43.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.43.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.44.attn.c_attn.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.44.attn.c_attn.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.44.attn.c_proj.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.44.attn.c_proj.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.44.ln_1.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.44.ln_1.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.44.ln_2.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.44.ln_2.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.44.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.44.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.44.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.44.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.45.attn.c_attn.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.45.attn.c_attn.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.45.attn.c_proj.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.45.attn.c_proj.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.45.ln_1.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.45.ln_1.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.45.ln_2.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.45.ln_2.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.45.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.45.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.45.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.45.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.46.attn.c_attn.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.46.attn.c_attn.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.46.attn.c_proj.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.46.attn.c_proj.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.46.ln_1.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.46.ln_1.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.46.ln_2.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.46.ln_2.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.46.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.46.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.46.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.46.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.47.attn.c_attn.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.47.attn.c_attn.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.47.attn.c_proj.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.47.attn.c_proj.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.47.ln_1.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.47.ln_1.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.47.ln_2.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.47.ln_2.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.47.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.47.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.47.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
+    "token_decoder.47.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
+    "token_decoder.5.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.5.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.5.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.5.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.5.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.5.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.5.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.5.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.5.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.5.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.5.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.5.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.6.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.6.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.6.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.6.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.6.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.6.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.6.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.6.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.6.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.6.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.6.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.6.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.7.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.7.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.7.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.7.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.7.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.7.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.7.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.7.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.7.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.7.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.7.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.7.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.8.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.8.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.8.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.8.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.8.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.8.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.8.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.8.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.8.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.8.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.8.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.8.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.9.attn.c_attn.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.9.attn.c_attn.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.9.attn.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.9.attn.c_proj.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.9.ln_1.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.9.ln_1.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.9.ln_2.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.9.ln_2.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.9.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.9.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
+    "token_decoder.9.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
+    "token_decoder.9.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
+    "wpe.weight": "model-00001-of-00002.safetensors",
+    "wte.weight": "model-00001-of-00002.safetensors"
+  }
+}

contextlm_gpt2_xl/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "unk_token": "<|endoftext|>"
+}

contextlm_gpt2_xl/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

contextlm_gpt2_xl/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "50256": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 1024,
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>"
+}

contextlm_gpt2_xl/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

contextlm_gpt2_xl/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1a97880ab37b6430dcc4dd66cf7c42d52550b439fb8816fa28c8d32c8a6722bc
+size 5432

contextlm_gpt2_xl/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff