ThomasTheMaker committed on Sep 6, 2025

Commit

e6a8f10

verified ·

1 Parent(s): db1e772

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

README.md +58 -0
checkpoint-1000/config.json +31 -0
checkpoint-1000/generation_config.json +9 -0
checkpoint-1000/merges.txt +0 -0
checkpoint-1000/model.safetensors +3 -0
checkpoint-1000/optimizer.pt +3 -0
checkpoint-1000/rng_state.pth +3 -0
checkpoint-1000/scheduler.pt +3 -0
checkpoint-1000/special_tokens_map.json +42 -0
checkpoint-1000/tokenizer.json +0 -0
checkpoint-1000/tokenizer_config.json +168 -0
checkpoint-1000/trainer_state.json +134 -0
checkpoint-1000/training_args.bin +3 -0
checkpoint-1000/vocab.json +0 -0
checkpoint-1500/config.json +31 -0
checkpoint-1500/generation_config.json +9 -0
checkpoint-1500/merges.txt +0 -0
checkpoint-1500/model.safetensors +3 -0
checkpoint-1500/optimizer.pt +3 -0
checkpoint-1500/rng_state.pth +3 -0
checkpoint-1500/scheduler.pt +3 -0
checkpoint-1500/special_tokens_map.json +42 -0
checkpoint-1500/tokenizer.json +0 -0
checkpoint-1500/tokenizer_config.json +168 -0
checkpoint-1500/trainer_state.json +184 -0
checkpoint-1500/training_args.bin +3 -0
checkpoint-1500/vocab.json +0 -0
checkpoint-2000/config.json +31 -0
checkpoint-2000/generation_config.json +9 -0
checkpoint-2000/merges.txt +0 -0
checkpoint-2000/model.safetensors +3 -0
checkpoint-2000/optimizer.pt +3 -0
checkpoint-2000/rng_state.pth +3 -0
checkpoint-2000/scheduler.pt +3 -0
checkpoint-2000/special_tokens_map.json +42 -0
checkpoint-2000/tokenizer.json +0 -0
checkpoint-2000/tokenizer_config.json +168 -0
checkpoint-2000/trainer_state.json +234 -0
checkpoint-2000/training_args.bin +3 -0
checkpoint-2000/vocab.json +0 -0
checkpoint-2067/config.json +31 -0
checkpoint-2067/generation_config.json +9 -0
checkpoint-2067/merges.txt +0 -0
checkpoint-2067/model.safetensors +3 -0
checkpoint-2067/optimizer.pt +3 -0
checkpoint-2067/rng_state.pth +3 -0
checkpoint-2067/scheduler.pt +3 -0
checkpoint-2067/special_tokens_map.json +42 -0
checkpoint-2067/tokenizer.json +0 -0
checkpoint-2067/tokenizer_config.json +168 -0

README.md ADDED Viewed

	@@ -0,0 +1,58 @@

+---
+base_model: HuggingFaceTB/SmolLM2-135M
+library_name: transformers
+model_name: SmolLM2-360M-synthetic-concise-reasoning
+tags:
+- generated_from_trainer
+- sft
+- trl
+licence: license
+---
+# Model Card for SmolLM2-360M-synthetic-concise-reasoning
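+This model is a fine-tuned version of [HuggingFaceTB/SmolLM2-135M](https://huggingface.co/HuggingFaceTB/SmolLM2-135M). Note that, despite the "360M" in the model name, the base model and the checkpoint configuration correspond to the 135M-parameter SmolLM2.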
+It has been trained using [TRL](https://github.com/huggingface/trl).
+## Quick start
+```python
+from transformers import pipeline
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="argilla/SmolLM2-360M-synthetic-concise-reasoning", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+## Training procedure
+This model was trained with SFT.
+### Framework versions
+- TRL: 0.22.2
+- Transformers: 4.56.1
+- PyTorch: 2.6.0+cu118
+- Datasets: 4.0.0
+- Tokenizers: 0.22.0
+## Citations
+Cite TRL as:
+```bibtex
+@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```

checkpoint-1000/config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "dtype": "float32",
+  "eos_token_id": 0,
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 576,
+  "initializer_range": 0.041666666666666664,
+  "intermediate_size": 1536,
+  "is_llama_config": true,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 9,
+  "num_hidden_layers": 30,
+  "num_key_value_heads": 3,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_interleaved": false,
+  "rope_scaling": null,
+  "rope_theta": 100000,
+  "tie_word_embeddings": true,
+  "transformers_version": "4.56.1",
+  "use_cache": true,
+  "vocab_size": 49152
+}

checkpoint-1000/generation_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": [
+    0,
+    2
+  ],
+  "transformers_version": "4.56.1"
+}

checkpoint-1000/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-1000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a50959752f862381ff7f39dfccb85402123aa683c228a2be568318ccad2d4d86
+size 538090408

checkpoint-1000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e5aa72712c84d57764e814f5703010be6b080fc1e84b74eaaf4dfc24bc6cba25
+size 1076349050

checkpoint-1000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405
+size 14244

checkpoint-1000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:82923676bac30b43d960a8f638e325f0748321ea701521152093eaaf7a41a687
+size 1064

checkpoint-1000/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,42 @@

+{
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<|im_start|>",
+    "<|im_end|>",
+    "<repo_name>",
+    "<reponame>",
+    "<file_sep>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<jupyter_script>",
+    "<empty_output>"
+  ],
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-1000/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-1000/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,168 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<repo_name>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<reponame>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<file_sep>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<filename>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<gh_stars>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<jupyter_script>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<|im_start|>",
+    "<|im_end|>",
+    "<repo_name>",
+    "<reponame>",
+    "<file_sep>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<jupyter_script>",
+    "<empty_output>"
+  ],
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>",
+  "vocab_size": 49152
+}

checkpoint-1000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,134 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.4837929366231253,
+  "eval_steps": 500,
+  "global_step": 1000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "entropy": 1.9665795528888703,
+      "epoch": 0.04837929366231253,
+      "grad_norm": 4.359325885772705,
+      "learning_rate": 4.760522496371553e-05,
+      "loss": 1.7769,
+      "mean_token_accuracy": 0.5657356014847755,
+      "num_tokens": 50640.0,
+      "step": 100
+    },
+    {
+      "entropy": 1.9170209395885467,
+      "epoch": 0.09675858732462506,
+      "grad_norm": 2.7793142795562744,
+      "learning_rate": 4.5186260280599906e-05,
+      "loss": 1.7405,
+      "mean_token_accuracy": 0.567837278842926,
+      "num_tokens": 99726.0,
+      "step": 200
+    },
+    {
+      "entropy": 1.8398987126350403,
+      "epoch": 0.14513788098693758,
+      "grad_norm": 2.6684820652008057,
+      "learning_rate": 4.276729559748428e-05,
+      "loss": 1.6837,
+      "mean_token_accuracy": 0.5797487896680832,
+      "num_tokens": 150973.0,
+      "step": 300
+    },
+    {
+      "entropy": 1.854783646464348,
+      "epoch": 0.1935171746492501,
+      "grad_norm": 2.2839877605438232,
+      "learning_rate": 4.0348330914368655e-05,
+      "loss": 1.6963,
+      "mean_token_accuracy": 0.5789705204963684,
+      "num_tokens": 203850.0,
+      "step": 400
+    },
+    {
+      "entropy": 1.8420159757137298,
+      "epoch": 0.24189646831156264,
+      "grad_norm": 5.425283908843994,
+      "learning_rate": 3.792936623125303e-05,
+      "loss": 1.6699,
+      "mean_token_accuracy": 0.5854617989063263,
+      "num_tokens": 251974.0,
+      "step": 500
+    },
+    {
+      "entropy": 1.8752372413873672,
+      "epoch": 0.29027576197387517,
+      "grad_norm": 3.1188437938690186,
+      "learning_rate": 3.55104015481374e-05,
+      "loss": 1.7033,
+      "mean_token_accuracy": 0.5757111895084381,
+      "num_tokens": 301563.0,
+      "step": 600
+    },
+    {
+      "entropy": 1.8328426551818848,
+      "epoch": 0.3386550556361877,
+      "grad_norm": 3.9171409606933594,
+      "learning_rate": 3.309143686502178e-05,
+      "loss": 1.6552,
+      "mean_token_accuracy": 0.5872596988081932,
+      "num_tokens": 352401.0,
+      "step": 700
+    },
+    {
+      "entropy": 1.7965506362915038,
+      "epoch": 0.3870343492985002,
+      "grad_norm": 3.305162191390991,
+      "learning_rate": 3.0672472181906144e-05,
+      "loss": 1.6148,
+      "mean_token_accuracy": 0.591159172654152,
+      "num_tokens": 401032.0,
+      "step": 800
+    },
+    {
+      "entropy": 1.828446706533432,
+      "epoch": 0.43541364296081275,
+      "grad_norm": 3.2749598026275635,
+      "learning_rate": 2.8253507498790522e-05,
+      "loss": 1.6748,
+      "mean_token_accuracy": 0.5816510277986526,
+      "num_tokens": 453704.0,
+      "step": 900
+    },
+    {
+      "entropy": 1.8458160006999969,
+      "epoch": 0.4837929366231253,
+      "grad_norm": 4.897568702697754,
+      "learning_rate": 2.5834542815674896e-05,
+      "loss": 1.6887,
+      "mean_token_accuracy": 0.5766134199500084,
+      "num_tokens": 503598.0,
+      "step": 1000
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 2067,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 427061513942784.0,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-1000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0243dd8856240fbf0fd7008bf18425c39b8b15885c5015aadc422caea0037a44
+size 5752

checkpoint-1000/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-1500/config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "dtype": "float32",
+  "eos_token_id": 0,
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 576,
+  "initializer_range": 0.041666666666666664,
+  "intermediate_size": 1536,
+  "is_llama_config": true,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 9,
+  "num_hidden_layers": 30,
+  "num_key_value_heads": 3,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_interleaved": false,
+  "rope_scaling": null,
+  "rope_theta": 100000,
+  "tie_word_embeddings": true,
+  "transformers_version": "4.56.1",
+  "use_cache": true,
+  "vocab_size": 49152
+}

checkpoint-1500/generation_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": [
+    0,
+    2
+  ],
+  "transformers_version": "4.56.1"
+}

checkpoint-1500/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-1500/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:91f8300ae0214997e3ac0b2c45d763ac14b0b94bc62a09a4ba6028845abdc6d0
+size 538090408

checkpoint-1500/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ab2984dd7b044339b6b5b36845e08c04249dad9ab18d87ff0a6305d118b41031
+size 1076349050

checkpoint-1500/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405
+size 14244

checkpoint-1500/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:05c5a8254ba3982d5a11403ecd64e131e7bc842516095781dbccca9a89adde29
+size 1064

checkpoint-1500/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,42 @@

+{
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<|im_start|>",
+    "<|im_end|>",
+    "<repo_name>",
+    "<reponame>",
+    "<file_sep>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<jupyter_script>",
+    "<empty_output>"
+  ],
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-1500/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-1500/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,168 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<repo_name>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<reponame>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<file_sep>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<filename>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<gh_stars>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<jupyter_script>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<|im_start|>",
+    "<|im_end|>",
+    "<repo_name>",
+    "<reponame>",
+    "<file_sep>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<jupyter_script>",
+    "<empty_output>"
+  ],
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>",
+  "vocab_size": 49152
+}

checkpoint-1500/trainer_state.json ADDED Viewed

	@@ -0,0 +1,184 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.7256894049346879,
+  "eval_steps": 500,
+  "global_step": 1500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "entropy": 1.9665795528888703,
+      "epoch": 0.04837929366231253,
+      "grad_norm": 4.359325885772705,
+      "learning_rate": 4.760522496371553e-05,
+      "loss": 1.7769,
+      "mean_token_accuracy": 0.5657356014847755,
+      "num_tokens": 50640.0,
+      "step": 100
+    },
+    {
+      "entropy": 1.9170209395885467,
+      "epoch": 0.09675858732462506,
+      "grad_norm": 2.7793142795562744,
+      "learning_rate": 4.5186260280599906e-05,
+      "loss": 1.7405,
+      "mean_token_accuracy": 0.567837278842926,
+      "num_tokens": 99726.0,
+      "step": 200
+    },
+    {
+      "entropy": 1.8398987126350403,
+      "epoch": 0.14513788098693758,
+      "grad_norm": 2.6684820652008057,
+      "learning_rate": 4.276729559748428e-05,
+      "loss": 1.6837,
+      "mean_token_accuracy": 0.5797487896680832,
+      "num_tokens": 150973.0,
+      "step": 300
+    },
+    {
+      "entropy": 1.854783646464348,
+      "epoch": 0.1935171746492501,
+      "grad_norm": 2.2839877605438232,
+      "learning_rate": 4.0348330914368655e-05,
+      "loss": 1.6963,
+      "mean_token_accuracy": 0.5789705204963684,
+      "num_tokens": 203850.0,
+      "step": 400
+    },
+    {
+      "entropy": 1.8420159757137298,
+      "epoch": 0.24189646831156264,
+      "grad_norm": 5.425283908843994,
+      "learning_rate": 3.792936623125303e-05,
+      "loss": 1.6699,
+      "mean_token_accuracy": 0.5854617989063263,
+      "num_tokens": 251974.0,
+      "step": 500
+    },
+    {
+      "entropy": 1.8752372413873672,
+      "epoch": 0.29027576197387517,
+      "grad_norm": 3.1188437938690186,
+      "learning_rate": 3.55104015481374e-05,
+      "loss": 1.7033,
+      "mean_token_accuracy": 0.5757111895084381,
+      "num_tokens": 301563.0,
+      "step": 600
+    },
+    {
+      "entropy": 1.8328426551818848,
+      "epoch": 0.3386550556361877,
+      "grad_norm": 3.9171409606933594,
+      "learning_rate": 3.309143686502178e-05,
+      "loss": 1.6552,
+      "mean_token_accuracy": 0.5872596988081932,
+      "num_tokens": 352401.0,
+      "step": 700
+    },
+    {
+      "entropy": 1.7965506362915038,
+      "epoch": 0.3870343492985002,
+      "grad_norm": 3.305162191390991,
+      "learning_rate": 3.0672472181906144e-05,
+      "loss": 1.6148,
+      "mean_token_accuracy": 0.591159172654152,
+      "num_tokens": 401032.0,
+      "step": 800
+    },
+    {
+      "entropy": 1.828446706533432,
+      "epoch": 0.43541364296081275,
+      "grad_norm": 3.2749598026275635,
+      "learning_rate": 2.8253507498790522e-05,
+      "loss": 1.6748,
+      "mean_token_accuracy": 0.5816510277986526,
+      "num_tokens": 453704.0,
+      "step": 900
+    },
+    {
+      "entropy": 1.8458160006999969,
+      "epoch": 0.4837929366231253,
+      "grad_norm": 4.897568702697754,
+      "learning_rate": 2.5834542815674896e-05,
+      "loss": 1.6887,
+      "mean_token_accuracy": 0.5766134199500084,
+      "num_tokens": 503598.0,
+      "step": 1000
+    },
+    {
+      "entropy": 1.7846248948574066,
+      "epoch": 0.5321722302854378,
+      "grad_norm": 2.791334629058838,
+      "learning_rate": 2.3415578132559267e-05,
+      "loss": 1.6216,
+      "mean_token_accuracy": 0.5865279313921928,
+      "num_tokens": 556410.0,
+      "step": 1100
+    },
+    {
+      "entropy": 1.8374267256259917,
+      "epoch": 0.5805515239477503,
+      "grad_norm": 3.3878092765808105,
+      "learning_rate": 2.0996613449443638e-05,
+      "loss": 1.6959,
+      "mean_token_accuracy": 0.5801611566543579,
+      "num_tokens": 609825.0,
+      "step": 1200
+    },
+    {
+      "entropy": 1.8186497938632966,
+      "epoch": 0.6289308176100629,
+      "grad_norm": 2.651001214981079,
+      "learning_rate": 1.8577648766328012e-05,
+      "loss": 1.6508,
+      "mean_token_accuracy": 0.5890785497426987,
+      "num_tokens": 661304.0,
+      "step": 1300
+    },
+    {
+      "entropy": 1.83908866584301,
+      "epoch": 0.6773101112723754,
+      "grad_norm": 3.2351417541503906,
+      "learning_rate": 1.6158684083212386e-05,
+      "loss": 1.6724,
+      "mean_token_accuracy": 0.5820642611384392,
+      "num_tokens": 711260.0,
+      "step": 1400
+    },
+    {
+      "entropy": 1.801530545949936,
+      "epoch": 0.7256894049346879,
+      "grad_norm": 2.7686285972595215,
+      "learning_rate": 1.373971940009676e-05,
+      "loss": 1.6177,
+      "mean_token_accuracy": 0.5914771935343742,
+      "num_tokens": 761107.0,
+      "step": 1500
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 2067,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 646531805394432.0,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-1500/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0243dd8856240fbf0fd7008bf18425c39b8b15885c5015aadc422caea0037a44
+size 5752

checkpoint-1500/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-2000/config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "dtype": "float32",
+  "eos_token_id": 0,
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 576,
+  "initializer_range": 0.041666666666666664,
+  "intermediate_size": 1536,
+  "is_llama_config": true,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 9,
+  "num_hidden_layers": 30,
+  "num_key_value_heads": 3,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_interleaved": false,
+  "rope_scaling": null,
+  "rope_theta": 100000,
+  "tie_word_embeddings": true,
+  "transformers_version": "4.56.1",
+  "use_cache": true,
+  "vocab_size": 49152
+}

checkpoint-2000/generation_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": [
+    0,
+    2
+  ],
+  "transformers_version": "4.56.1"
+}

checkpoint-2000/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-2000/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:584e31ce045b77e2d8caa1ae7e8e7ee76fdba17301428089ffca350c7fb7337e
+size 538090408

checkpoint-2000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a58cdd2b12225dd40c4962180667e3bd9db0c8c8239fc00a63ff0a79863ae6f8
+size 1076349050

checkpoint-2000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405
+size 14244

checkpoint-2000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:42928213bfcfe4bec3101791e0c7e3efce3a6ae55386c28d08b0a8b5952cf77d
+size 1064

checkpoint-2000/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,42 @@

+{
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<|im_start|>",
+    "<|im_end|>",
+    "<repo_name>",
+    "<reponame>",
+    "<file_sep>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<jupyter_script>",
+    "<empty_output>"
+  ],
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-2000/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-2000/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,168 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<repo_name>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<reponame>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<file_sep>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<filename>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<gh_stars>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<jupyter_script>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<|im_start|>",
+    "<|im_end|>",
+    "<repo_name>",
+    "<reponame>",
+    "<file_sep>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<jupyter_script>",
+    "<empty_output>"
+  ],
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>",
+  "vocab_size": 49152
+}

checkpoint-2000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,234 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9675858732462506,
+  "eval_steps": 500,
+  "global_step": 2000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "entropy": 1.9665795528888703,
+      "epoch": 0.04837929366231253,
+      "grad_norm": 4.359325885772705,
+      "learning_rate": 4.760522496371553e-05,
+      "loss": 1.7769,
+      "mean_token_accuracy": 0.5657356014847755,
+      "num_tokens": 50640.0,
+      "step": 100
+    },
+    {
+      "entropy": 1.9170209395885467,
+      "epoch": 0.09675858732462506,
+      "grad_norm": 2.7793142795562744,
+      "learning_rate": 4.5186260280599906e-05,
+      "loss": 1.7405,
+      "mean_token_accuracy": 0.567837278842926,
+      "num_tokens": 99726.0,
+      "step": 200
+    },
+    {
+      "entropy": 1.8398987126350403,
+      "epoch": 0.14513788098693758,
+      "grad_norm": 2.6684820652008057,
+      "learning_rate": 4.276729559748428e-05,
+      "loss": 1.6837,
+      "mean_token_accuracy": 0.5797487896680832,
+      "num_tokens": 150973.0,
+      "step": 300
+    },
+    {
+      "entropy": 1.854783646464348,
+      "epoch": 0.1935171746492501,
+      "grad_norm": 2.2839877605438232,
+      "learning_rate": 4.0348330914368655e-05,
+      "loss": 1.6963,
+      "mean_token_accuracy": 0.5789705204963684,
+      "num_tokens": 203850.0,
+      "step": 400
+    },
+    {
+      "entropy": 1.8420159757137298,
+      "epoch": 0.24189646831156264,
+      "grad_norm": 5.425283908843994,
+      "learning_rate": 3.792936623125303e-05,
+      "loss": 1.6699,
+      "mean_token_accuracy": 0.5854617989063263,
+      "num_tokens": 251974.0,
+      "step": 500
+    },
+    {
+      "entropy": 1.8752372413873672,
+      "epoch": 0.29027576197387517,
+      "grad_norm": 3.1188437938690186,
+      "learning_rate": 3.55104015481374e-05,
+      "loss": 1.7033,
+      "mean_token_accuracy": 0.5757111895084381,
+      "num_tokens": 301563.0,
+      "step": 600
+    },
+    {
+      "entropy": 1.8328426551818848,
+      "epoch": 0.3386550556361877,
+      "grad_norm": 3.9171409606933594,
+      "learning_rate": 3.309143686502178e-05,
+      "loss": 1.6552,
+      "mean_token_accuracy": 0.5872596988081932,
+      "num_tokens": 352401.0,
+      "step": 700
+    },
+    {
+      "entropy": 1.7965506362915038,
+      "epoch": 0.3870343492985002,
+      "grad_norm": 3.305162191390991,
+      "learning_rate": 3.0672472181906144e-05,
+      "loss": 1.6148,
+      "mean_token_accuracy": 0.591159172654152,
+      "num_tokens": 401032.0,
+      "step": 800
+    },
+    {
+      "entropy": 1.828446706533432,
+      "epoch": 0.43541364296081275,
+      "grad_norm": 3.2749598026275635,
+      "learning_rate": 2.8253507498790522e-05,
+      "loss": 1.6748,
+      "mean_token_accuracy": 0.5816510277986526,
+      "num_tokens": 453704.0,
+      "step": 900
+    },
+    {
+      "entropy": 1.8458160006999969,
+      "epoch": 0.4837929366231253,
+      "grad_norm": 4.897568702697754,
+      "learning_rate": 2.5834542815674896e-05,
+      "loss": 1.6887,
+      "mean_token_accuracy": 0.5766134199500084,
+      "num_tokens": 503598.0,
+      "step": 1000
+    },
+    {
+      "entropy": 1.7846248948574066,
+      "epoch": 0.5321722302854378,
+      "grad_norm": 2.791334629058838,
+      "learning_rate": 2.3415578132559267e-05,
+      "loss": 1.6216,
+      "mean_token_accuracy": 0.5865279313921928,
+      "num_tokens": 556410.0,
+      "step": 1100
+    },
+    {
+      "entropy": 1.8374267256259917,
+      "epoch": 0.5805515239477503,
+      "grad_norm": 3.3878092765808105,
+      "learning_rate": 2.0996613449443638e-05,
+      "loss": 1.6959,
+      "mean_token_accuracy": 0.5801611566543579,
+      "num_tokens": 609825.0,
+      "step": 1200
+    },
+    {
+      "entropy": 1.8186497938632966,
+      "epoch": 0.6289308176100629,
+      "grad_norm": 2.651001214981079,
+      "learning_rate": 1.8577648766328012e-05,
+      "loss": 1.6508,
+      "mean_token_accuracy": 0.5890785497426987,
+      "num_tokens": 661304.0,
+      "step": 1300
+    },
+    {
+      "entropy": 1.83908866584301,
+      "epoch": 0.6773101112723754,
+      "grad_norm": 3.2351417541503906,
+      "learning_rate": 1.6158684083212386e-05,
+      "loss": 1.6724,
+      "mean_token_accuracy": 0.5820642611384392,
+      "num_tokens": 711260.0,
+      "step": 1400
+    },
+    {
+      "entropy": 1.801530545949936,
+      "epoch": 0.7256894049346879,
+      "grad_norm": 2.7686285972595215,
+      "learning_rate": 1.373971940009676e-05,
+      "loss": 1.6177,
+      "mean_token_accuracy": 0.5914771935343742,
+      "num_tokens": 761107.0,
+      "step": 1500
+    },
+    {
+      "entropy": 1.8067762792110442,
+      "epoch": 0.7740686985970004,
+      "grad_norm": 4.601089000701904,
+      "learning_rate": 1.1320754716981132e-05,
+      "loss": 1.6515,
+      "mean_token_accuracy": 0.5850321623682976,
+      "num_tokens": 810527.0,
+      "step": 1600
+    },
+    {
+      "entropy": 1.8136928272247315,
+      "epoch": 0.822447992259313,
+      "grad_norm": 2.1550955772399902,
+      "learning_rate": 8.901790033865507e-06,
+      "loss": 1.6373,
+      "mean_token_accuracy": 0.5840527075529098,
+      "num_tokens": 862714.0,
+      "step": 1700
+    },
+    {
+      "entropy": 1.7681069767475128,
+      "epoch": 0.8708272859216255,
+      "grad_norm": 2.4196958541870117,
+      "learning_rate": 6.48282535074988e-06,
+      "loss": 1.5913,
+      "mean_token_accuracy": 0.6007963755726814,
+      "num_tokens": 914855.0,
+      "step": 1800
+    },
+    {
+      "entropy": 1.838038477897644,
+      "epoch": 0.919206579583938,
+      "grad_norm": 5.095026016235352,
+      "learning_rate": 4.063860667634252e-06,
+      "loss": 1.6622,
+      "mean_token_accuracy": 0.5819806972146034,
+      "num_tokens": 966292.0,
+      "step": 1900
+    },
+    {
+      "entropy": 1.8221340310573577,
+      "epoch": 0.9675858732462506,
+      "grad_norm": 2.757889747619629,
+      "learning_rate": 1.6448959845186262e-06,
+      "loss": 1.6351,
+      "mean_token_accuracy": 0.5879274764657021,
+      "num_tokens": 1014330.0,
+      "step": 2000
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 2067,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 861103143827712.0,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-2000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0243dd8856240fbf0fd7008bf18425c39b8b15885c5015aadc422caea0037a44
+size 5752

checkpoint-2000/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-2067/config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "dtype": "float32",
+  "eos_token_id": 0,
+  "head_dim": 64,
+  "hidden_act": "silu",
+  "hidden_size": 576,
+  "initializer_range": 0.041666666666666664,
+  "intermediate_size": 1536,
+  "is_llama_config": true,
+  "max_position_embeddings": 8192,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 9,
+  "num_hidden_layers": 30,
+  "num_key_value_heads": 3,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_interleaved": false,
+  "rope_scaling": null,
+  "rope_theta": 100000,
+  "tie_word_embeddings": true,
+  "transformers_version": "4.56.1",
+  "use_cache": true,
+  "vocab_size": 49152
+}

checkpoint-2067/generation_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 0,
+  "eos_token_id": [
+    0,
+    2
+  ],
+  "transformers_version": "4.56.1"
+}

checkpoint-2067/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-2067/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8d3bedef1d4d4cb4182ef659f6bad3a5970545a62dfd847b56ceab9a66747861
+size 538090408

checkpoint-2067/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:88abf750276bd19226157a24e3b489d79535dad0710c1bf6b9b2cd9bf274aa6c
+size 1076349050

checkpoint-2067/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9d9cd6a0487226e5bd30d1846894c82af483733ab4381b75bae9c0745e05d405
+size 14244

checkpoint-2067/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:66a8911dbca2c60588eab8a3220f7ca25f8eb8068ad402110ebe854c5ac21ac6
+size 1064

checkpoint-2067/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,42 @@

+{
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<|im_start|>",
+    "<|im_end|>",
+    "<repo_name>",
+    "<reponame>",
+    "<file_sep>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<jupyter_script>",
+    "<empty_output>"
+  ],
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-2067/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-2067/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,168 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<repo_name>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<reponame>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "5": {
+      "content": "<file_sep>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "6": {
+      "content": "<filename>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "7": {
+      "content": "<gh_stars>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "8": {
+      "content": "<issue_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "9": {
+      "content": "<issue_comment>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "10": {
+      "content": "<issue_closed>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "11": {
+      "content": "<jupyter_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "12": {
+      "content": "<jupyter_text>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "13": {
+      "content": "<jupyter_code>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "14": {
+      "content": "<jupyter_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "15": {
+      "content": "<jupyter_script>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "16": {
+      "content": "<empty_output>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "<|im_start|>",
+    "<|im_end|>",
+    "<repo_name>",
+    "<reponame>",
+    "<file_sep>",
+    "<filename>",
+    "<gh_stars>",
+    "<issue_start>",
+    "<issue_comment>",
+    "<issue_closed>",
+    "<jupyter_start>",
+    "<jupyter_text>",
+    "<jupyter_code>",
+    "<jupyter_output>",
+    "<jupyter_script>",
+    "<empty_output>"
+  ],
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 8192,
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": "<|endoftext|>",
+  "vocab_size": 49152
+}