ThomasTheMaker committed on
Commit
0087b06
·
verified ·
1 Parent(s): 9c3c7a7

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. README.md +14 -0
  2. arc_100m_16b/checkpoint-15000/chat_template.jinja +15 -0
  3. arc_100m_16b/checkpoint-15000/config.json +30 -0
  4. arc_100m_16b/checkpoint-15000/generation_config.json +8 -0
  5. arc_100m_16b/checkpoint-15000/special_tokens_map.json +30 -0
  6. arc_100m_16b/checkpoint-15000/tokenizer.json +0 -0
  7. arc_100m_16b/checkpoint-15000/tokenizer_config.json +43 -0
  8. arc_100m_16b/checkpoint-30000/chat_template.jinja +15 -0
  9. arc_100m_16b/checkpoint-30000/config.json +30 -0
  10. arc_100m_16b/checkpoint-30000/generation_config.json +8 -0
  11. arc_100m_16b/checkpoint-30000/special_tokens_map.json +30 -0
  12. arc_100m_16b/checkpoint-30000/tokenizer.json +0 -0
  13. arc_100m_16b/checkpoint-30000/tokenizer_config.json +43 -0
  14. arc_100m_16b/checkpoint-35000/chat_template.jinja +15 -0
  15. arc_100m_16b/checkpoint-35000/config.json +30 -0
  16. arc_100m_16b/checkpoint-35000/generation_config.json +8 -0
  17. arc_100m_16b/checkpoint-35000/special_tokens_map.json +30 -0
  18. arc_100m_16b/checkpoint-35000/tokenizer.json +0 -0
  19. arc_100m_16b/checkpoint-35000/tokenizer_config.json +43 -0
  20. arc_100m_16b/checkpoint-40000/chat_template.jinja +15 -0
  21. arc_100m_16b/checkpoint-40000/config.json +30 -0
  22. arc_100m_16b/checkpoint-40000/generation_config.json +8 -0
  23. arc_100m_16b/checkpoint-40000/special_tokens_map.json +30 -0
  24. arc_100m_16b/checkpoint-40000/tokenizer.json +0 -0
  25. arc_100m_16b/checkpoint-45000/chat_template.jinja +15 -0
  26. arc_100m_16b/checkpoint-45000/config.json +30 -0
  27. arc_100m_16b/checkpoint-45000/generation_config.json +8 -0
  28. arc_100m_16b/checkpoint-45000/special_tokens_map.json +30 -0
  29. arc_100m_16b/checkpoint-45000/tokenizer.json +0 -0
  30. arc_100m_16b/checkpoint-45000/tokenizer_config.json +43 -0
  31. arc_100m_16b/checkpoint-75000/chat_template.jinja +15 -0
  32. arc_100m_16b/checkpoint-75000/config.json +30 -0
  33. arc_100m_16b/checkpoint-75000/generation_config.json +8 -0
  34. arc_100m_16b/checkpoint-75000/special_tokens_map.json +30 -0
  35. arc_100m_16b/checkpoint-75000/tokenizer.json +0 -0
  36. arc_100m_16b/checkpoint-75000/tokenizer_config.json +43 -0
  37. arc_100m_16b/checkpoint-80000/chat_template.jinja +15 -0
  38. arc_100m_16b/checkpoint-80000/config.json +30 -0
  39. arc_100m_16b/checkpoint-80000/generation_config.json +8 -0
  40. arc_100m_16b/checkpoint-80000/special_tokens_map.json +30 -0
  41. arc_100m_16b/checkpoint-80000/tokenizer.json +0 -0
  42. arc_100m_16b/checkpoint-80000/tokenizer_config.json +43 -0
  43. arc_100m_16b/final-model/chat_template.jinja +15 -0
  44. arc_100m_16b/final-model/config.json +30 -0
  45. arc_100m_16b/final-model/generation_config.json +8 -0
  46. arc_100m_16b/final-model/special_tokens_map.json +30 -0
  47. arc_100m_16b/final-model/tokenizer.json +0 -0
  48. arc_100m_16b/final-model/tokenizer_config.json +43 -0
  49. pre.py +345 -0
  50. requirements.txt +145 -0
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Pretraining arc lm
2
+
3
+ git clone https://github.com/huggingface/nanotron
4
+ python3 -m venv venv
5
+ source venv/bin/activate
6
+ python -m pip install --upgrade pip
7
+ cd nanotron
8
+ pip install -e .
9
+ pip install datasets transformers datatrove[io] numba wandb
10
+ pip install wheel ninja triton flash-attn --no-build-isolation
11
+ huggingface-cli login
12
+ wandb login
13
+
14
+ cd ..
arc_100m_16b/checkpoint-15000/chat_template.jinja ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% for message in messages %}
2
+ {% if message['role'] == 'user' %}
3
+ {{ '<|user|>
4
+ ' + message['content'] + eos_token }}
5
+ {% elif message['role'] == 'system' %}
6
+ {{ '<|system|>
7
+ ' + message['content'] + eos_token }}
8
+ {% elif message['role'] == 'assistant' %}
9
+ {{ '<|assistant|>
10
+ ' + message['content'] + eos_token }}
11
+ {% endif %}
12
+ {% if loop.last and add_generation_prompt %}
13
+ {{ '<|assistant|>' }}
14
+ {% endif %}
15
+ {% endfor %}
arc_100m_16b/checkpoint-15000/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 2048,
15
+ "max_position_embeddings": 4096,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "num_key_value_heads": 4,
21
+ "pad_token_id": 2,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-06,
24
+ "rope_scaling": null,
25
+ "rope_theta": 10000.0,
26
+ "tie_word_embeddings": false,
27
+ "transformers_version": "4.57.1",
28
+ "use_cache": false,
29
+ "vocab_size": 32000
30
+ }
arc_100m_16b/checkpoint-15000/generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 2,
6
+ "transformers_version": "4.57.1",
7
+ "use_cache": false
8
+ }
arc_100m_16b/checkpoint-15000/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
arc_100m_16b/checkpoint-15000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
arc_100m_16b/checkpoint-15000/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 2048,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
arc_100m_16b/checkpoint-30000/chat_template.jinja ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% for message in messages %}
2
+ {% if message['role'] == 'user' %}
3
+ {{ '<|user|>
4
+ ' + message['content'] + eos_token }}
5
+ {% elif message['role'] == 'system' %}
6
+ {{ '<|system|>
7
+ ' + message['content'] + eos_token }}
8
+ {% elif message['role'] == 'assistant' %}
9
+ {{ '<|assistant|>
10
+ ' + message['content'] + eos_token }}
11
+ {% endif %}
12
+ {% if loop.last and add_generation_prompt %}
13
+ {{ '<|assistant|>' }}
14
+ {% endif %}
15
+ {% endfor %}
arc_100m_16b/checkpoint-30000/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 2048,
15
+ "max_position_embeddings": 4096,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "num_key_value_heads": 4,
21
+ "pad_token_id": 2,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-06,
24
+ "rope_scaling": null,
25
+ "rope_theta": 10000.0,
26
+ "tie_word_embeddings": false,
27
+ "transformers_version": "4.57.1",
28
+ "use_cache": false,
29
+ "vocab_size": 32000
30
+ }
arc_100m_16b/checkpoint-30000/generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 2,
6
+ "transformers_version": "4.57.1",
7
+ "use_cache": false
8
+ }
arc_100m_16b/checkpoint-30000/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
arc_100m_16b/checkpoint-30000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
arc_100m_16b/checkpoint-30000/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 2048,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
arc_100m_16b/checkpoint-35000/chat_template.jinja ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% for message in messages %}
2
+ {% if message['role'] == 'user' %}
3
+ {{ '<|user|>
4
+ ' + message['content'] + eos_token }}
5
+ {% elif message['role'] == 'system' %}
6
+ {{ '<|system|>
7
+ ' + message['content'] + eos_token }}
8
+ {% elif message['role'] == 'assistant' %}
9
+ {{ '<|assistant|>
10
+ ' + message['content'] + eos_token }}
11
+ {% endif %}
12
+ {% if loop.last and add_generation_prompt %}
13
+ {{ '<|assistant|>' }}
14
+ {% endif %}
15
+ {% endfor %}
arc_100m_16b/checkpoint-35000/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 2048,
15
+ "max_position_embeddings": 4096,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "num_key_value_heads": 4,
21
+ "pad_token_id": 2,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-06,
24
+ "rope_scaling": null,
25
+ "rope_theta": 10000.0,
26
+ "tie_word_embeddings": false,
27
+ "transformers_version": "4.57.1",
28
+ "use_cache": false,
29
+ "vocab_size": 32000
30
+ }
arc_100m_16b/checkpoint-35000/generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 2,
6
+ "transformers_version": "4.57.1",
7
+ "use_cache": false
8
+ }
arc_100m_16b/checkpoint-35000/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
arc_100m_16b/checkpoint-35000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
arc_100m_16b/checkpoint-35000/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 2048,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
arc_100m_16b/checkpoint-40000/chat_template.jinja ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% for message in messages %}
2
+ {% if message['role'] == 'user' %}
3
+ {{ '<|user|>
4
+ ' + message['content'] + eos_token }}
5
+ {% elif message['role'] == 'system' %}
6
+ {{ '<|system|>
7
+ ' + message['content'] + eos_token }}
8
+ {% elif message['role'] == 'assistant' %}
9
+ {{ '<|assistant|>
10
+ ' + message['content'] + eos_token }}
11
+ {% endif %}
12
+ {% if loop.last and add_generation_prompt %}
13
+ {{ '<|assistant|>' }}
14
+ {% endif %}
15
+ {% endfor %}
arc_100m_16b/checkpoint-40000/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 2048,
15
+ "max_position_embeddings": 4096,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "num_key_value_heads": 4,
21
+ "pad_token_id": 2,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-06,
24
+ "rope_scaling": null,
25
+ "rope_theta": 10000.0,
26
+ "tie_word_embeddings": false,
27
+ "transformers_version": "4.57.1",
28
+ "use_cache": false,
29
+ "vocab_size": 32000
30
+ }
arc_100m_16b/checkpoint-40000/generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 2,
6
+ "transformers_version": "4.57.1",
7
+ "use_cache": false
8
+ }
arc_100m_16b/checkpoint-40000/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
arc_100m_16b/checkpoint-40000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
arc_100m_16b/checkpoint-45000/chat_template.jinja ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% for message in messages %}
2
+ {% if message['role'] == 'user' %}
3
+ {{ '<|user|>
4
+ ' + message['content'] + eos_token }}
5
+ {% elif message['role'] == 'system' %}
6
+ {{ '<|system|>
7
+ ' + message['content'] + eos_token }}
8
+ {% elif message['role'] == 'assistant' %}
9
+ {{ '<|assistant|>
10
+ ' + message['content'] + eos_token }}
11
+ {% endif %}
12
+ {% if loop.last and add_generation_prompt %}
13
+ {{ '<|assistant|>' }}
14
+ {% endif %}
15
+ {% endfor %}
arc_100m_16b/checkpoint-45000/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 2048,
15
+ "max_position_embeddings": 4096,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "num_key_value_heads": 4,
21
+ "pad_token_id": 2,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-06,
24
+ "rope_scaling": null,
25
+ "rope_theta": 10000.0,
26
+ "tie_word_embeddings": false,
27
+ "transformers_version": "4.57.1",
28
+ "use_cache": false,
29
+ "vocab_size": 32000
30
+ }
arc_100m_16b/checkpoint-45000/generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 2,
6
+ "transformers_version": "4.57.1",
7
+ "use_cache": false
8
+ }
arc_100m_16b/checkpoint-45000/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
arc_100m_16b/checkpoint-45000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
arc_100m_16b/checkpoint-45000/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 2048,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
arc_100m_16b/checkpoint-75000/chat_template.jinja ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% for message in messages %}
2
+ {% if message['role'] == 'user' %}
3
+ {{ '<|user|>
4
+ ' + message['content'] + eos_token }}
5
+ {% elif message['role'] == 'system' %}
6
+ {{ '<|system|>
7
+ ' + message['content'] + eos_token }}
8
+ {% elif message['role'] == 'assistant' %}
9
+ {{ '<|assistant|>
10
+ ' + message['content'] + eos_token }}
11
+ {% endif %}
12
+ {% if loop.last and add_generation_prompt %}
13
+ {{ '<|assistant|>' }}
14
+ {% endif %}
15
+ {% endfor %}
arc_100m_16b/checkpoint-75000/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 2048,
15
+ "max_position_embeddings": 4096,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "num_key_value_heads": 4,
21
+ "pad_token_id": 2,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-06,
24
+ "rope_scaling": null,
25
+ "rope_theta": 10000.0,
26
+ "tie_word_embeddings": false,
27
+ "transformers_version": "4.57.1",
28
+ "use_cache": false,
29
+ "vocab_size": 32000
30
+ }
arc_100m_16b/checkpoint-75000/generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 2,
6
+ "transformers_version": "4.57.1",
7
+ "use_cache": false
8
+ }
arc_100m_16b/checkpoint-75000/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
arc_100m_16b/checkpoint-75000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
arc_100m_16b/checkpoint-75000/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 2048,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
arc_100m_16b/checkpoint-80000/chat_template.jinja ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% for message in messages %}
2
+ {% if message['role'] == 'user' %}
3
+ {{ '<|user|>
4
+ ' + message['content'] + eos_token }}
5
+ {% elif message['role'] == 'system' %}
6
+ {{ '<|system|>
7
+ ' + message['content'] + eos_token }}
8
+ {% elif message['role'] == 'assistant' %}
9
+ {{ '<|assistant|>
10
+ ' + message['content'] + eos_token }}
11
+ {% endif %}
12
+ {% if loop.last and add_generation_prompt %}
13
+ {{ '<|assistant|>' }}
14
+ {% endif %}
15
+ {% endfor %}
arc_100m_16b/checkpoint-80000/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 2048,
15
+ "max_position_embeddings": 4096,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "num_key_value_heads": 4,
21
+ "pad_token_id": 2,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-06,
24
+ "rope_scaling": null,
25
+ "rope_theta": 10000.0,
26
+ "tie_word_embeddings": false,
27
+ "transformers_version": "4.57.1",
28
+ "use_cache": false,
29
+ "vocab_size": 32000
30
+ }
arc_100m_16b/checkpoint-80000/generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 2,
6
+ "transformers_version": "4.57.1",
7
+ "use_cache": false
8
+ }
arc_100m_16b/checkpoint-80000/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
arc_100m_16b/checkpoint-80000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
arc_100m_16b/checkpoint-80000/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 2048,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
arc_100m_16b/final-model/chat_template.jinja ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% for message in messages %}
2
+ {% if message['role'] == 'user' %}
3
+ {{ '<|user|>
4
+ ' + message['content'] + eos_token }}
5
+ {% elif message['role'] == 'system' %}
6
+ {{ '<|system|>
7
+ ' + message['content'] + eos_token }}
8
+ {% elif message['role'] == 'assistant' %}
9
+ {{ '<|assistant|>
10
+ ' + message['content'] + eos_token }}
11
+ {% endif %}
12
+ {% if loop.last and add_generation_prompt %}
13
+ {{ '<|assistant|>' }}
14
+ {% endif %}
15
+ {% endfor %}
arc_100m_16b/final-model/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 2048,
15
+ "max_position_embeddings": 4096,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "num_key_value_heads": 4,
21
+ "pad_token_id": 2,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-06,
24
+ "rope_scaling": null,
25
+ "rope_theta": 10000.0,
26
+ "tie_word_embeddings": false,
27
+ "transformers_version": "4.57.1",
28
+ "use_cache": false,
29
+ "vocab_size": 32000
30
+ }
arc_100m_16b/final-model/generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 2,
6
+ "transformers_version": "4.57.1",
7
+ "use_cache": false
8
+ }
arc_100m_16b/final-model/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
arc_100m_16b/final-model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
arc_100m_16b/final-model/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 2048,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
pre.py ADDED
@@ -0,0 +1,345 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import math
3
+ import time
4
+ import random
5
+ from itertools import islice
6
+
7
+ import numpy as np
8
+ import torch
9
+ from torch.cuda.amp import GradScaler, autocast
10
+ from datasets import load_dataset
11
+ from transformers import (
12
+ AutoTokenizer,
13
+ LlamaConfig,
14
+ LlamaForCausalLM,
15
+ get_cosine_schedule_with_warmup,
16
+ )
17
+ from tqdm import tqdm
18
+ import matplotlib.pyplot as plt
19
+
20
+ HF_TOKEN = os.environ.get("HF_TOKEN")
21
+ if not HF_TOKEN:
22
+ raise ValueError("HF_TOKEN environment variable must be set")
23
+
24
+ RAW_DATASET_NAME = "ThomasTheMaker/Arc-Corpus"
25
+ TOKENIZER_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
26
+ MAX_DATASET_ROWS = 9600_000
27
+
28
+ OUTPUT_DIR = "output_arc_lm_100m"
29
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
30
+
31
+ BLOCK_SIZE = 4096
32
+ BATCH_SIZE = 24
33
+ GRAD_ACCUM_STEPS = 2
34
+ NUM_EPOCHS = 1
35
+ LEARNING_RATE = 3.0e-4
36
+ WEIGHT_DECAY = 0.1
37
+ WARMUP_RATIO = 0.01
38
+ GRAD_CLIP = 1.0
39
+ LOG_EVERY = 50
40
+ SAVE_EVERY = 5_000
41
+ RANDOM_SEED = 42
42
+
43
+ random.seed(RANDOM_SEED)
44
+ np.random.seed(RANDOM_SEED)
45
+ torch.manual_seed(RANDOM_SEED)
46
+ torch.cuda.manual_seed_all(RANDOM_SEED)
47
+
48
+ print("📦 Loading dataset stream...")
49
+ stream_ds = load_dataset(
50
+ RAW_DATASET_NAME,
51
+ split="train",
52
+ streaming=True,
53
+ token=HF_TOKEN,
54
+ )
55
+
56
def ensure_text(example):
    """Return the example with a guaranteed non-empty, stripped "text" field.

    Missing, None, empty, or whitespace-only text is replaced by the
    placeholder "No content provided." so downstream tokenization never
    sees an empty document.
    """
    raw = example.get("text")
    stripped = raw.strip() if raw else ""
    return {"text": stripped or "No content provided."}
61
+
62
+ print("🔡 Loading tokenizer:", TOKENIZER_NAME)
63
+ tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, use_fast=True)
64
+
65
+ special_tokens = {
66
+ "bos_token": "<s>",
67
+ "eos_token": "</s>",
68
+ "unk_token": "<unk>",
69
+ "pad_token": "<pad>",
70
+ }
71
+
72
+ to_add = {k: v for k, v in special_tokens.items() if getattr(tokenizer, k, None) is None}
73
+ if to_add:
74
+ print("➕ Adding special tokens:", to_add)
75
+ tokenizer.add_special_tokens(to_add)
76
+
77
+ pad_id = tokenizer.pad_token_id
78
+ bos_id = tokenizer.bos_token_id
79
+ eos_id = tokenizer.eos_token_id
80
+
81
+ print(f"✅ Tokenizer vocab size: {len(tokenizer)}")
82
+ print(f" pad_id={pad_id}, bos_id={bos_id}, eos_id={eos_id}")
83
+ print()
84
+
85
+ formatted_stream = stream_ds.map(ensure_text)
86
+
87
+ print("📊 Estimating dataset size...")
88
+ sample_size = min(1000, MAX_DATASET_ROWS)
89
+ sample_tokens = 0
90
+
91
+ temp_stream = stream_ds.map(ensure_text)
92
+ for i, ex in enumerate(islice(temp_stream, sample_size)):
93
+ text = ex["text"]
94
+ ids = tokenizer(text, add_special_tokens=False)["input_ids"]
95
+ sample_tokens += len(ids) + 1
96
+
97
+ avg_tokens_per_doc = sample_tokens / sample_size
98
+ print(f" Sampled {sample_size} documents, avg {avg_tokens_per_doc:.1f} tokens/doc")
99
+
100
+ num_docs = MAX_DATASET_ROWS
101
+ estimated_tokens = int(num_docs * avg_tokens_per_doc)
102
+ print(f" Using first {num_docs:,} documents")
103
+ print(f" Estimated total tokens: {estimated_tokens:,}")
104
+
105
+ TOKENS_PER_STEP = BLOCK_SIZE * BATCH_SIZE * GRAD_ACCUM_STEPS
106
+ TOTAL_STEPS = (estimated_tokens * NUM_EPOCHS) // TOKENS_PER_STEP
107
+ print(f"📊 Training for {TOTAL_STEPS:,} steps ({NUM_EPOCHS} epoch(s))")
108
+ print(f" Tokens per step: {TOKENS_PER_STEP:,}")
109
+ print(f" Total tokens: {estimated_tokens * NUM_EPOCHS:,}")
110
+ print()
111
+
112
+ print()
113
+
114
+ peek = list(islice(stream_ds.map(ensure_text), 1))
115
+ print("🔎 Sample:")
116
+ print((peek[0]["text"] if peek else "<empty>")[:500])
117
+ print()
118
+
119
+ formatted_stream = stream_ds.map(ensure_text)
120
+
121
+ config = LlamaConfig(
122
+ vocab_size=len(tokenizer),
123
+ hidden_size=768,
124
+ intermediate_size=2048,
125
+ num_hidden_layers=12,
126
+ num_attention_heads=12,
127
+ num_key_value_heads=4,
128
+ max_position_embeddings=BLOCK_SIZE,
129
+ rms_norm_eps=1e-6,
130
+ initializer_range=0.02,
131
+ use_cache=False,
132
+ pad_token_id=pad_id,
133
+ bos_token_id=bos_id,
134
+ eos_token_id=eos_id,
135
+ tie_word_embeddings=False,
136
+ )
137
+
138
+ print("🧩 Building model...")
139
+ model = LlamaForCausalLM(config)
140
+ model.resize_token_embeddings(len(tokenizer))
141
+ model.gradient_checkpointing_enable()
142
+
143
+ device = "cuda" if torch.cuda.is_available() else "cpu"
144
+
145
+ if torch.cuda.is_available():
146
+ torch.backends.cuda.matmul.allow_tf32 = True
147
+ torch.backends.cudnn.allow_tf32 = True
148
+
149
+ use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
150
+ use_fp16 = torch.cuda.is_available() and (not use_bf16)
151
+
152
+ if use_bf16:
153
+ dtype = torch.bfloat16
154
+ elif use_fp16:
155
+ dtype = torch.float16
156
+ else:
157
+ dtype = torch.float32
158
+
159
+ model = model.to(device, dtype=dtype)
160
+
161
+ print(
162
+ f"✅ Model ready: {sum(p.numel() for p in model.parameters())/1e6:.1f}M params, "
163
+ f"dtype={dtype}, device={device}"
164
+ )
165
+ print()
166
+
167
+ def token_block_stream(hf_stream, tokenizer, block_size, eos_id):
168
+ buffer = []
169
+
170
+ for ex in hf_stream:
171
+ text = ex["text"]
172
+ ids = tokenizer(text, add_special_tokens=False)["input_ids"]
173
+ ids.append(eos_id)
174
+ buffer.extend(ids)
175
+
176
+ while len(buffer) >= block_size:
177
+ block = buffer[:block_size]
178
+ buffer = buffer[block_size:]
179
+ yield torch.tensor(block, dtype=torch.long)
180
+
181
+ optimizer = torch.optim.AdamW(
182
+ model.parameters(),
183
+ lr=LEARNING_RATE,
184
+ weight_decay=WEIGHT_DECAY,
185
+ betas=(0.9, 0.95),
186
+ )
187
+
188
+ num_warmup_steps = int(TOTAL_STEPS * WARMUP_RATIO)
189
+ scheduler = get_cosine_schedule_with_warmup(
190
+ optimizer,
191
+ num_warmup_steps=num_warmup_steps,
192
+ num_training_steps=TOTAL_STEPS,
193
+ )
194
+
195
+ scaler = GradScaler(enabled=use_fp16)
196
+
197
+ print("🚀 Starting pretraining...")
198
+ print(
199
+ f" BLOCK_SIZE={BLOCK_SIZE}, BATCH_SIZE={BATCH_SIZE}, "
200
+ f"GRAD_ACCUM_STEPS={GRAD_ACCUM_STEPS}, TOTAL_STEPS={TOTAL_STEPS}"
201
+ )
202
+ print(
203
+ f" Effective tokens/step ≈ {BLOCK_SIZE * BATCH_SIZE * GRAD_ACCUM_STEPS:,}"
204
+ )
205
+ print(f" Learning rate: {LEARNING_RATE}, Warmup steps: {num_warmup_steps}")
206
+ print()
207
+
208
+ global_step = 0
209
+ micro_step = 0
210
+ running_loss = 0.0
211
+ start_time = time.time()
212
+ window_start_time = time.time()
213
+ window_start_step = 0
214
+
215
+ loss_history = []
216
+ lr_history = []
217
+ throughput_history = []
218
+ step_history = []
219
+
220
def multi_epoch_stream(base_stream, num_epochs, max_rows):
    """Replay ``base_stream`` for ``num_epochs`` passes, capping each pass at ``max_rows`` items.

    NOTE(review): multi-epoch replay assumes ``base_stream`` is re-iterable
    (e.g. a list or an HF streaming dataset); a plain one-shot generator
    would be exhausted after the first epoch — confirm what callers pass.
    """
    for epoch in range(num_epochs):
        print(f"📚 Starting epoch {epoch + 1}/{num_epochs}")
        row_count = 0
        for item in islice(base_stream, max_rows):
            yield item
            row_count += 1
        print(f" Processed {row_count:,} rows in epoch {epoch + 1}")
230
+
231
+ formatted_stream_base = stream_ds.map(ensure_text)
232
+ multi_epoch_data = multi_epoch_stream(formatted_stream_base, NUM_EPOCHS, MAX_DATASET_ROWS)
233
+ block_iter = token_block_stream(multi_epoch_data, tokenizer, BLOCK_SIZE, eos_id)
234
+
235
+ model.train()
236
+
237
+ pbar = tqdm(total=TOTAL_STEPS, desc="Training", unit="step")
238
+
239
+ autocast_ctx = autocast(enabled=(use_bf16 or use_fp16), dtype=torch.bfloat16 if use_bf16 else torch.float16)
240
+ with autocast_ctx:
241
+ while global_step < TOTAL_STEPS:
242
+ blocks = []
243
+ for _ in range(BATCH_SIZE):
244
+ try:
245
+ block = next(block_iter)
246
+ blocks.append(block)
247
+ except StopIteration:
248
+ print(f"\n✅ Dataset exhausted after {global_step} steps")
249
+ break
250
+
251
+ if len(blocks) < BATCH_SIZE:
252
+ print(f" Completed training with partial batch of {len(blocks)} blocks")
253
+ break
254
+
255
+ input_ids = torch.stack(blocks).to(device)
256
+ attention_mask = torch.ones_like(input_ids, dtype=torch.long, device=device)
257
+ labels = input_ids.clone()
258
+
259
+ outputs = model(
260
+ input_ids=input_ids,
261
+ attention_mask=attention_mask,
262
+ labels=labels,
263
+ )
264
+ loss = outputs.loss / GRAD_ACCUM_STEPS
265
+
266
+ if use_fp16:
267
+ scaler.scale(loss).backward()
268
+ else:
269
+ loss.backward()
270
+
271
+ running_loss += loss.item()
272
+ micro_step += 1
273
+
274
+ if micro_step % GRAD_ACCUM_STEPS == 0:
275
+ if use_fp16:
276
+ scaler.unscale_(optimizer)
277
+
278
+ torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
279
+
280
+ if use_fp16:
281
+ scaler.step(optimizer)
282
+ scaler.update()
283
+ else:
284
+ optimizer.step()
285
+
286
+ optimizer.zero_grad(set_to_none=True)
287
+ scheduler.step()
288
+
289
+ global_step += 1
290
+ pbar.update(1)
291
+
292
+ if global_step % LOG_EVERY == 0:
293
+ avg_loss = running_loss / LOG_EVERY
294
+ current_lr = scheduler.get_last_lr()[0]
295
+
296
+ window_elapsed = time.time() - window_start_time
297
+ window_steps = global_step - window_start_step
298
+ tok_per_step = BLOCK_SIZE * BATCH_SIZE * GRAD_ACCUM_STEPS
299
+ window_tps = (tok_per_step * window_steps) / window_elapsed if window_elapsed > 0 else 0
300
+
301
+ total_elapsed = time.time() - start_time
302
+ total_tps = (tok_per_step * global_step) / total_elapsed if total_elapsed > 0 else 0
303
+
304
+ pbar.set_postfix({
305
+ "loss": f"{avg_loss:.4f}",
306
+ "lr": f"{current_lr:.2e}",
307
+ "tok/s": f"{int(window_tps):,}"
308
+ })
309
+
310
+ running_loss = 0.0
311
+ window_start_time = time.time()
312
+ window_start_step = global_step
313
+
314
+ if global_step % SAVE_EVERY == 0:
315
+ ckpt_dir = os.path.join(OUTPUT_DIR, f"checkpoint-{global_step}")
316
+ print(f"\n💾 Saving checkpoint to {ckpt_dir}")
317
+ os.makedirs(ckpt_dir, exist_ok=True)
318
+ model.save_pretrained(ckpt_dir)
319
+ tokenizer.save_pretrained(ckpt_dir)
320
+
321
+ torch.save({
322
+ 'global_step': global_step,
323
+ 'optimizer_state_dict': optimizer.state_dict(),
324
+ 'scheduler_state_dict': scheduler.state_dict(),
325
+ 'scaler_state_dict': scaler.state_dict() if use_fp16 else None,
326
+ }, os.path.join(ckpt_dir, "training_state.pt"))
327
+
328
+ pbar.close()
329
+
330
+ print("\n✅ Training complete!")
331
+ print("💾 Saving final model...")
332
+
333
+ final_dir = os.path.join(OUTPUT_DIR, "final-model")
334
+ os.makedirs(final_dir, exist_ok=True)
335
+ model.save_pretrained(final_dir)
336
+ tokenizer.save_pretrained(final_dir)
337
+
338
+ torch.save({
339
+ 'global_step': global_step,
340
+ 'optimizer_state_dict': optimizer.state_dict(),
341
+ 'scheduler_state_dict': scheduler.state_dict(),
342
+ 'scaler_state_dict': scaler.state_dict() if use_fp16 else None,
343
+ }, os.path.join(final_dir, "training_state.pt"))
344
+
345
+ print("🎉 Done!")
requirements.txt ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate==1.11.0
2
+ aiohappyeyeballs==2.6.1
3
+ aiohttp==3.13.2
4
+ aiosignal==1.4.0
5
+ anyio==4.11.0
6
+ async-timeout==5.0.1
7
+ attrs==25.4.0
8
+ certifi==2025.11.12
9
+ charset-normalizer==3.4.4
10
+ datasets==4.4.1
11
+ dill==0.4.0
12
+ evaluate==0.4.6
13
+ exceptiongroup==1.3.0
14
+ filelock==3.20.0
15
+ frozenlist==1.8.0
16
+ fsspec==2025.10.0
17
+ h11==0.16.0
18
+ hf-xet==1.2.0
19
+ httpcore==1.0.9
20
+ httpx==0.28.1
21
+ huggingface-hub==0.36.0
22
+ idna==3.11
23
+ Jinja2==3.1.6
24
+ MarkupSafe==3.0.3
25
+ mpmath==1.3.0
26
+ multidict==6.7.0
27
+ multiprocess==0.70.18
28
+ networkx==3.4.2
29
+ numpy==2.2.6
30
+ nvidia-cublas-cu12==12.8.4.1
31
+ nvidia-cuda-cupti-cu12==12.8.90
32
+ nvidia-cuda-nvrtc-cu12==12.8.93
33
+ nvidia-cuda-runtime-cu12==12.8.90
34
+ nvidia-cudnn-cu12==9.10.2.21
35
+ nvidia-cufft-cu12==11.3.3.83
36
+ nvidia-cufile-cu12==1.13.1.3
37
+ nvidia-curand-cu12==10.3.9.90
38
+ nvidia-cusolver-cu12==11.7.3.90
39
+ nvidia-cusparse-cu12==12.5.8.93
40
+ nvidia-cusparselt-cu12==0.7.1
41
+ nvidia-nccl-cu12==2.27.5
42
+ nvidia-nvjitlink-cu12==12.8.93
43
+ nvidia-nvshmem-cu12==3.3.20
44
+ nvidia-nvtx-cu12==12.8.90
45
+ packaging==25.0
46
+ pandas==2.3.3
47
+ propcache==0.4.1
48
+ psutil==7.1.3
49
+ pyarrow==22.0.0
50
+ python-dateutil==2.9.0.post0
51
+ pytz==2025.2
52
+ PyYAML==6.0.3
53
+ regex==2025.11.3
54
+ requests==2.32.5
55
+ safetensors==0.6.2
56
+ six==1.17.0
57
+ sniffio==1.3.1
58
+ sympy==1.14.0
59
+ tokenizers==0.22.1
60
+ torch==2.9.0
61
+ tqdm==4.67.1
62
+ transformers==4.57.1
63
+ triton==3.5.0
64
+ trl==0.25.0
65
+ typing_extensions==4.15.0
66
+ tzdata==2025.2
67
+ urllib3==2.5.0
68
+ xxhash==3.6.0
69
+ yarl==1.22.0
70
+ accelerate==1.11.0
71
+ aiohappyeyeballs==2.6.1
72
+ aiohttp==3.13.2
73
+ aiosignal==1.4.0
74
+ anyio==4.11.0
75
+ async-timeout==5.0.1
76
+ attrs==25.4.0
77
+ certifi==2025.11.12
78
+ charset-normalizer==3.4.4
79
+ contourpy==1.3.2
80
+ cycler==0.12.1
81
+ datasets==4.4.1
82
+ dill==0.4.0
83
+ evaluate==0.4.6
84
+ exceptiongroup==1.3.0
85
+ filelock==3.20.0
86
+ fonttools==4.60.1
87
+ frozenlist==1.8.0
88
+ fsspec==2025.10.0
89
+ h11==0.16.0
90
+ hf-xet==1.2.0
91
+ httpcore==1.0.9
92
+ httpx==0.28.1
93
+ huggingface-hub==0.36.0
94
+ idna==3.11
95
+ Jinja2==3.1.6
96
+ kiwisolver==1.4.9
97
+ MarkupSafe==3.0.3
98
+ matplotlib==3.10.7
99
+ mpmath==1.3.0
100
+ multidict==6.7.0
101
+ multiprocess==0.70.18
102
+ networkx==3.4.2
103
+ numpy==2.2.6
104
+ nvidia-cublas-cu12==12.8.4.1
105
+ nvidia-cuda-cupti-cu12==12.8.90
106
+ nvidia-cuda-nvrtc-cu12==12.8.93
107
+ nvidia-cuda-runtime-cu12==12.8.90
108
+ nvidia-cudnn-cu12==9.10.2.21
109
+ nvidia-cufft-cu12==11.3.3.83
110
+ nvidia-cufile-cu12==1.13.1.3
111
+ nvidia-curand-cu12==10.3.9.90
112
+ nvidia-cusolver-cu12==11.7.3.90
113
+ nvidia-cusparse-cu12==12.5.8.93
114
+ nvidia-cusparselt-cu12==0.7.1
115
+ nvidia-nccl-cu12==2.27.5
116
+ nvidia-nvjitlink-cu12==12.8.93
117
+ nvidia-nvshmem-cu12==3.3.20
118
+ nvidia-nvtx-cu12==12.8.90
119
+ packaging==25.0
120
+ pandas==2.3.3
121
+ pillow==12.0.0
122
+ propcache==0.4.1
123
+ psutil==7.1.3
124
+ pyarrow==22.0.0
125
+ pyparsing==3.2.5
126
+ python-dateutil==2.9.0.post0
127
+ pytz==2025.2
128
+ PyYAML==6.0.3
129
+ regex==2025.11.3
130
+ requests==2.32.5
131
+ safetensors==0.6.2
132
+ six==1.17.0
133
+ sniffio==1.3.1
134
+ sympy==1.14.0
135
+ tokenizers==0.22.1
136
+ torch==2.9.0
137
+ tqdm==4.67.1
138
+ transformers==4.57.1
139
+ triton==3.5.0
140
+ trl==0.25.0
141
+ typing_extensions==4.15.0
142
+ tzdata==2025.2
143
+ urllib3==2.5.0
144
+ xxhash==3.6.0
145
+ yarl==1.22.0