ThomasTheMaker committed on
Commit
0087b06
·
verified ·
1 Parent(s): 9c3c7a7

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. README.md +14 -0
  2. arc_100m_16b/checkpoint-15000/chat_template.jinja +15 -0
  3. arc_100m_16b/checkpoint-15000/config.json +30 -0
  4. arc_100m_16b/checkpoint-15000/generation_config.json +8 -0
  5. arc_100m_16b/checkpoint-15000/special_tokens_map.json +30 -0
  6. arc_100m_16b/checkpoint-15000/tokenizer.json +0 -0
  7. arc_100m_16b/checkpoint-15000/tokenizer_config.json +43 -0
  8. arc_100m_16b/checkpoint-30000/chat_template.jinja +15 -0
  9. arc_100m_16b/checkpoint-30000/config.json +30 -0
  10. arc_100m_16b/checkpoint-30000/generation_config.json +8 -0
  11. arc_100m_16b/checkpoint-30000/special_tokens_map.json +30 -0
  12. arc_100m_16b/checkpoint-30000/tokenizer.json +0 -0
  13. arc_100m_16b/checkpoint-30000/tokenizer_config.json +43 -0
  14. arc_100m_16b/checkpoint-35000/chat_template.jinja +15 -0
  15. arc_100m_16b/checkpoint-35000/config.json +30 -0
  16. arc_100m_16b/checkpoint-35000/generation_config.json +8 -0
  17. arc_100m_16b/checkpoint-35000/special_tokens_map.json +30 -0
  18. arc_100m_16b/checkpoint-35000/tokenizer.json +0 -0
  19. arc_100m_16b/checkpoint-35000/tokenizer_config.json +43 -0
  20. arc_100m_16b/checkpoint-40000/chat_template.jinja +15 -0
  21. arc_100m_16b/checkpoint-40000/config.json +30 -0
  22. arc_100m_16b/checkpoint-40000/generation_config.json +8 -0
  23. arc_100m_16b/checkpoint-40000/special_tokens_map.json +30 -0
  24. arc_100m_16b/checkpoint-40000/tokenizer.json +0 -0
  25. arc_100m_16b/checkpoint-45000/chat_template.jinja +15 -0
  26. arc_100m_16b/checkpoint-45000/config.json +30 -0
  27. arc_100m_16b/checkpoint-45000/generation_config.json +8 -0
  28. arc_100m_16b/checkpoint-45000/special_tokens_map.json +30 -0
  29. arc_100m_16b/checkpoint-45000/tokenizer.json +0 -0
  30. arc_100m_16b/checkpoint-45000/tokenizer_config.json +43 -0
  31. arc_100m_16b/checkpoint-75000/chat_template.jinja +15 -0
  32. arc_100m_16b/checkpoint-75000/config.json +30 -0
  33. arc_100m_16b/checkpoint-75000/generation_config.json +8 -0
  34. arc_100m_16b/checkpoint-75000/special_tokens_map.json +30 -0
  35. arc_100m_16b/checkpoint-75000/tokenizer.json +0 -0
  36. arc_100m_16b/checkpoint-75000/tokenizer_config.json +43 -0
  37. arc_100m_16b/checkpoint-80000/chat_template.jinja +15 -0
  38. arc_100m_16b/checkpoint-80000/config.json +30 -0
  39. arc_100m_16b/checkpoint-80000/generation_config.json +8 -0
  40. arc_100m_16b/checkpoint-80000/special_tokens_map.json +30 -0
  41. arc_100m_16b/checkpoint-80000/tokenizer.json +0 -0
  42. arc_100m_16b/checkpoint-80000/tokenizer_config.json +43 -0
  43. arc_100m_16b/final-model/chat_template.jinja +15 -0
  44. arc_100m_16b/final-model/config.json +30 -0
  45. arc_100m_16b/final-model/generation_config.json +8 -0
  46. arc_100m_16b/final-model/special_tokens_map.json +30 -0
  47. arc_100m_16b/final-model/tokenizer.json +0 -0
  48. arc_100m_16b/final-model/tokenizer_config.json +43 -0
  49. pre.py +345 -0
  50. requirements.txt +145 -0
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Pretraining arc lm
2
+
3
+ git clone https://github.com/huggingface/nanotron
4
+ python3 -m venv venv
5
+ source venv/bin/activate
6
+ python -m pip install --upgrade pip
7
+ cd nanotron
8
+ pip install -e .
9
+ pip install datasets transformers datatrove[io] numba wandb
10
+ pip install wheel ninja triton flash-attn --no-build-isolation
11
+ huggingface-cli login
12
+ wandb login
13
+
14
+ cd ..
arc_100m_16b/checkpoint-15000/chat_template.jinja ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% for message in messages %}
2
+ {% if message['role'] == 'user' %}
3
+ {{ '<|user|>
4
+ ' + message['content'] + eos_token }}
5
+ {% elif message['role'] == 'system' %}
6
+ {{ '<|system|>
7
+ ' + message['content'] + eos_token }}
8
+ {% elif message['role'] == 'assistant' %}
9
+ {{ '<|assistant|>
10
+ ' + message['content'] + eos_token }}
11
+ {% endif %}
12
+ {% if loop.last and add_generation_prompt %}
13
+ {{ '<|assistant|>' }}
14
+ {% endif %}
15
+ {% endfor %}
arc_100m_16b/checkpoint-15000/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 2048,
15
+ "max_position_embeddings": 4096,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "num_key_value_heads": 4,
21
+ "pad_token_id": 2,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-06,
24
+ "rope_scaling": null,
25
+ "rope_theta": 10000.0,
26
+ "tie_word_embeddings": false,
27
+ "transformers_version": "4.57.1",
28
+ "use_cache": false,
29
+ "vocab_size": 32000
30
+ }
arc_100m_16b/checkpoint-15000/generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 2,
6
+ "transformers_version": "4.57.1",
7
+ "use_cache": false
8
+ }
arc_100m_16b/checkpoint-15000/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
arc_100m_16b/checkpoint-15000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
arc_100m_16b/checkpoint-15000/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 2048,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
arc_100m_16b/checkpoint-30000/chat_template.jinja ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% for message in messages %}
2
+ {% if message['role'] == 'user' %}
3
+ {{ '<|user|>
4
+ ' + message['content'] + eos_token }}
5
+ {% elif message['role'] == 'system' %}
6
+ {{ '<|system|>
7
+ ' + message['content'] + eos_token }}
8
+ {% elif message['role'] == 'assistant' %}
9
+ {{ '<|assistant|>
10
+ ' + message['content'] + eos_token }}
11
+ {% endif %}
12
+ {% if loop.last and add_generation_prompt %}
13
+ {{ '<|assistant|>' }}
14
+ {% endif %}
15
+ {% endfor %}
arc_100m_16b/checkpoint-30000/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 2048,
15
+ "max_position_embeddings": 4096,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "num_key_value_heads": 4,
21
+ "pad_token_id": 2,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-06,
24
+ "rope_scaling": null,
25
+ "rope_theta": 10000.0,
26
+ "tie_word_embeddings": false,
27
+ "transformers_version": "4.57.1",
28
+ "use_cache": false,
29
+ "vocab_size": 32000
30
+ }
arc_100m_16b/checkpoint-30000/generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 2,
6
+ "transformers_version": "4.57.1",
7
+ "use_cache": false
8
+ }
arc_100m_16b/checkpoint-30000/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
arc_100m_16b/checkpoint-30000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
arc_100m_16b/checkpoint-30000/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 2048,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
arc_100m_16b/checkpoint-35000/chat_template.jinja ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% for message in messages %}
2
+ {% if message['role'] == 'user' %}
3
+ {{ '<|user|>
4
+ ' + message['content'] + eos_token }}
5
+ {% elif message['role'] == 'system' %}
6
+ {{ '<|system|>
7
+ ' + message['content'] + eos_token }}
8
+ {% elif message['role'] == 'assistant' %}
9
+ {{ '<|assistant|>
10
+ ' + message['content'] + eos_token }}
11
+ {% endif %}
12
+ {% if loop.last and add_generation_prompt %}
13
+ {{ '<|assistant|>' }}
14
+ {% endif %}
15
+ {% endfor %}
arc_100m_16b/checkpoint-35000/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 2048,
15
+ "max_position_embeddings": 4096,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "num_key_value_heads": 4,
21
+ "pad_token_id": 2,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-06,
24
+ "rope_scaling": null,
25
+ "rope_theta": 10000.0,
26
+ "tie_word_embeddings": false,
27
+ "transformers_version": "4.57.1",
28
+ "use_cache": false,
29
+ "vocab_size": 32000
30
+ }
arc_100m_16b/checkpoint-35000/generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 2,
6
+ "transformers_version": "4.57.1",
7
+ "use_cache": false
8
+ }
arc_100m_16b/checkpoint-35000/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
arc_100m_16b/checkpoint-35000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
arc_100m_16b/checkpoint-35000/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 2048,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
arc_100m_16b/checkpoint-40000/chat_template.jinja ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% for message in messages %}
2
+ {% if message['role'] == 'user' %}
3
+ {{ '<|user|>
4
+ ' + message['content'] + eos_token }}
5
+ {% elif message['role'] == 'system' %}
6
+ {{ '<|system|>
7
+ ' + message['content'] + eos_token }}
8
+ {% elif message['role'] == 'assistant' %}
9
+ {{ '<|assistant|>
10
+ ' + message['content'] + eos_token }}
11
+ {% endif %}
12
+ {% if loop.last and add_generation_prompt %}
13
+ {{ '<|assistant|>' }}
14
+ {% endif %}
15
+ {% endfor %}
arc_100m_16b/checkpoint-40000/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 2048,
15
+ "max_position_embeddings": 4096,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "num_key_value_heads": 4,
21
+ "pad_token_id": 2,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-06,
24
+ "rope_scaling": null,
25
+ "rope_theta": 10000.0,
26
+ "tie_word_embeddings": false,
27
+ "transformers_version": "4.57.1",
28
+ "use_cache": false,
29
+ "vocab_size": 32000
30
+ }
arc_100m_16b/checkpoint-40000/generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 2,
6
+ "transformers_version": "4.57.1",
7
+ "use_cache": false
8
+ }
arc_100m_16b/checkpoint-40000/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
arc_100m_16b/checkpoint-40000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
arc_100m_16b/checkpoint-45000/chat_template.jinja ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% for message in messages %}
2
+ {% if message['role'] == 'user' %}
3
+ {{ '<|user|>
4
+ ' + message['content'] + eos_token }}
5
+ {% elif message['role'] == 'system' %}
6
+ {{ '<|system|>
7
+ ' + message['content'] + eos_token }}
8
+ {% elif message['role'] == 'assistant' %}
9
+ {{ '<|assistant|>
10
+ ' + message['content'] + eos_token }}
11
+ {% endif %}
12
+ {% if loop.last and add_generation_prompt %}
13
+ {{ '<|assistant|>' }}
14
+ {% endif %}
15
+ {% endfor %}
arc_100m_16b/checkpoint-45000/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 2048,
15
+ "max_position_embeddings": 4096,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "num_key_value_heads": 4,
21
+ "pad_token_id": 2,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-06,
24
+ "rope_scaling": null,
25
+ "rope_theta": 10000.0,
26
+ "tie_word_embeddings": false,
27
+ "transformers_version": "4.57.1",
28
+ "use_cache": false,
29
+ "vocab_size": 32000
30
+ }
arc_100m_16b/checkpoint-45000/generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 2,
6
+ "transformers_version": "4.57.1",
7
+ "use_cache": false
8
+ }
arc_100m_16b/checkpoint-45000/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
arc_100m_16b/checkpoint-45000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
arc_100m_16b/checkpoint-45000/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 2048,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
arc_100m_16b/checkpoint-75000/chat_template.jinja ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% for message in messages %}
2
+ {% if message['role'] == 'user' %}
3
+ {{ '<|user|>
4
+ ' + message['content'] + eos_token }}
5
+ {% elif message['role'] == 'system' %}
6
+ {{ '<|system|>
7
+ ' + message['content'] + eos_token }}
8
+ {% elif message['role'] == 'assistant' %}
9
+ {{ '<|assistant|>
10
+ ' + message['content'] + eos_token }}
11
+ {% endif %}
12
+ {% if loop.last and add_generation_prompt %}
13
+ {{ '<|assistant|>' }}
14
+ {% endif %}
15
+ {% endfor %}
arc_100m_16b/checkpoint-75000/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 2048,
15
+ "max_position_embeddings": 4096,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "num_key_value_heads": 4,
21
+ "pad_token_id": 2,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-06,
24
+ "rope_scaling": null,
25
+ "rope_theta": 10000.0,
26
+ "tie_word_embeddings": false,
27
+ "transformers_version": "4.57.1",
28
+ "use_cache": false,
29
+ "vocab_size": 32000
30
+ }
arc_100m_16b/checkpoint-75000/generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 2,
6
+ "transformers_version": "4.57.1",
7
+ "use_cache": false
8
+ }
arc_100m_16b/checkpoint-75000/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
arc_100m_16b/checkpoint-75000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
arc_100m_16b/checkpoint-75000/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 2048,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
arc_100m_16b/checkpoint-80000/chat_template.jinja ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% for message in messages %}
2
+ {% if message['role'] == 'user' %}
3
+ {{ '<|user|>
4
+ ' + message['content'] + eos_token }}
5
+ {% elif message['role'] == 'system' %}
6
+ {{ '<|system|>
7
+ ' + message['content'] + eos_token }}
8
+ {% elif message['role'] == 'assistant' %}
9
+ {{ '<|assistant|>
10
+ ' + message['content'] + eos_token }}
11
+ {% endif %}
12
+ {% if loop.last and add_generation_prompt %}
13
+ {{ '<|assistant|>' }}
14
+ {% endif %}
15
+ {% endfor %}
arc_100m_16b/checkpoint-80000/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 2048,
15
+ "max_position_embeddings": 4096,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "num_key_value_heads": 4,
21
+ "pad_token_id": 2,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-06,
24
+ "rope_scaling": null,
25
+ "rope_theta": 10000.0,
26
+ "tie_word_embeddings": false,
27
+ "transformers_version": "4.57.1",
28
+ "use_cache": false,
29
+ "vocab_size": 32000
30
+ }
arc_100m_16b/checkpoint-80000/generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 2,
6
+ "transformers_version": "4.57.1",
7
+ "use_cache": false
8
+ }
arc_100m_16b/checkpoint-80000/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
arc_100m_16b/checkpoint-80000/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
arc_100m_16b/checkpoint-80000/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 2048,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
arc_100m_16b/final-model/chat_template.jinja ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% for message in messages %}
2
+ {% if message['role'] == 'user' %}
3
+ {{ '<|user|>
4
+ ' + message['content'] + eos_token }}
5
+ {% elif message['role'] == 'system' %}
6
+ {{ '<|system|>
7
+ ' + message['content'] + eos_token }}
8
+ {% elif message['role'] == 'assistant' %}
9
+ {{ '<|assistant|>
10
+ ' + message['content'] + eos_token }}
11
+ {% endif %}
12
+ {% if loop.last and add_generation_prompt %}
13
+ {{ '<|assistant|>' }}
14
+ {% endif %}
15
+ {% endfor %}
arc_100m_16b/final-model/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 2,
10
+ "head_dim": 64,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 2048,
15
+ "max_position_embeddings": 4096,
16
+ "mlp_bias": false,
17
+ "model_type": "llama",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "num_key_value_heads": 4,
21
+ "pad_token_id": 2,
22
+ "pretraining_tp": 1,
23
+ "rms_norm_eps": 1e-06,
24
+ "rope_scaling": null,
25
+ "rope_theta": 10000.0,
26
+ "tie_word_embeddings": false,
27
+ "transformers_version": "4.57.1",
28
+ "use_cache": false,
29
+ "vocab_size": 32000
30
+ }
arc_100m_16b/final-model/generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 2,
6
+ "transformers_version": "4.57.1",
7
+ "use_cache": false
8
+ }
arc_100m_16b/final-model/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<unk>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
arc_100m_16b/final-model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
arc_100m_16b/final-model/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 2048,
37
+ "pad_token": "</s>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
pre.py ADDED
@@ -0,0 +1,345 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import math
3
+ import time
4
+ import random
5
+ from itertools import islice
6
+
7
+ import numpy as np
8
+ import torch
9
+ from torch.cuda.amp import GradScaler, autocast
10
+ from datasets import load_dataset
11
+ from transformers import (
12
+ AutoTokenizer,
13
+ LlamaConfig,
14
+ LlamaForCausalLM,
15
+ get_cosine_schedule_with_warmup,
16
+ )
17
+ from tqdm import tqdm
18
+ import matplotlib.pyplot as plt
19
+
20
+ HF_TOKEN = os.environ.get("HF_TOKEN")
21
+ if not HF_TOKEN:
22
+ raise ValueError("HF_TOKEN environment variable must be set")
23
+
24
+ RAW_DATASET_NAME = "ThomasTheMaker/Arc-Corpus"
25
+ TOKENIZER_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
26
+ MAX_DATASET_ROWS = 9600_000
27
+
28
+ OUTPUT_DIR = "output_arc_lm_100m"
29
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
30
+
31
+ BLOCK_SIZE = 4096
32
+ BATCH_SIZE = 24
33
+ GRAD_ACCUM_STEPS = 2
34
+ NUM_EPOCHS = 1
35
+ LEARNING_RATE = 3.0e-4
36
+ WEIGHT_DECAY = 0.1
37
+ WARMUP_RATIO = 0.01
38
+ GRAD_CLIP = 1.0
39
+ LOG_EVERY = 50
40
+ SAVE_EVERY = 5_000
41
+ RANDOM_SEED = 42
42
+
43
+ random.seed(RANDOM_SEED)
44
+ np.random.seed(RANDOM_SEED)
45
+ torch.manual_seed(RANDOM_SEED)
46
+ torch.cuda.manual_seed_all(RANDOM_SEED)
47
+
48
+ print("📦 Loading dataset stream...")
49
+ stream_ds = load_dataset(
50
+ RAW_DATASET_NAME,
51
+ split="train",
52
+ streaming=True,
53
+ token=HF_TOKEN,
54
+ )
55
+
56
def ensure_text(example):
    """Return the example with a guaranteed non-empty, stripped "text" field.

    Missing, None, empty, or whitespace-only text is replaced by the
    placeholder "No content provided." so downstream tokenization never
    sees an empty document.
    """
    raw = example.get("text")
    stripped = raw.strip() if raw else ""
    return {"text": stripped or "No content provided."}
61
+
62
+ print("🔡 Loading tokenizer:", TOKENIZER_NAME)
63
+ tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, use_fast=True)
64
+
65
+ special_tokens = {
66
+ "bos_token": "<s>",
67
+ "eos_token": "</s>",
68
+ "unk_token": "<unk>",
69
+ "pad_token": "<pad>",
70
+ }
71
+
72
+ to_add = {k: v for k, v in special_tokens.items() if getattr(tokenizer, k, None) is None}
73
+ if to_add:
74
+ print("➕ Adding special tokens:", to_add)
75
+ tokenizer.add_special_tokens(to_add)
76
+
77
+ pad_id = tokenizer.pad_token_id
78
+ bos_id = tokenizer.bos_token_id
79
+ eos_id = tokenizer.eos_token_id
80
+
81
+ print(f"✅ Tokenizer vocab size: {len(tokenizer)}")
82
+ print(f" pad_id={pad_id}, bos_id={bos_id}, eos_id={eos_id}")
83
+ print()
84
+
85
+ formatted_stream = stream_ds.map(ensure_text)
86
+
87
+ print("📊 Estimating dataset size...")
88
+ sample_size = min(1000, MAX_DATASET_ROWS)
89
+ sample_tokens = 0
90
+
91
+ temp_stream = stream_ds.map(ensure_text)
92
+ for i, ex in enumerate(islice(temp_stream, sample_size)):
93
+ text = ex["text"]
94
+ ids = tokenizer(text, add_special_tokens=False)["input_ids"]
95
+ sample_tokens += len(ids) + 1
96
+
97
+ avg_tokens_per_doc = sample_tokens / sample_size
98
+ print(f" Sampled {sample_size} documents, avg {avg_tokens_per_doc:.1f} tokens/doc")
99
+
100
+ num_docs = MAX_DATASET_ROWS
101
+ estimated_tokens = int(num_docs * avg_tokens_per_doc)
102
+ print(f" Using first {num_docs:,} documents")
103
+ print(f" Estimated total tokens: {estimated_tokens:,}")
104
+
105
+ TOKENS_PER_STEP = BLOCK_SIZE * BATCH_SIZE * GRAD_ACCUM_STEPS
106
+ TOTAL_STEPS = (estimated_tokens * NUM_EPOCHS) // TOKENS_PER_STEP
107
+ print(f"📊 Training for {TOTAL_STEPS:,} steps ({NUM_EPOCHS} epoch(s))")
108
+ print(f" Tokens per step: {TOKENS_PER_STEP:,}")
109
+ print(f" Total tokens: {estimated_tokens * NUM_EPOCHS:,}")
110
+ print()
111
+
112
+ print()
113
+
114
+ peek = list(islice(stream_ds.map(ensure_text), 1))
115
+ print("🔎 Sample:")
116
+ print((peek[0]["text"] if peek else "<empty>")[:500])
117
+ print()
118
+
119
+ formatted_stream = stream_ds.map(ensure_text)
120
+
121
+ config = LlamaConfig(
122
+ vocab_size=len(tokenizer),
123
+ hidden_size=768,
124
+ intermediate_size=2048,
125
+ num_hidden_layers=12,
126
+ num_attention_heads=12,
127
+ num_key_value_heads=4,
128
+ max_position_embeddings=BLOCK_SIZE,
129
+ rms_norm_eps=1e-6,
130
+ initializer_range=0.02,
131
+ use_cache=False,
132
+ pad_token_id=pad_id,
133
+ bos_token_id=bos_id,
134
+ eos_token_id=eos_id,
135
+ tie_word_embeddings=False,
136
+ )
137
+
138
+ print("🧩 Building model...")
139
+ model = LlamaForCausalLM(config)
140
+ model.resize_token_embeddings(len(tokenizer))
141
+ model.gradient_checkpointing_enable()
142
+
143
+ device = "cuda" if torch.cuda.is_available() else "cpu"
144
+
145
+ if torch.cuda.is_available():
146
+ torch.backends.cuda.matmul.allow_tf32 = True
147
+ torch.backends.cudnn.allow_tf32 = True
148
+
149
+ use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
150
+ use_fp16 = torch.cuda.is_available() and (not use_bf16)
151
+
152
+ if use_bf16:
153
+ dtype = torch.bfloat16
154
+ elif use_fp16:
155
+ dtype = torch.float16
156
+ else:
157
+ dtype = torch.float32
158
+
159
+ model = model.to(device, dtype=dtype)
160
+
161
+ print(
162
+ f"✅ Model ready: {sum(p.numel() for p in model.parameters())/1e6:.1f}M params, "
163
+ f"dtype={dtype}, device={device}"
164
+ )
165
+ print()
166
+
167
+ def token_block_stream(hf_stream, tokenizer, block_size, eos_id):
168
+ buffer = []
169
+
170
+ for ex in hf_stream:
171
+ text = ex["text"]
172
+ ids = tokenizer(text, add_special_tokens=False)["input_ids"]
173
+ ids.append(eos_id)
174
+ buffer.extend(ids)
175
+
176
+ while len(buffer) >= block_size:
177
+ block = buffer[:block_size]
178
+ buffer = buffer[block_size:]
179
+ yield torch.tensor(block, dtype=torch.long)
180
+
181
+ optimizer = torch.optim.AdamW(
182
+ model.parameters(),
183
+ lr=LEARNING_RATE,
184
+ weight_decay=WEIGHT_DECAY,
185
+ betas=(0.9, 0.95),
186
+ )
187
+
188
+ num_warmup_steps = int(TOTAL_STEPS * WARMUP_RATIO)
189
+ scheduler = get_cosine_schedule_with_warmup(
190
+ optimizer,
191
+ num_warmup_steps=num_warmup_steps,
192
+ num_training_steps=TOTAL_STEPS,
193
+ )
194
+
195
+ scaler = GradScaler(enabled=use_fp16)
196
+
197
+ print("🚀 Starting pretraining...")
198
+ print(
199
+ f" BLOCK_SIZE={BLOCK_SIZE}, BATCH_SIZE={BATCH_SIZE}, "
200
+ f"GRAD_ACCUM_STEPS={GRAD_ACCUM_STEPS}, TOTAL_STEPS={TOTAL_STEPS}"
201
+ )
202
+ print(
203
+ f" Effective tokens/step ≈ {BLOCK_SIZE * BATCH_SIZE * GRAD_ACCUM_STEPS:,}"
204
+ )
205
+ print(f" Learning rate: {LEARNING_RATE}, Warmup steps: {num_warmup_steps}")
206
+ print()
207
+
208
+ global_step = 0
209
+ micro_step = 0
210
+ running_loss = 0.0
211
+ start_time = time.time()
212
+ window_start_time = time.time()
213
+ window_start_step = 0
214
+
215
+ loss_history = []
216
+ lr_history = []
217
+ throughput_history = []
218
+ step_history = []
219
+
220
def multi_epoch_stream(base_stream, num_epochs, max_rows):
    """Replay ``base_stream`` for ``num_epochs`` passes, capping each pass at ``max_rows`` items.

    NOTE(review): multi-epoch replay assumes ``base_stream`` is re-iterable
    (e.g. a list or an HF streaming dataset); a plain one-shot generator
    would be exhausted after the first epoch — confirm what callers pass.
    """
    for epoch in range(num_epochs):
        print(f"📚 Starting epoch {epoch + 1}/{num_epochs}")
        row_count = 0
        for item in islice(base_stream, max_rows):
            yield item
            row_count += 1
        print(f" Processed {row_count:,} rows in epoch {epoch + 1}")
230
+
231
+ formatted_stream_base = stream_ds.map(ensure_text)
232
+ multi_epoch_data = multi_epoch_stream(formatted_stream_base, NUM_EPOCHS, MAX_DATASET_ROWS)
233
+ block_iter = token_block_stream(multi_epoch_data, tokenizer, BLOCK_SIZE, eos_id)
234
+
235
+ model.train()
236
+
237
+ pbar = tqdm(total=TOTAL_STEPS, desc="Training", unit="step")
238
+
239
+ autocast_ctx = autocast(enabled=(use_bf16 or use_fp16), dtype=torch.bfloat16 if use_bf16 else torch.float16)
240
+ with autocast_ctx:
241
+ while global_step < TOTAL_STEPS:
242
+ blocks = []
243
+ for _ in range(BATCH_SIZE):
244
+ try:
245
+ block = next(block_iter)
246
+ blocks.append(block)
247
+ except StopIteration:
248
+ print(f"\n✅ Dataset exhausted after {global_step} steps")
249
+ break
250
+
251
+ if len(blocks) < BATCH_SIZE:
252
+ print(f" Completed training with partial batch of {len(blocks)} blocks")
253
+ break
254
+
255
+ input_ids = torch.stack(blocks).to(device)
256
+ attention_mask = torch.ones_like(input_ids, dtype=torch.long, device=device)
257
+ labels = input_ids.clone()
258
+
259
+ outputs = model(
260
+ input_ids=input_ids,
261
+ attention_mask=attention_mask,
262
+ labels=labels,
263
+ )
264
+ loss = outputs.loss / GRAD_ACCUM_STEPS
265
+
266
+ if use_fp16:
267
+ scaler.scale(loss).backward()
268
+ else:
269
+ loss.backward()
270
+
271
+ running_loss += loss.item()
272
+ micro_step += 1
273
+
274
+ if micro_step % GRAD_ACCUM_STEPS == 0:
275
+ if use_fp16:
276
+ scaler.unscale_(optimizer)
277
+
278
+ torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
279
+
280
+ if use_fp16:
281
+ scaler.step(optimizer)
282
+ scaler.update()
283
+ else:
284
+ optimizer.step()
285
+
286
+ optimizer.zero_grad(set_to_none=True)
287
+ scheduler.step()
288
+
289
+ global_step += 1
290
+ pbar.update(1)
291
+
292
+ if global_step % LOG_EVERY == 0:
293
+ avg_loss = running_loss / LOG_EVERY
294
+ current_lr = scheduler.get_last_lr()[0]
295
+
296
+ window_elapsed = time.time() - window_start_time
297
+ window_steps = global_step - window_start_step
298
+ tok_per_step = BLOCK_SIZE * BATCH_SIZE * GRAD_ACCUM_STEPS
299
+ window_tps = (tok_per_step * window_steps) / window_elapsed if window_elapsed > 0 else 0
300
+
301
+ total_elapsed = time.time() - start_time
302
+ total_tps = (tok_per_step * global_step) / total_elapsed if total_elapsed > 0 else 0
303
+
304
+ pbar.set_postfix({
305
+ "loss": f"{avg_loss:.4f}",
306
+ "lr": f"{current_lr:.2e}",
307
+ "tok/s": f"{int(window_tps):,}"
308
+ })
309
+
310
+ running_loss = 0.0
311
+ window_start_time = time.time()
312
+ window_start_step = global_step
313
+
314
+ if global_step % SAVE_EVERY == 0:
315
+ ckpt_dir = os.path.join(OUTPUT_DIR, f"checkpoint-{global_step}")
316
+ print(f"\n💾 Saving checkpoint to {ckpt_dir}")
317
+ os.makedirs(ckpt_dir, exist_ok=True)
318
+ model.save_pretrained(ckpt_dir)
319
+ tokenizer.save_pretrained(ckpt_dir)
320
+
321
+ torch.save({
322
+ 'global_step': global_step,
323
+ 'optimizer_state_dict': optimizer.state_dict(),
324
+ 'scheduler_state_dict': scheduler.state_dict(),
325
+ 'scaler_state_dict': scaler.state_dict() if use_fp16 else None,
326
+ }, os.path.join(ckpt_dir, "training_state.pt"))
327
+
328
+ pbar.close()
329
+
330
+ print("\n✅ Training complete!")
331
+ print("💾 Saving final model...")
332
+
333
+ final_dir = os.path.join(OUTPUT_DIR, "final-model")
334
+ os.makedirs(final_dir, exist_ok=True)
335
+ model.save_pretrained(final_dir)
336
+ tokenizer.save_pretrained(final_dir)
337
+
338
+ torch.save({
339
+ 'global_step': global_step,
340
+ 'optimizer_state_dict': optimizer.state_dict(),
341
+ 'scheduler_state_dict': scheduler.state_dict(),
342
+ 'scaler_state_dict': scaler.state_dict() if use_fp16 else None,
343
+ }, os.path.join(final_dir, "training_state.pt"))
344
+
345
+ print("🎉 Done!")
requirements.txt ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate==1.11.0
2
+ aiohappyeyeballs==2.6.1
3
+ aiohttp==3.13.2
4
+ aiosignal==1.4.0
5
+ anyio==4.11.0
6
+ async-timeout==5.0.1
7
+ attrs==25.4.0
8
+ certifi==2025.11.12
9
+ charset-normalizer==3.4.4
10
+ datasets==4.4.1
11
+ dill==0.4.0
12
+ evaluate==0.4.6
13
+ exceptiongroup==1.3.0
14
+ filelock==3.20.0
15
+ frozenlist==1.8.0
16
+ fsspec==2025.10.0
17
+ h11==0.16.0
18
+ hf-xet==1.2.0
19
+ httpcore==1.0.9
20
+ httpx==0.28.1
21
+ huggingface-hub==0.36.0
22
+ idna==3.11
23
+ Jinja2==3.1.6
24
+ MarkupSafe==3.0.3
25
+ mpmath==1.3.0
26
+ multidict==6.7.0
27
+ multiprocess==0.70.18
28
+ networkx==3.4.2
29
+ numpy==2.2.6
30
+ nvidia-cublas-cu12==12.8.4.1
31
+ nvidia-cuda-cupti-cu12==12.8.90
32
+ nvidia-cuda-nvrtc-cu12==12.8.93
33
+ nvidia-cuda-runtime-cu12==12.8.90
34
+ nvidia-cudnn-cu12==9.10.2.21
35
+ nvidia-cufft-cu12==11.3.3.83
36
+ nvidia-cufile-cu12==1.13.1.3
37
+ nvidia-curand-cu12==10.3.9.90
38
+ nvidia-cusolver-cu12==11.7.3.90
39
+ nvidia-cusparse-cu12==12.5.8.93
40
+ nvidia-cusparselt-cu12==0.7.1
41
+ nvidia-nccl-cu12==2.27.5
42
+ nvidia-nvjitlink-cu12==12.8.93
43
+ nvidia-nvshmem-cu12==3.3.20
44
+ nvidia-nvtx-cu12==12.8.90
45
+ packaging==25.0
46
+ pandas==2.3.3
47
+ propcache==0.4.1
48
+ psutil==7.1.3
49
+ pyarrow==22.0.0
50
+ python-dateutil==2.9.0.post0
51
+ pytz==2025.2
52
+ PyYAML==6.0.3
53
+ regex==2025.11.3
54
+ requests==2.32.5
55
+ safetensors==0.6.2
56
+ six==1.17.0
57
+ sniffio==1.3.1
58
+ sympy==1.14.0
59
+ tokenizers==0.22.1
60
+ torch==2.9.0
61
+ tqdm==4.67.1
62
+ transformers==4.57.1
63
+ triton==3.5.0
64
+ trl==0.25.0
65
+ typing_extensions==4.15.0
66
+ tzdata==2025.2
67
+ urllib3==2.5.0
68
+ xxhash==3.6.0
69
+ yarl==1.22.0
70
+ accelerate==1.11.0
71
+ aiohappyeyeballs==2.6.1
72
+ aiohttp==3.13.2
73
+ aiosignal==1.4.0
74
+ anyio==4.11.0
75
+ async-timeout==5.0.1
76
+ attrs==25.4.0
77
+ certifi==2025.11.12
78
+ charset-normalizer==3.4.4
79
+ contourpy==1.3.2
80
+ cycler==0.12.1
81
+ datasets==4.4.1
82
+ dill==0.4.0
83
+ evaluate==0.4.6
84
+ exceptiongroup==1.3.0
85
+ filelock==3.20.0
86
+ fonttools==4.60.1
87
+ frozenlist==1.8.0
88
+ fsspec==2025.10.0
89
+ h11==0.16.0
90
+ hf-xet==1.2.0
91
+ httpcore==1.0.9
92
+ httpx==0.28.1
93
+ huggingface-hub==0.36.0
94
+ idna==3.11
95
+ Jinja2==3.1.6
96
+ kiwisolver==1.4.9
97
+ MarkupSafe==3.0.3
98
+ matplotlib==3.10.7
99
+ mpmath==1.3.0
100
+ multidict==6.7.0
101
+ multiprocess==0.70.18
102
+ networkx==3.4.2
103
+ numpy==2.2.6
104
+ nvidia-cublas-cu12==12.8.4.1
105
+ nvidia-cuda-cupti-cu12==12.8.90
106
+ nvidia-cuda-nvrtc-cu12==12.8.93
107
+ nvidia-cuda-runtime-cu12==12.8.90
108
+ nvidia-cudnn-cu12==9.10.2.21
109
+ nvidia-cufft-cu12==11.3.3.83
110
+ nvidia-cufile-cu12==1.13.1.3
111
+ nvidia-curand-cu12==10.3.9.90
112
+ nvidia-cusolver-cu12==11.7.3.90
113
+ nvidia-cusparse-cu12==12.5.8.93
114
+ nvidia-cusparselt-cu12==0.7.1
115
+ nvidia-nccl-cu12==2.27.5
116
+ nvidia-nvjitlink-cu12==12.8.93
117
+ nvidia-nvshmem-cu12==3.3.20
118
+ nvidia-nvtx-cu12==12.8.90
119
+ packaging==25.0
120
+ pandas==2.3.3
121
+ pillow==12.0.0
122
+ propcache==0.4.1
123
+ psutil==7.1.3
124
+ pyarrow==22.0.0
125
+ pyparsing==3.2.5
126
+ python-dateutil==2.9.0.post0
127
+ pytz==2025.2
128
+ PyYAML==6.0.3
129
+ regex==2025.11.3
130
+ requests==2.32.5
131
+ safetensors==0.6.2
132
+ six==1.17.0
133
+ sniffio==1.3.1
134
+ sympy==1.14.0
135
+ tokenizers==0.22.1
136
+ torch==2.9.0
137
+ tqdm==4.67.1
138
+ transformers==4.57.1
139
+ triton==3.5.0
140
+ trl==0.25.0
141
+ typing_extensions==4.15.0
142
+ tzdata==2025.2
143
+ urllib3==2.5.0
144
+ xxhash==3.6.0
145
+ yarl==1.22.0