Brain2nd committed on
Commit
95a79b1
·
verified ·
1 Parent(s): 23666d5

Upload V4 1.16B pretrain checkpoint step10500

Browse files
README.md ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ tags:
4
+ - neuronspark
5
+ - snn
6
+ - causal-lm
7
+ - pretrain
8
+ - deepspeed
9
+ - checkpoint
10
+ ---
11
+
12
+ # NeuronSpark-V4-1.16B-Pretrain
13
+
14
+ NeuronSpark V4 autoregressive pretraining checkpoint.
15
+
16
+ This repository contains a complete training checkpoint for continued pretraining, not only inference weights.
17
+
18
+ ## Checkpoint
19
+
20
+ - Architecture: NeuronSpark V4 causal language model
21
+ - Scale: 1.16B parameters
22
+ - Checkpoint step: 10500
23
+ - Tokens seen: 2,063,372,760 (supervised tokens)
24
+ - Sequence length: 2048
25
+ - Training mode: autoregressive pretraining
26
+ - Optimizer: Muon + Adam + Lion
27
+ - DeepSpeed: ZeRO stage 0 (no ZeRO partitioning of optimizer state, gradients, or parameters)
28
+ - Precision: bf16 training path
29
+
30
+ ## Included Files
31
+
32
+ - `model.safetensors`: Hugging Face model weights for loading/evaluation.
33
+ - `config.json`, `configuration_neuronspark.py`, `modeling_neuronspark.py`: self-contained custom model code/config.
34
+ - `tokenizer.json`, `tokenizer_config.json`, `chat_template.jinja`: tokenizer assets.
35
+ - `training_state.pth`: saved training step and token counter.
36
+ - `deepspeed/`: DeepSpeed checkpoint state for continued training.
37
+
38
+ ## Continue Training
39
+
40
+ Download this repository (for example, as a snapshot), then resume with the original training script, pointing `--resume` at the downloaded directory:
41
+
42
+ ```bash
43
+ deepspeed --num_gpus=8 train_pretrain.py \
44
+ --config_json configs/smoke_1p16b.json \
45
+ --data_path <pretokenized_data_dir> \
46
+ --tokenizer_path tokenizer_v3 \
47
+ --out_dir <new_output_dir> \
48
+ --deepspeed_config configs/ds_zero0_v4.json \
49
+ --max_length 2048 \
50
+ --batch_size 12 \
51
+ --accumulation_steps 1 \
52
+ --optimizer muon_adam_lion \
53
+ --learning_rate 2e-4 \
54
+ --muon_lr 0.005 \
55
+ --lion_lr 1e-4 \
56
+ --warmup_iters 500 \
57
+ --grad_clip 0.5 \
58
+ --resume <downloaded_checkpoint_dir>
59
+ ```
60
+
61
+
62
+ ## Provenance
63
+
64
+ This is a V4 pretraining checkpoint from the current NeuronSpark V4 branch. It is not part of the historical V2.5/V3 checkpoint family.
chat_template.jinja ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
18
+ {%- for message in messages[::-1] %}
19
+ {%- set index = (messages|length - 1) - loop.index0 %}
20
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
21
+ {%- set ns.multi_step_tool = false %}
22
+ {%- set ns.last_query_index = index %}
23
+ {%- endif %}
24
+ {%- endfor %}
25
+ {%- for message in messages %}
26
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
27
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
28
+ {%- elif message.role == "assistant" %}
29
+ {%- set content = message.content %}
30
+ {%- set reasoning_content = '' %}
31
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
32
+ {%- set reasoning_content = message.reasoning_content %}
33
+ {%- else %}
34
+ {%- if '</think>' in message.content %}
35
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
36
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
37
+ {%- endif %}
38
+ {%- endif %}
39
+ {%- if loop.index0 > ns.last_query_index %}
40
+ {%- if loop.last or (not loop.last and reasoning_content) %}
41
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
42
+ {%- else %}
43
+ {{- '<|im_start|>' + message.role + '\n' + content }}
44
+ {%- endif %}
45
+ {%- else %}
46
+ {{- '<|im_start|>' + message.role + '\n' + content }}
47
+ {%- endif %}
48
+ {%- if message.tool_calls %}
49
+ {%- for tool_call in message.tool_calls %}
50
+ {%- if (loop.first and content) or (not loop.first) %}
51
+ {{- '\n' }}
52
+ {%- endif %}
53
+ {%- if tool_call.function %}
54
+ {%- set tool_call = tool_call.function %}
55
+ {%- endif %}
56
+ {{- '<tool_call>\n{"name": "' }}
57
+ {{- tool_call.name }}
58
+ {{- '", "arguments": ' }}
59
+ {%- if tool_call.arguments is string %}
60
+ {{- tool_call.arguments }}
61
+ {%- else %}
62
+ {{- tool_call.arguments | tojson }}
63
+ {%- endif %}
64
+ {{- '}\n</tool_call>' }}
65
+ {%- endfor %}
66
+ {%- endif %}
67
+ {{- '<|im_end|>\n' }}
68
+ {%- elif message.role == "tool" %}
69
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
70
+ {{- '<|im_start|>user' }}
71
+ {%- endif %}
72
+ {{- '\n<tool_response>\n' }}
73
+ {{- message.content }}
74
+ {{- '\n</tool_response>' }}
75
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
76
+ {{- '<|im_end|>\n' }}
77
+ {%- endif %}
78
+ {%- endif %}
79
+ {%- endfor %}
80
+ {%- if add_generation_prompt %}
81
+ {{- '<|im_start|>assistant\n' }}
82
+ {%- if enable_thinking is defined and enable_thinking is false %}
83
+ {{- '<think>\n\n</think>\n\n' }}
84
+ {%- endif %}
85
+ {%- endif %}
config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "D": 1024,
3
+ "D_ff": 3072,
4
+ "D_key": 128,
5
+ "D_value": 128,
6
+ "K": 12,
7
+ "N": 8,
8
+ "ahp_init": 0.0,
9
+ "architectures": [
10
+ "NeuronSparkForCausalLM"
11
+ ],
12
+ "auto_map": {
13
+ "AutoConfig": "configuration_neuronspark.NeuronSparkConfig",
14
+ "AutoModelForCausalLM": "modeling_neuronspark.NeuronSparkForCausalLM"
15
+ },
16
+ "bias_balancing_ema": 0.99,
17
+ "bias_balancing_lr": 0.001,
18
+ "bos_token_id": 1,
19
+ "dtype": "bfloat16",
20
+ "eos_token_id": 2,
21
+ "eps_explore": 0.05,
22
+ "k_predictor_hidden": 256,
23
+ "memory_layer_interval": 4,
24
+ "model_type": "neuronspark",
25
+ "num_hidden_layers": 24,
26
+ "num_layers": 24,
27
+ "ponder_T_final": 0.3,
28
+ "ponder_T_init": 2.0,
29
+ "rope_layout": "transformer_interleaved",
30
+ "spike_mode": "quantal",
31
+ "surrogate_alpha": 4.0,
32
+ "transformers_version": "5.6.2",
33
+ "use_ahp": false,
34
+ "use_cache": false,
35
+ "v_th_min": 0.02,
36
+ "v_th_reg_weight": 0.0,
37
+ "vocab_size": 128387
38
+ }
configuration_neuronspark.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PretrainedConfig
2
+
3
+
4
+ class NeuronSparkConfig(PretrainedConfig):
5
+ model_type = "neuronspark"
6
+
7
+ def __init__(
8
+ self,
9
+ vocab_size=64002,
10
+ D=1024,
11
+ N=8,
12
+ K=12,
13
+ num_layers=24,
14
+ D_ff=3072,
15
+ v_th_min=0.02, # selective threshold floor; mainline v4.1 uses quantal release
16
+ memory_layer_interval=4,
17
+ D_key=128,
18
+ D_value=128,
19
+ # SNNAttention RoPE channel pairing. New V4 runs use the mainstream
20
+ # Transformer even/odd interleaved layout; "native" is kept only for
21
+ # loading historical checkpoints trained with the old half-split layout.
22
+ rope_layout="transformer_interleaved",
23
+ # 神经元发放形式 (v4.1 — 见 docs/v4_status_and_roadmap.md §神经元设计)
24
+ # "quantal" = 当前 V4 架构语义: output = v_th·𝟙[V_pre>v_th],
25
+ # V_post = V_pre - v_th·𝟙[...] (use_ahp=False 时), 剩余余量留膜里。
26
+ # "supra" = v3 / early-v4 的 bio-ReLU 历史形式, 仅用于 ablation-only 对照。
27
+ spike_mode="quantal",
28
+ # surrogate gradient α (sigmoid surrogate, 仅 spike_mode="quantal" 时用于 output 的反向)
29
+ surrogate_alpha=4.0,
30
+ # 后超极化 (AHP / 不应期): 发放后膜额外下压 ahp (per-channel 可学), V_post -= ahp·𝟙[V_pre>v_th]
31
+ use_ahp=False,
32
+ ahp_init=0.0, # ahp 参数初始值 (per-channel scalar)
33
+ # PLIFNode.v_th 朝 init 的二次正则 (打破 v_th↔下游W 的尺度冗余: v_th 漂到 floor → W_in 补偿性暴涨 → SNNBlock 膜 runaway → NaN). 0 = 关.
34
+ v_th_reg_weight=0.0,
35
+ # v3 PonderNet fields (input-conditioned KPredictor)
36
+ k_predictor_hidden=None,
37
+ ponder_T_init=2.0,
38
+ ponder_T_final=0.3,
39
+ eps_explore=0.05,
40
+ bias_balancing_lr=1e-3,
41
+ bias_balancing_ema=0.99,
42
+ bos_token_id=1,
43
+ eos_token_id=2,
44
+ **kwargs,
45
+ ):
46
+ self.vocab_size = vocab_size
47
+ self.D = D
48
+ self.N = N
49
+ self.K = K
50
+ self.num_layers = num_layers
51
+ # HF GenerationMixin / DynamicCache 期望 num_hidden_layers 字段
52
+ self.num_hidden_layers = num_layers
53
+ # SNN 没有 KV cache, 关掉避免 HF 试图建 DynamicCache
54
+ self.use_cache = False
55
+ self.D_ff = D_ff
56
+ self.v_th_min = v_th_min
57
+ self.memory_layer_interval = memory_layer_interval
58
+ self.D_key = D_key
59
+ self.D_value = D_value
60
+ self.rope_layout = rope_layout
61
+ self.spike_mode = spike_mode
62
+ self.surrogate_alpha = surrogate_alpha
63
+ self.use_ahp = use_ahp
64
+ self.ahp_init = ahp_init
65
+ self.v_th_reg_weight = v_th_reg_weight
66
+ # v3 PonderNet
67
+ self.k_predictor_hidden = k_predictor_hidden
68
+ self.ponder_T_init = ponder_T_init
69
+ self.ponder_T_final = ponder_T_final
70
+ self.eps_explore = eps_explore
71
+ self.bias_balancing_lr = bias_balancing_lr
72
+ self.bias_balancing_ema = bias_balancing_ema
73
+
74
+ # auto_map: HF two-part "module_file.ClassName" entries; the module files sit at the repository root (no neuronspark/ subdirectory)
75
+ kwargs.setdefault("auto_map", {
76
+ "AutoConfig": "configuration_neuronspark.NeuronSparkConfig",
77
+ "AutoModelForCausalLM": "modeling_neuronspark.NeuronSparkForCausalLM",
78
+ })
79
+ kwargs.setdefault("architectures", ["NeuronSparkForCausalLM"])
80
+ kwargs.setdefault("dtype", "bfloat16")
81
+
82
+ super().__init__(
83
+ bos_token_id=bos_token_id,
84
+ eos_token_id=eos_token_id,
85
+ **kwargs,
86
+ )
deepspeed/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f4b85cadc39a51c86547603f28beaf6ec95c1f99d987ae52ccad8879a873bce
3
+ size 3289813267
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "output_attentions": false,
6
+ "output_hidden_states": false,
7
+ "transformers_version": "5.6.2",
8
+ "use_cache": false
9
+ }
latest ADDED
@@ -0,0 +1 @@
 
 
1
+ deepspeed
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3a22d378f69debb1e04ecfd6f77b1f98ad875968343098407c58124f3208fb2
3
+ size 2471985328
modeling_neuronspark.py ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_origin": "filtered from Qwen/Qwen3-1.7B-Base: dropped non-EN/ZH language tokens (23282 of 151643)",
3
+ "add_prefix_space": false,
4
+ "backend": "tokenizers",
5
+ "bos_token": null,
6
+ "clean_up_tokenization_spaces": false,
7
+ "eos_token": "<|endoftext|>",
8
+ "errors": "replace",
9
+ "extra_special_tokens": [
10
+ "<|im_start|>",
11
+ "<|im_end|>",
12
+ "<|object_ref_start|>",
13
+ "<|object_ref_end|>",
14
+ "<|box_start|>",
15
+ "<|box_end|>",
16
+ "<|quad_start|>",
17
+ "<|quad_end|>",
18
+ "<|vision_start|>",
19
+ "<|vision_end|>",
20
+ "<|vision_pad|>",
21
+ "<|image_pad|>",
22
+ "<|video_pad|>"
23
+ ],
24
+ "is_local": true,
25
+ "local_files_only": false,
26
+ "model_max_length": 131072,
27
+ "pad_token": "<|endoftext|>",
28
+ "split_special_tokens": false,
29
+ "tokenizer_class": "Qwen2Tokenizer",
30
+ "unk_token": null
31
+ }
training_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28b7ad0dcb88d196ebb5109088780dd0e90a4932ffed12a49cb9cf2325830ae1
3
+ size 1367