semran1 committed on
Commit
bb7ba3e
·
verified ·
1 Parent(s): e008a8f

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
chat_template.jinja ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
{#- ChatML-style chat template with tool calling and <think> reasoning blocks.
    Inputs read by this template: messages, tools (optional), add_generation_prompt,
    enable_thinking (optional). Every tag uses leading whitespace control, so the
    indentation below never reaches the rendered prompt. #}
{%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0].role == 'system' %}
        {{- messages[0].content + '\n\n' }}
    {%- endif %}
    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
    {%- for tool in tools %}
        {{- "\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
    {%- if messages[0].role == 'system' %}
        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
    {%- endif %}
{%- endif %}
{#- Walk the conversation backwards to find the index of the last user message that
    is not itself a wrapped <tool_response>; assistant turns after that index keep
    their reasoning block when rendered. #}
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
{%- for message in messages[::-1] %}
    {%- set index = (messages|length - 1) - loop.index0 %}
    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
        {%- set ns.multi_step_tool = false %}
        {%- set ns.last_query_index = index %}
    {%- endif %}
{%- endfor %}
{%- for message in messages %}
    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
    {%- elif message.role == "assistant" %}
        {#- Assistant content may be missing or non-string (e.g. None on a turn that
            only carries tool_calls). Normalise it to '' so the membership test and
            the string concatenations below cannot fail on it. #}
        {%- if message.content is string %}
            {%- set content = message.content %}
        {%- else %}
            {%- set content = '' %}
        {%- endif %}
        {%- set reasoning_content = '' %}
        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
            {%- set reasoning_content = message.reasoning_content %}
        {%- else %}
            {%- if '</think>' in content %}
                {#- Extract the reasoning first: it must be read from the full text
                    before `content` is narrowed to the part after </think>. #}
                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
            {%- endif %}
        {%- endif %}
        {%- if loop.index0 > ns.last_query_index %}
            {%- if loop.last or (not loop.last and reasoning_content) %}
                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
            {%- else %}
                {{- '<|im_start|>' + message.role + '\n' + content }}
            {%- endif %}
        {%- else %}
            {{- '<|im_start|>' + message.role + '\n' + content }}
        {%- endif %}
        {%- if message.tool_calls %}
            {%- for tool_call in message.tool_calls %}
                {%- if (loop.first and content) or (not loop.first) %}
                    {{- '\n' }}
                {%- endif %}
                {%- if tool_call.function %}
                    {%- set tool_call = tool_call.function %}
                {%- endif %}
                {{- '<tool_call>\n{"name": "' }}
                {{- tool_call.name }}
                {{- '", "arguments": ' }}
                {%- if tool_call.arguments is string %}
                    {{- tool_call.arguments }}
                {%- else %}
                    {{- tool_call.arguments | tojson }}
                {%- endif %}
                {{- '}\n</tool_call>' }}
            {%- endfor %}
        {%- endif %}
        {{- '<|im_end|>\n' }}
    {%- elif message.role == "tool" %}
        {#- Consecutive tool messages are merged into a single user turn. #}
        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
            {{- '<|im_start|>user' }}
        {%- endif %}
        {{- '\n<tool_response>\n' }}
        {{- message.content }}
        {{- '\n</tool_response>' }}
        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
            {{- '<|im_end|>\n' }}
        {%- endif %}
    {%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
    {{- '<|im_start|>assistant\n' }}
    {%- if enable_thinking is defined and enable_thinking is false %}
        {{- '<think>\n\n</think>\n\n' }}
    {%- endif %}
{%- endif %}
config.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_moe_implementation": "fused",
3
+ "architectures": [
4
+ "BailingMoeV2ForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_bailing_moe_v2.BailingMoeV2Config",
9
+ "AutoModel": "modeling_bailing_moe_v2.BailingMoeV2Model",
10
+ "AutoModelForCausalLM": "modeling_bailing_moe_v2.BailingMoeV2ForCausalLM"
11
+ },
12
+ "bos_token_id": 151643,
13
+ "dtype": "bfloat16",
14
+ "embedding_dropout": 0.0,
15
+ "eos_token_id": 151645,
16
+ "first_k_dense_replace": 1,
17
+ "head_dim": 128,
18
+ "hidden_act": "silu",
19
+ "hidden_size": 2048,
20
+ "initializer_range": 0.02,
21
+ "intermediate_size": 5120,
22
+ "max_position_embeddings": 32768,
23
+ "max_window_layers": 20,
24
+ "moe_intermediate_size": 512,
25
+ "moe_router_enable_expert_bias": true,
26
+ "moe_shared_expert_intermediate_size": 512,
27
+ "mtp_loss_scaling_factor": 0,
28
+ "n_group": 8,
29
+ "norm_topk_prob": true,
30
+ "num_attention_heads": 16,
31
+ "num_experts": 224,
32
+ "num_experts_per_tok": 8,
33
+ "num_hidden_layers": 30,
34
+ "num_key_value_heads": 4,
35
+ "num_nextn_predict_layers": 0,
36
+ "num_shared_experts": null,
37
+ "output_dropout": 0.0,
38
+ "output_router_logits": true,
39
+ "pad_token_id": null,
40
+ "partial_rotary_factor": 0.5,
41
+ "pruning_info": {
42
+ "original_experts": 256,
43
+ "original_model_path": "5kling-fuse_heal",
44
+ "pruned_experts": 224,
45
+ "pruning_date": "2026-01-16T05:26:11.661656",
46
+ "pruning_method": "MoP"
47
+ },
48
+ "quantize": false,
49
+ "rms_norm_eps": 1e-06,
50
+ "rope_scaling": null,
51
+ "rope_theta": 600000,
52
+ "routed_scaling_factor": 2.5,
53
+ "router_dtype": "fp32",
54
+ "score_function": "sigmoid",
55
+ "tie_word_embeddings": false,
56
+ "topk_group": 4,
57
+ "transformers_version": "4.57.3",
58
+ "use_bias": false,
59
+ "use_cache": true,
60
+ "use_qk_norm": true,
61
+ "use_qkv_bias": false,
62
+ "use_rmsnorm": true,
63
+ "vocab_size": 151936
64
+ }
configuration_bailing_moe_v2.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Bailing MoE V2 model configuration"""
2
+
3
+ from transformers.configuration_utils import PretrainedConfig
4
+
5
+
6
class BailingMoeV2Config(PretrainedConfig):
    """Hyper-parameter container for the Bailing MoE V2 model.

    Each keyword argument is stored unchanged as an attribute of the same
    name, with one exception: a falsy ``head_dim`` is replaced by
    ``hidden_size // num_attention_heads``.

    ``pad_token_id``, ``eos_token_id`` and ``tie_word_embeddings`` are not
    assigned here; they are forwarded, together with any extra ``**kwargs``,
    to ``PretrainedConfig.__init__``.
    """

    def __init__(
        self,
        vocab_size=157184,
        hidden_size=2048,
        intermediate_size=5120,
        num_hidden_layers=20,
        num_attention_heads=16,
        num_key_value_heads=4,
        hidden_act="silu",
        use_qkv_bias=False,  # bailing only
        use_bias=False,  # bailing only
        rms_norm_eps=1e-06,
        tie_word_embeddings=False,  # PretrainedConfig key, here change default value.
        embedding_dropout=0.0,
        attention_dropout=0.0,
        output_dropout=0.0,
        initializer_range=0.02,
        max_position_embeddings=32768,
        rope_theta=600000.0,
        use_cache=True,
        max_window_layers=20,
        rope_scaling=None,
        pad_token_id=156892,
        eos_token_id=156892,
        num_experts=256,
        num_shared_experts=1,
        num_experts_per_tok=8,
        n_group=8,
        topk_group=4,
        moe_intermediate_size=512,
        first_k_dense_replace=1,
        head_dim=128,
        output_router_logits=False,
        use_qk_norm=True,
        num_nextn_predict_layers=0,
        mtp_loss_scaling_factor=0,
        moe_router_enable_expert_bias=True,
        routed_scaling_factor=1.0,
        **kwargs,
    ):
        # Fall back to the conventional per-head width when no explicit
        # (truthy) head_dim was supplied.
        if not head_dim:
            head_dim = hidden_size // num_attention_heads

        # NOTE: attributes are assigned in the same order as before so the
        # instance __dict__ keeps its original ordering.

        # --- Transformer backbone ---
        self.num_hidden_layers = num_hidden_layers
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.use_qkv_bias = use_qkv_bias
        self.use_bias = use_bias
        self.rms_norm_eps = rms_norm_eps

        # --- Dropout and multi-token-prediction settings ---
        self.embedding_dropout = embedding_dropout
        self.attention_dropout = attention_dropout
        self.output_dropout = output_dropout
        self.num_nextn_predict_layers = num_nextn_predict_layers
        self.mtp_loss_scaling_factor = mtp_loss_scaling_factor

        # --- Initialisation, positions and attention details ---
        self.initializer_range = initializer_range
        self.max_position_embeddings = max_position_embeddings
        self.rope_theta = rope_theta
        self.use_cache = use_cache
        self.max_window_layers = max_window_layers
        self.head_dim = head_dim
        self.rope_scaling = rope_scaling
        self.use_qk_norm = use_qk_norm
        self.moe_router_enable_expert_bias = moe_router_enable_expert_bias
        self.routed_scaling_factor = routed_scaling_factor

        # --- Mixture-of-experts settings ---
        self.num_experts = num_experts
        self.num_shared_experts = num_shared_experts
        self.num_experts_per_tok = num_experts_per_tok
        self.n_group = n_group
        self.topk_group = topk_group
        self.moe_intermediate_size = moe_intermediate_size
        self.first_k_dense_replace = first_k_dense_replace
        self.output_router_logits = output_router_logits

        # Token ids and embedding tying are handled by the base class.
        super().__init__(
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:477dceb109a31fafbc9c32783e4912e2070946f66395f25892ade6dd98e7ae20
3
+ size 42835389472
smart_upcycle.py ADDED
@@ -0,0 +1,344 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Smart Importance-Based MoE Upcycler (v2.1 - Strict MoE Detection)
4
+
5
+ Updates:
6
+ - FIXED: Layer 0 (Dense) misidentification. Now distinguishes between SwiGLU gates and MoE Routers.
7
+ - ENFORCED: Model 2 (The Stack) strictly forbids Dense layers.
8
+
9
+ Usage:
10
+ python smart_upcycle.py \
11
+ --model_path inclusionAI/Ling-mini-2.0 \
12
+ --output_path ./ling-mini-30L-upcycled \
13
+ --target_layers 30 \
14
+ --model1_ratio 0.55
15
+
16
+ Author: Claude (Anthropic)
17
+ """
18
+
19
+ import argparse
20
+ import os
21
+ import shutil
22
+ import gc
23
+ import logging
24
+ from pathlib import Path
25
+ from typing import Dict, List, Tuple
26
+ from collections import defaultdict
27
+
28
# Configure logging before the heavy imports so that a missing dependency is
# reported through the same logger as everything else.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - [%(levelname)s] - %(message)s',
    datefmt='%H:%M:%S'
)
logger = logging.getLogger("SmartUpcycler")

try:
    import torch
    import torch.nn as nn
    from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
    from safetensors.torch import save_file
    from datasets import load_dataset
    from tqdm import tqdm
except ImportError as e:
    logger.error(f"Missing dependency: {e}")
    logger.error("pip install torch transformers safetensors datasets tqdm accelerate")
    # `exit()` is injected by the `site` module and is not guaranteed to exist
    # (e.g. `python -S`, embedded interpreters); SystemExit always is.
    raise SystemExit(1)
47
+
48
class LayerAnalyzer:
    """Analyzes model layers with strict MoE vs Dense differentiation.

    Two services are offered:

    * ``identify_layer_types`` splits the decoder layers into MoE and dense
      indices using structural heuristics.
    * ``compute_importance`` runs calibration text through the model and
      scores each layer by how much it changes its input
      (``1 - cosine_similarity(input, output)``).
    """

    # Attribute names probed, in this order, to find a layer's feed-forward block.
    _MLP_ATTR_CANDIDATES = ('mlp', 'block_sparse_moe', 'feed_forward', 'ffn')

    def __init__(self, model, tokenizer, device='cuda'):
        """
        Args:
            model: Model exposing its decoder layers as ``model.model.layers``
                or ``model.layers``.
            tokenizer: Callable used to tokenize calibration strings.
            device: Device the tokenized calibration inputs are moved to.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.layer_data = defaultdict(list)  # layer index -> per-sample scores
        self.hooks = []  # live forward-hook handles (empty outside calibration)

    def get_layers(self):
        """Return the sequence of decoder layers.

        Raises:
            ValueError: if neither ``model.model.layers`` nor ``model.layers`` exists.
        """
        inner = getattr(self.model, 'model', None)
        if inner is not None and hasattr(inner, 'layers'):
            return inner.layers
        if hasattr(self.model, 'layers'):
            return self.model.layers
        raise ValueError("Unsupported model architecture: cannot find .layers")

    def _dense_prefix_length(self) -> int:
        """Number of leading layers that are always classified as dense.

        Uses ``model.config.first_k_dense_replace`` when it is an integer
        greater than one; otherwise returns 1, so layer 0 is always treated as
        dense (the behaviour this class has always had).
        """
        cfg = getattr(self.model, 'config', None)
        first_k = getattr(cfg, 'first_k_dense_replace', None)
        if isinstance(first_k, int) and not isinstance(first_k, bool) and first_k > 1:
            return first_k
        return 1

    @staticmethod
    def _looks_like_moe(module) -> bool:
        """Heuristic MoE test for a feed-forward module.

        A bare ``gate`` attribute is deliberately NOT checked because SwiGLU
        MLPs have ``gate_proj``.
        """
        experts = getattr(module, 'experts', None)
        if experts is not None:
            try:
                if len(experts) > 1:
                    return True
            except TypeError:
                # Expert container without __len__; rely on the other signals.
                pass

        num_experts = getattr(module, 'num_experts', None)
        if num_experts is not None and num_experts > 1:
            return True

        # Weakest signal: this also matches any class whose name merely
        # contains "moe"/"sparse" -- presumably the reason leading layers
        # need the dense-prefix override in identify_layer_types.
        class_name = type(module).__name__.lower()
        return 'moe' in class_name or 'sparse' in class_name

    def identify_layer_types(self) -> Tuple[List[int], List[int]]:
        """
        Scans architecture with heuristic specifically tuned to avoid SwiGLU false positives.

        Layers whose index is below the dense prefix (see
        ``_dense_prefix_length``) are always reported as dense, regardless of
        what the heuristics say. Layers without any recognised feed-forward
        attribute are reported as dense as well.

        Returns: (moe_indices, dense_indices)
        """
        moe_indices: List[int] = []
        dense_indices: List[int] = []
        dense_prefix = self._dense_prefix_length()

        for idx, layer in enumerate(self.get_layers()):
            # First matching attribute wins, mirroring the candidate order.
            module = next(
                (getattr(layer, name) for name in self._MLP_ATTR_CANDIDATES if hasattr(layer, name)),
                None,
            )
            is_moe = (
                idx >= dense_prefix
                and module is not None
                and self._looks_like_moe(module)
            )
            if is_moe:
                moe_indices.append(idx)
            else:
                dense_indices.append(idx)

        return moe_indices, dense_indices

    def compute_importance(self, calibration_data: List[str]) -> Dict[int, float]:
        """
        Calculates layer importance using Cosine Similarity.
        Score = 1.0 - CosSim(Input, Output)

        A forward hook on every layer records one score per calibration sample;
        the returned value per layer is the mean over samples. Hooks are always
        removed again, even if the forward pass raises.

        Args:
            calibration_data: Raw text samples; each is truncated to 512 tokens.

        Returns:
            Mapping of layer index to mean score. Layers whose hook never
            produced a score are absent from the mapping.
        """
        logger.info(f"Computing importance using {len(calibration_data)} samples...")
        layers = self.get_layers()

        def get_activation_hook(idx):
            def hook(module, input, output):
                if isinstance(input, tuple):
                    if not input:
                        # Layer was called with keyword arguments only; there
                        # is no positional input to compare against.
                        return
                    inp = input[0]
                else:
                    inp = input
                out = output[0] if isinstance(output, tuple) else output

                with torch.no_grad():
                    # Flatten to [Batch * Seq, Hidden]; reshape (not view) so
                    # non-contiguous activations are accepted.
                    inp_flat = inp.reshape(-1, inp.size(-1)).float()
                    out_flat = out.reshape(-1, out.size(-1)).float()

                    # Compare on the device the layer actually ran on instead
                    # of forcing CUDA.
                    if inp_flat.device != out_flat.device:
                        inp_flat = inp_flat.to(out_flat.device)

                    # Compute mean cosine similarity for this batch
                    cos = torch.nn.functional.cosine_similarity(inp_flat, out_flat, dim=-1)
                    # Higher similarity = Lower importance
                    score = 1.0 - cos.mean().item()
                self.layer_data[idx].append(score)
            return hook

        # Start from a clean slate so an earlier, aborted run cannot leak scores.
        self.layer_data.clear()
        self.hooks = [
            layer.register_forward_hook(get_activation_hook(idx))
            for idx, layer in enumerate(layers)
        ]

        try:
            self.model.eval()
            with torch.no_grad():
                for text in tqdm(calibration_data, desc="Calibrating"):
                    inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
                    inputs = {k: v.to(self.device) for k, v in inputs.items()}
                    self.model(**inputs)

            final_scores = {
                idx: sum(scores) / len(scores)
                for idx, scores in self.layer_data.items()
            }
        finally:
            for h in self.hooks:
                h.remove()
            self.hooks.clear()
            self.layer_data.clear()

        return final_scores
158
+
159
class SmartUpcycler:
    """Builds a deeper checkpoint by re-stacking layers of an existing model.

    The new layer stack is "Model 1" (a mixed selection that always keeps the
    first and last two original layers) followed by "Model 2" (MoE layers
    only). Weights are copied from the original state dict under renumbered
    ``model.layers.<i>.`` keys.
    """

    def __init__(self, model_path: str, device: str = 'auto'):
        """
        Args:
            model_path: Path or hub id passed to the ``from_pretrained`` loaders.
            device: Value forwarded as ``device_map`` when loading the model.
        """
        self.model_path = model_path
        self.device = device
        self.config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    def load_model(self):
        """Load the source model in bfloat16 and return it."""
        logger.info(f"Loading model from {self.model_path}...")
        return AutoModelForCausalLM.from_pretrained(
            self.model_path,
            torch_dtype=torch.bfloat16,
            device_map=self.device,
            trust_remote_code=True,
            low_cpu_mem_usage=True
        )

    def create_layer_plan(self,
                          importance_scores: Dict[int, float],
                          moe_indices: List[int],
                          dense_indices: List[int],
                          total_original: int,
                          target_total: int,
                          m1_count: int,
                          m2_count: int) -> Tuple[List[int], List[int]]:
        """Choose which original layers feed each part of the new stack.

        Args:
            importance_scores: Layer index -> importance; missing layers count as 0.
            moe_indices: Indices of layers eligible for Model 2.
            dense_indices: Unused; kept for interface compatibility.
            total_original: Number of layers in the source model.
            target_total: Unused; kept for interface compatibility.
            m1_count: Desired number of Model 1 layers.
            m2_count: Desired number of Model 2 layers.

        Returns:
            ``(selected_m1, selected_m2)``. ``selected_m1`` is sorted ascending.
            ``selected_m2`` is sorted ascending, followed (only when there are
            fewer MoE layers than ``m2_count``) by repeats taken in descending
            importance order.

        Raises:
            ValueError: if ``moe_indices`` is empty.
        """
        # --- Model 1 (Base) ---
        # Strategy: Keep First 2, Last 2 (Stability), then fill with best remaining layers (Dense OR MoE)
        # Indices are clipped to the valid range so models with fewer than
        # four layers do not produce negative or out-of-range entries.
        structural_layers = {
            i for i in (0, 1, total_original - 2, total_original - 1)
            if 0 <= i < total_original
        }
        if m1_count < len(structural_layers):
            logger.warning(
                f"Model 1 budget ({m1_count}) is smaller than the {len(structural_layers)} "
                "structural layers that are always kept; the final stack will exceed the target."
            )

        m1_candidates = [i for i in range(total_original) if i not in structural_layers]
        # Sort by importance
        m1_candidates.sort(key=lambda x: importance_scores.get(x, 0), reverse=True)

        needed_m1 = max(0, m1_count - len(structural_layers))
        selected_m1 = sorted(structural_layers) + m1_candidates[:needed_m1]
        selected_m1.sort()
        if len(selected_m1) < m1_count:
            logger.warning(
                f"Model 1 requested {m1_count} layers but only {len(selected_m1)} exist in the source model."
            )

        # --- Model 2 (Extension) ---
        # Strategy: STRICTLY MoE layers only.
        if not moe_indices:
            raise ValueError("Model has no MoE layers! Cannot fulfill constraint.")

        m2_candidates = sorted(moe_indices, key=lambda x: importance_scores.get(x, 0), reverse=True)
        selected_m2 = sorted(m2_candidates[:m2_count])

        # Handle shortage by duplication if necessary
        if len(selected_m2) < m2_count:
            logger.warning(f"Not enough unique MoE layers (Found {len(selected_m2)}, Needed {m2_count}).")
            logger.warning("Recycling top MoE layers to fill the gap.")
            # Cycle through the MoE layers again, most important first.
            cursor = 0
            while len(selected_m2) < m2_count:
                selected_m2.append(m2_candidates[cursor % len(m2_candidates)])
                cursor += 1

        return selected_m1, selected_m2

    def build_and_save(self,
                       original_state_dict,
                       m1_layers: List[int],
                       m2_layers: List[int],
                       output_path: Path):
        """Assemble the re-stacked state dict and write the checkpoint.

        Writes the config (with ``num_hidden_layers`` updated), the tokenizer,
        ``model.safetensors`` and a copy of this script into ``output_path``.

        Raises:
            ValueError: if a requested source layer has no tensors under
                ``model.layers.<idx>.`` in ``original_state_dict``.
        """
        logger.info("Constructing new state dictionary...")
        new_state_dict = {}
        layer_prefix = "model.layers."

        # Single pass over the state dict: copy non-layer weights right away
        # and bucket layer weights by source index, so each destination layer
        # is a lookup instead of another full scan.
        per_layer = defaultdict(list)  # source index -> [(key suffix, tensor), ...]
        skipped = 0
        for key, tensor in original_state_dict.items():
            if key.startswith(layer_prefix):
                idx_text, _, suffix = key[len(layer_prefix):].partition('.')
                if idx_text.isdigit() and suffix:
                    per_layer[int(idx_text)].append((suffix, tensor))
                    continue
            # 1. Copy Non-Layer Weights
            if "layers." not in key:
                new_state_dict[key] = tensor.clone()
            else:
                skipped += 1
        if skipped:
            logger.warning(f"Ignored {skipped} tensors with unrecognised 'layers.' keys.")

        def map_layer(src_idx, dst_idx):
            entries = per_layer.get(src_idx)
            if not entries:
                raise ValueError(f"No weights found for source layer {src_idx}.")
            dst_prefix = f"{layer_prefix}{dst_idx}."
            for suffix, tensor in entries:
                # clone() so a layer reused several times gets independent storage.
                new_state_dict[dst_prefix + suffix] = tensor.clone()

        # 2. Stack Model 1
        current_layer_idx = 0
        print(f"\n{'='*25} STACK PLAN {'='*25}")
        print(f"{'Order':<5} | {'Dest':<5} | {'Source':<6} | {'Type'}")
        print("-" * 50)

        for src in m1_layers:
            map_layer(src, current_layer_idx)
            print(f"{'M1':<5} | {current_layer_idx:<5} <- {src:<6} | {'Base Mixed'}")
            current_layer_idx += 1

        # 3. Stack Model 2
        for src in m2_layers:
            map_layer(src, current_layer_idx)
            print(f"{'M2':<5} | {current_layer_idx:<5} <- {src:<6} | {'MoE ONLY'}")
            current_layer_idx += 1

        # 4. Save
        output_path.mkdir(parents=True, exist_ok=True)
        self.config.num_hidden_layers = current_layer_idx
        self.config.save_pretrained(output_path)
        self.tokenizer.save_pretrained(output_path)

        logger.info(f"Saving model.safetensors to {output_path}...")
        save_file(new_state_dict, os.path.join(output_path, "model.safetensors"))
        # Ship the script alongside the checkpoint for reproducibility.
        shutil.copy(__file__, output_path)
272
+
273
def load_calibration_samples(name='wikitext', split='train', n=128):
    """Return up to ``n`` calibration strings.

    Loads the ``wikitext-2-raw-v1`` configuration of dataset ``name`` and keeps
    rows whose ``text`` is longer than 200 characters. If loading or iterating
    fails, or no row passes the filter, ``n`` copies of a dummy string are
    returned instead so calibration can still run.

    Args:
        name: Dataset identifier passed to ``load_dataset``.
        split: Dataset split to read.
        n: Number of samples wanted.

    Returns:
        A list of strings. It can be shorter than ``n`` when the dataset has
        fewer qualifying rows.
    """
    fallback = ["Calibration string." * 50] * n
    try:
        data = load_dataset(name, 'wikitext-2-raw-v1', split=split, trust_remote_code=True)
        samples = []
        for row in data:
            if len(row['text']) > 200:
                samples.append(row['text'])
                if len(samples) >= n:
                    break
    except Exception as exc:
        # Deliberate best-effort fallback, but report the cause so that
        # dummy-data calibration is never a silent surprise.
        logger.warning(f"Could not load {name} ({exc!r}). Using dummy data.")
        return fallback

    if not samples:
        # An empty list would make every layer's importance default to 0.
        logger.warning(f"No usable samples found in {name}. Using dummy data.")
        return fallback
    return samples
285
+
286
def main():
    """Command-line entry point: analyse a model, plan the new stack, save it.

    Steps: parse arguments, load the model, classify layers as MoE/dense,
    score layer importance (or use uniform scores with ``--no_calibration``),
    build the layer plan, then write the re-stacked checkpoint.
    """
    parser = argparse.ArgumentParser(description="Smart MoE Upcycler")
    parser.add_argument('--model_path', type=str, required=True)
    parser.add_argument('--output_path', type=str, required=True)
    parser.add_argument('--target_layers', type=int, default=30)
    parser.add_argument('--model1_ratio', type=float, default=0.55)
    parser.add_argument('--no_calibration', action='store_true')
    args = parser.parse_args()

    # Reject values that cannot yield a meaningful split before loading a
    # large model.
    if args.target_layers < 1:
        parser.error("--target_layers must be a positive integer")
    if not 0.0 <= args.model1_ratio <= 1.0:
        parser.error("--model1_ratio must be between 0 and 1")

    # 1. Setup
    m1_count = int(args.target_layers * args.model1_ratio)
    m2_count = args.target_layers - m1_count

    logger.info(f"Target: {args.target_layers} Layers. Split: M1={m1_count}, M2={m2_count} (Strict MoE)")

    upcycler = SmartUpcycler(args.model_path)
    model = upcycler.load_model()

    # 2. Analyze
    # Feed calibration inputs to wherever the model was actually placed rather
    # than assuming CUDA; fall back to a CUDA-availability check if the model
    # does not expose a device.
    input_device = getattr(model, 'device', None)
    if input_device is None:
        input_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    analyzer = LayerAnalyzer(model, upcycler.tokenizer, device=input_device)
    moe_indices, dense_indices = analyzer.identify_layer_types()
    total_orig = len(analyzer.get_layers())

    logger.info(f"Scan Results: {len(moe_indices)} MoE layers, {len(dense_indices)} Dense layers.")
    if len(dense_indices) > 0:
        logger.info(f"Verified Dense Layers: {dense_indices}")

    # 3. Compute Importance
    if args.no_calibration:
        logger.info("Skipping calibration. Using uniform importance.")
        scores = {i: 1.0 for i in range(total_orig)}
    else:
        samples = load_calibration_samples()
        scores = analyzer.compute_importance(samples)

    # 4. Plan
    m1_layers, m2_layers = upcycler.create_layer_plan(
        scores,
        moe_indices,
        dense_indices,
        total_orig,
        args.target_layers,
        m1_count,
        m2_count
    )

    # 5. Execute
    logger.info("Moving model to CPU...")
    model.cpu()
    state_dict = model.state_dict()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

    upcycler.build_and_save(state_dict, m1_layers, m2_layers, Path(args.output_path))
    logger.info("Done.")


if __name__ == "__main__":
    main()
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:352a863cd2761388ccc58f1432467ba6a1037bf12df9069889b142fa246471f6
3
+ size 11422752
tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "clean_up_tokenization_spaces": false,
231
+ "eos_token": "<|im_end|>",
232
+ "errors": "replace",
233
+ "extra_special_tokens": {},
234
+ "model_max_length": 131072,
235
+ "pad_token": "<|endoftext|>",
236
+ "split_special_tokens": false,
237
+ "tokenizer_class": "Qwen2Tokenizer",
238
+ "unk_token": null
239
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff