yujiepan commited on
Commit
0a2fba1
·
verified ·
1 Parent(s): b749f2c

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ base_model:
4
+ - zai-org/GLM-4.7-Flash
5
+ ---
6
+
7
+ This tiny model is intended for debugging. It is randomly initialized using the configuration adapted from [zai-org/GLM-4.7-Flash](https://huggingface.co/zai-org/GLM-4.7-Flash).
8
+
9
+ ### Example usage:
10
+
11
+ - vLLM
12
+
13
+ ```bash
14
+ # Multi-token prediction is supported
15
+ model_id=yujiepan/glm-4.7-flash-tiny-random
16
+ vllm serve $model_id \
17
+ --tensor-parallel-size 2 \
18
+ --speculative-config.method mtp \
19
+ --speculative-config.num_speculative_tokens 1 \
20
+ --tool-call-parser glm47 \
21
+ --reasoning-parser glm45 \
22
+ --enable-auto-tool-choice
23
+ ```
24
+
25
+ - SGLang
26
+
27
+ ```bash
28
+ # Multi-token prediction is supported
29
+ model_id=yujiepan/glm-4.7-flash-tiny-random
30
+ python3 -m sglang.launch_server --model-path $model_id --tp-size 2 \
31
+ --tool-call-parser glm47 \
32
+ --reasoning-parser glm45 \
33
+ --speculative-algorithm EAGLE \
34
+ --speculative-num-steps 3 \
35
+ --speculative-eagle-topk 1 \
36
+ --speculative-num-draft-tokens 4
37
+ ```
38
+
39
+ - Transformers
40
+
41
+ ```python
42
+ import torch
43
+ from transformers import AutoModelForCausalLM, AutoTokenizer
44
+
45
+ # Load model and tokenizer
46
+ model_id = "yujiepan/glm-4.7-flash-tiny-random"
47
+ messages = [{"role": "user", "content": "hello"}]
48
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
49
+ inputs = tokenizer.apply_chat_template(
50
+ messages,
51
+ tokenize=True,
52
+ add_generation_prompt=True,
53
+ return_dict=True,
54
+ return_tensors="pt",
55
+ )
56
+ model = AutoModelForCausalLM.from_pretrained(
57
+ pretrained_model_name_or_path=model_id,
58
+ torch_dtype=torch.bfloat16,
59
+ device_map="cuda",
60
+ )
61
+ inputs = inputs.to(model.device)
62
+ generated_ids = model.generate(
63
+ **inputs, max_new_tokens=32, do_sample=False)
64
+ output_text = tokenizer.decode(
65
+ generated_ids[0][inputs.input_ids.shape[1]:])
66
+ print(output_text)
67
+ ```
68
+
69
+ ### Code used to create this repo:
70
+
71
+ ```python
72
+ import json
73
+ from copy import deepcopy
74
+ from pathlib import Path
75
+
76
+ import accelerate
77
+ import torch
78
+ import torch.nn as nn
79
+ from huggingface_hub import file_exists, hf_hub_download
80
+ from transformers import (
81
+ AutoConfig,
82
+ AutoModelForCausalLM,
83
+ AutoProcessor,
84
+ GenerationConfig,
85
+ set_seed,
86
+ )
87
+
88
+ source_model_id = "zai-org/GLM-4.7-Flash"
89
+ save_folder = "/tmp/yujiepan/glm-4.7-flash-tiny-random"
90
+
91
+ processor = AutoProcessor.from_pretrained(
92
+ source_model_id, trust_remote_code=True)
93
+ processor.save_pretrained(save_folder)
94
+
95
+ with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f:
96
+ config_json = json.load(f)
97
+ config_json.update({
98
+ 'kv_lora_rank': 384,
99
+ 'num_key_value_heads': 1,  # NOTE: overridden by the later 'num_key_value_heads': 4 entry in this same dict
100
+ 'q_lora_rank': 32,
101
+ 'qk_nope_head_dim': 64,
102
+ 'qk_rope_head_dim': 192,
103
+ 'v_head_dim': 64,
104
+ 'num_key_value_heads': 4,
105
+ 'num_attention_heads': 4,
106
+ })
107
+ config_json['hidden_size'] = 8
108
+ config_json['intermediate_size'] = 32
109
+ config_json['moe_intermediate_size'] = 32
110
+ config_json['num_hidden_layers'] = 2
111
+ config_json['tie_word_embeddings'] = False
112
+ config_json['use_cache'] = True
113
+ with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
114
+ json.dump(config_json, f, indent=2)
115
+
116
+ config = AutoConfig.from_pretrained(
117
+ save_folder,
118
+ trust_remote_code=True,
119
+ )
120
+ print(config)
121
+ torch.set_default_dtype(torch.bfloat16)
122
+ model = AutoModelForCausalLM.from_config(config)
123
+ torch.set_default_dtype(torch.float32)
124
+ if file_exists(filename="generation_config.json", repo_id=source_model_id, repo_type='model'):
125
+ model.generation_config = GenerationConfig.from_pretrained(
126
+ source_model_id, trust_remote_code=True,
127
+ )
128
+ model.generation_config.do_sample = True
129
+ print(model.generation_config)
130
+ model = model.cpu()
131
+ set_seed(42)
132
+ with torch.no_grad():
133
+ for name, p in sorted(model.named_parameters()):
134
+ torch.nn.init.normal_(p, 0, 0.1)
135
+ print(name, p.shape)
136
+ # MTP
137
+ set_seed(42)
138
+ model.model.layers.append(nn.ModuleDict(dict(
139
+ embed_tokens=deepcopy(model.model.embed_tokens),
140
+ shared_head=nn.ModuleDict(dict(
141
+ norm=nn.RMSNorm(config.hidden_size),
142
+ head=deepcopy(model.model.embed_tokens),
143
+ )),
144
+ eh_proj=nn.Linear(config.hidden_size * 2,
145
+ config.hidden_size, bias=False),
146
+ enorm=nn.RMSNorm(config.hidden_size),
147
+ hnorm=nn.RMSNorm(config.hidden_size),
148
+ input_layernorm=nn.RMSNorm(config.hidden_size),
149
+ post_attention_layernorm=nn.RMSNorm(config.hidden_size),
150
+ self_attn=deepcopy(model.model.layers[1].self_attn),
151
+ mlp=deepcopy(model.model.layers[1].mlp),
152
+ )))
153
+ for i in range(1, len(model.model.layers)):
154
+ model.model.layers[i].mlp.gate.e_score_correction_bias = torch.rand_like(
155
+ model.model.layers[i].mlp.gate.e_score_correction_bias).float()
156
+ model.save_pretrained(save_folder)
157
+ print(model)
158
+ ```
159
+
160
+ ### Printing the model:
161
+
162
+ ```text
163
+ Glm4MoeLiteForCausalLM(
164
+ (model): Glm4MoeLiteModel(
165
+ (embed_tokens): Embedding(154880, 8, padding_idx=154820)
166
+ (layers): ModuleList(
167
+ (0): Glm4MoeLiteDecoderLayer(
168
+ (self_attn): Glm4MoeLiteAttention(
169
+ (q_a_proj): Linear(in_features=8, out_features=32, bias=False)
170
+ (q_a_layernorm): Glm4MoeLiteRMSNorm((32,), eps=1e-06)
171
+ (q_b_proj): Linear(in_features=32, out_features=1024, bias=False)
172
+ (kv_a_proj_with_mqa): Linear(in_features=8, out_features=576, bias=False)
173
+ (kv_a_layernorm): Glm4MoeLiteRMSNorm((384,), eps=1e-06)
174
+ (kv_b_proj): Linear(in_features=384, out_features=512, bias=False)
175
+ (o_proj): Linear(in_features=256, out_features=8, bias=False)
176
+ )
177
+ (mlp): Glm4MoeLiteMLP(
178
+ (gate_proj): Linear(in_features=8, out_features=32, bias=False)
179
+ (up_proj): Linear(in_features=8, out_features=32, bias=False)
180
+ (down_proj): Linear(in_features=32, out_features=8, bias=False)
181
+ (act_fn): SiLUActivation()
182
+ )
183
+ (input_layernorm): Glm4MoeLiteRMSNorm((8,), eps=1e-05)
184
+ (post_attention_layernorm): Glm4MoeLiteRMSNorm((8,), eps=1e-05)
185
+ )
186
+ (1): Glm4MoeLiteDecoderLayer(
187
+ (self_attn): Glm4MoeLiteAttention(
188
+ (q_a_proj): Linear(in_features=8, out_features=32, bias=False)
189
+ (q_a_layernorm): Glm4MoeLiteRMSNorm((32,), eps=1e-06)
190
+ (q_b_proj): Linear(in_features=32, out_features=1024, bias=False)
191
+ (kv_a_proj_with_mqa): Linear(in_features=8, out_features=576, bias=False)
192
+ (kv_a_layernorm): Glm4MoeLiteRMSNorm((384,), eps=1e-06)
193
+ (kv_b_proj): Linear(in_features=384, out_features=512, bias=False)
194
+ (o_proj): Linear(in_features=256, out_features=8, bias=False)
195
+ )
196
+ (mlp): Glm4MoeLiteMoE(
197
+ (experts): Glm4MoeLiteNaiveMoe(
198
+ (act_fn): SiLUActivation()
199
+ )
200
+ (gate): Glm4MoeLiteTopkRouter()
201
+ (shared_experts): Glm4MoeLiteMLP(
202
+ (gate_proj): Linear(in_features=8, out_features=32, bias=False)
203
+ (up_proj): Linear(in_features=8, out_features=32, bias=False)
204
+ (down_proj): Linear(in_features=32, out_features=8, bias=False)
205
+ (act_fn): SiLUActivation()
206
+ )
207
+ )
208
+ (input_layernorm): Glm4MoeLiteRMSNorm((8,), eps=1e-05)
209
+ (post_attention_layernorm): Glm4MoeLiteRMSNorm((8,), eps=1e-05)
210
+ )
211
+ (2): ModuleDict(
212
+ (embed_tokens): Embedding(154880, 8, padding_idx=154820)
213
+ (shared_head): ModuleDict(
214
+ (norm): RMSNorm((8,), eps=None, elementwise_affine=True)
215
+ (head): Embedding(154880, 8, padding_idx=154820)
216
+ )
217
+ (eh_proj): Linear(in_features=16, out_features=8, bias=False)
218
+ (enorm): RMSNorm((8,), eps=None, elementwise_affine=True)
219
+ (hnorm): RMSNorm((8,), eps=None, elementwise_affine=True)
220
+ (input_layernorm): RMSNorm((8,), eps=None, elementwise_affine=True)
221
+ (post_attention_layernorm): RMSNorm((8,), eps=None, elementwise_affine=True)
222
+ (self_attn): Glm4MoeLiteAttention(
223
+ (q_a_proj): Linear(in_features=8, out_features=32, bias=False)
224
+ (q_a_layernorm): Glm4MoeLiteRMSNorm((32,), eps=1e-06)
225
+ (q_b_proj): Linear(in_features=32, out_features=1024, bias=False)
226
+ (kv_a_proj_with_mqa): Linear(in_features=8, out_features=576, bias=False)
227
+ (kv_a_layernorm): Glm4MoeLiteRMSNorm((384,), eps=1e-06)
228
+ (kv_b_proj): Linear(in_features=384, out_features=512, bias=False)
229
+ (o_proj): Linear(in_features=256, out_features=8, bias=False)
230
+ )
231
+ (mlp): Glm4MoeLiteMoE(
232
+ (experts): Glm4MoeLiteNaiveMoe(
233
+ (act_fn): SiLUActivation()
234
+ )
235
+ (gate): Glm4MoeLiteTopkRouter()
236
+ (shared_experts): Glm4MoeLiteMLP(
237
+ (gate_proj): Linear(in_features=8, out_features=32, bias=False)
238
+ (up_proj): Linear(in_features=8, out_features=32, bias=False)
239
+ (down_proj): Linear(in_features=32, out_features=8, bias=False)
240
+ (act_fn): SiLUActivation()
241
+ )
242
+ )
243
+ )
244
+ )
245
+ (norm): Glm4MoeLiteRMSNorm((8,), eps=1e-05)
246
+ (rotary_emb): Glm4MoeLiteRotaryEmbedding()
247
+ )
248
+ (lm_head): Linear(in_features=8, out_features=154880, bias=False)
249
+ )
250
+ ```
chat_template.jinja ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [gMASK]<sop>
2
+ {%- if tools -%}
3
+ <|system|>
4
+ # Tools
5
+
6
+ You may call one or more functions to assist with the user query.
7
+
8
+ You are provided with function signatures within <tools></tools> XML tags:
9
+ <tools>
10
+ {% for tool in tools %}
11
+ {{ tool | tojson(ensure_ascii=False) }}
12
+ {% endfor %}
13
+ </tools>
14
+
15
+ For each function call, output the function name and arguments within the following XML format:
16
+ <tool_call>{function-name}<arg_key>{arg-key-1}</arg_key><arg_value>{arg-value-1}</arg_value><arg_key>{arg-key-2}</arg_key><arg_value>{arg-value-2}</arg_value>...</tool_call>{%- endif -%}
17
+ {%- macro visible_text(content) -%}
18
+ {%- if content is string -%}
19
+ {{- content }}
20
+ {%- elif content is iterable and content is not mapping -%}
21
+ {%- for item in content -%}
22
+ {%- if item is mapping and item.type == 'text' -%}
23
+ {{- item.text }}
24
+ {%- elif item is string -%}
25
+ {{- item }}
26
+ {%- endif -%}
27
+ {%- endfor -%}
28
+ {%- else -%}
29
+ {{- content }}
30
+ {%- endif -%}
31
+ {%- endmacro -%}
32
+ {%- set ns = namespace(last_user_index=-1) %}
33
+ {%- for m in messages %}
34
+ {%- if m.role == 'user' %}
35
+ {% set ns.last_user_index = loop.index0 -%}
36
+ {%- endif %}
37
+ {%- endfor %}
38
+ {% for m in messages %}
39
+ {%- if m.role == 'user' -%}<|user|>{{ visible_text(m.content) }}
40
+ {%- elif m.role == 'assistant' -%}
41
+ <|assistant|>
42
+ {%- set reasoning_content = '' %}
43
+ {%- set content = visible_text(m.content) %}
44
+ {%- if m.reasoning_content is string %}
45
+ {%- set reasoning_content = m.reasoning_content %}
46
+ {%- else %}
47
+ {%- if '</think>' in content %}
48
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
49
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
50
+ {%- endif %}
51
+ {%- endif %}
52
+ {%- if ((clear_thinking is defined and not clear_thinking) or loop.index0 > ns.last_user_index) and reasoning_content -%}
53
+ {{ '<think>' + reasoning_content.strip() + '</think>'}}
54
+ {%- else -%}
55
+ {{ '</think>' }}
56
+ {%- endif -%}
57
+ {%- if content.strip() -%}
58
+ {{ content.strip() }}
59
+ {%- endif -%}
60
+ {% if m.tool_calls %}
61
+ {% for tc in m.tool_calls %}
62
+ {%- if tc.function %}
63
+ {%- set tc = tc.function %}
64
+ {%- endif %}
65
+ {{- '<tool_call>' + tc.name -}}
66
+ {% set _args = tc.arguments %}{% for k, v in _args.items() %}<arg_key>{{ k }}</arg_key><arg_value>{{ v | tojson(ensure_ascii=False) if v is not string else v }}</arg_value>{% endfor %}</tool_call>{% endfor %}
67
+ {% endif %}
68
+ {%- elif m.role == 'tool' -%}
69
+ {%- if m.content is string -%}
70
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
71
+ {{- '<|observation|>' }}
72
+ {%- endif %}
73
+ {{- '<tool_response>' }}
74
+ {{- m.content }}
75
+ {{- '</tool_response>' }}
76
+ {%- else -%}
77
+ <|observation|>{% for tr in m.content %}
78
+ <tool_response>{{ tr.output if tr.output is defined else tr }}</tool_response>{% endfor -%}
79
+ {% endif -%}
80
+ {%- elif m.role == 'system' -%}
81
+ <|system|>{{ visible_text(m.content) }}
82
+ {%- endif -%}
83
+ {%- endfor -%}
84
+ {%- if add_generation_prompt -%}
85
+ <|assistant|>{{- '</think>' if (enable_thinking is defined and not enable_thinking) else '<think>' -}}
86
+ {%- endif -%}
config.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Glm4MoeLiteForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 0,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": [
10
+ 154820,
11
+ 154827,
12
+ 154829
13
+ ],
14
+ "first_k_dense_replace": 1,
15
+ "head_dim": 192,
16
+ "hidden_act": "silu",
17
+ "hidden_size": 8,
18
+ "initializer_range": 0.02,
19
+ "intermediate_size": 32,
20
+ "kv_lora_rank": 384,
21
+ "max_position_embeddings": 202752,
22
+ "mlp_layer_types": [
23
+ "dense",
24
+ "sparse"
25
+ ],
26
+ "model_type": "glm4_moe_lite",
27
+ "moe_intermediate_size": 32,
28
+ "n_group": 1,
29
+ "n_routed_experts": 64,
30
+ "n_shared_experts": 1,
31
+ "norm_topk_prob": true,
32
+ "num_attention_heads": 4,
33
+ "num_experts_per_tok": 4,
34
+ "num_hidden_layers": 2,
35
+ "num_key_value_heads": 4,
36
+ "num_nextn_predict_layers": 1,
37
+ "pad_token_id": 154820,
38
+ "partial_rotary_factor": 1.0,
39
+ "pretraining_tp": 1,
40
+ "q_lora_rank": 32,
41
+ "qk_head_dim": 256,
42
+ "qk_nope_head_dim": 64,
43
+ "qk_rope_head_dim": 192,
44
+ "rms_norm_eps": 1e-05,
45
+ "rope_interleave": true,
46
+ "rope_parameters": {
47
+ "partial_rotary_factor": 1.0,
48
+ "rope_theta": 1000000,
49
+ "rope_type": "default"
50
+ },
51
+ "routed_scaling_factor": 1.8,
52
+ "tie_word_embeddings": false,
53
+ "topk_group": 1,
54
+ "topk_method": "noaux_tc",
55
+ "transformers_version": "5.0.0.dev0",
56
+ "use_cache": true,
57
+ "v_head_dim": 64,
58
+ "vocab_size": 154880
59
+ }
generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 154820,
6
+ 154827,
7
+ 154829
8
+ ],
9
+ "pad_token_id": 154820,
10
+ "temperature": 1.0,
11
+ "transformers_version": "5.0.0.dev0"
12
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd035174c24e5786d4d7172564bd239feb38916442a6c30dc52762e6fed7aa4d
3
+ size 11585784
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19e773648cb4e65de8660ea6365e10acca112d42a854923df93db4a6f333a82d
3
+ size 20217442
tokenizer_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "clean_up_tokenization_spaces": false,
4
+ "do_lower_case": false,
5
+ "eos_token": "<|endoftext|>",
6
+ "extra_special_tokens": [
7
+ "<|endoftext|>",
8
+ "[MASK]",
9
+ "[gMASK]",
10
+ "[sMASK]",
11
+ "<sop>",
12
+ "<eop>",
13
+ "<|system|>",
14
+ "<|user|>",
15
+ "<|assistant|>",
16
+ "<|observation|>",
17
+ "<|begin_of_image|>",
18
+ "<|end_of_image|>",
19
+ "<|begin_of_video|>",
20
+ "<|end_of_video|>",
21
+ "<|begin_of_audio|>",
22
+ "<|end_of_audio|>",
23
+ "<|begin_of_transcription|>",
24
+ "<|end_of_transcription|>"
25
+ ],
26
+ "is_local": false,
27
+ "model_max_length": 128000,
28
+ "model_specific_special_tokens": {},
29
+ "pad_token": "<|endoftext|>",
30
+ "padding_side": "left",
31
+ "remove_space": false,
32
+ "tokenizer_class": "TokenizersBackend"
33
+ }