samsja committed on
Commit 8fc6463 · verified · 1 Parent(s): 33bdebb

Upload folder using huggingface_hub

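The commit message matches the default used by `huggingface_hub`'s folder-upload helper. As a minimal sketch of that kind of upload (the local path and repo id below are assumptions, not details recorded in this commit):

```python
from huggingface_hub import HfApi

# Sketch of a folder upload with huggingface_hub (assumed workflow; the local
# path and repo id are hypothetical, not read from this commit).
api = HfApi()
api.upload_folder(
    folder_path="./glm4-moe-tiny",           # local checkpoint directory (hypothetical)
    repo_id="PrimeIntellect/glm4-moe-tiny",  # assumed repo id for this model
    repo_type="model",
    commit_message="Upload folder using huggingface_hub",
)
```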
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,41 @@
+ ---
+ license: apache-2.0
+ tags:
+ - prime-rl
+ - moe
+ - test-model
+ library_name: transformers
+ ---
+
+ <div align="center">
+ <img src="https://cdn-avatars.huggingface.co/v1/production/uploads/61e020e4a343274bb132e138/H2mcdPRWtl4iKLd-OYYBc.jpeg" width="200"/>
+ </div>
+
+ # glm4-moe-tiny
+
+ A small (~543M parameter) GLM-4 MoE model for testing only. It is generally compatible with vLLM and HuggingFace Transformers but is meant to be used with [prime-rl](https://github.com/PrimeIntellect-ai/prime-rl).
+
+ Fine-tuned on [PrimeIntellect/Reverse-Text-SFT](https://huggingface.co/datasets/PrimeIntellect/Reverse-Text-SFT) to provide a non-trivial distribution for KL divergence during RL.
+
+ ## Quick Start
+
+ ```bash
+ uv run rl @ configs/ci/integration/rl_moe/glm4_moe.toml
+ ```
+
+ See the [Testing MoE at Small Scale](https://github.com/PrimeIntellect-ai/prime-rl/blob/main/docs/testing-moe-at-small-scale.md) guide for full instructions.
+
+ ## Model Details
+
+ | Parameter | Value |
+ |-----------|-------|
+ | Hidden size | 1024 |
+ | Layers | 24 |
+ | Experts | 8 |
+ | Active experts | 4 |
+ | Parameters | ~543M |
+
+ ## Links
+
+ - [prime-rl](https://github.com/PrimeIntellect-ai/prime-rl) - RL training framework
+ - [PrimeIntellect](https://www.primeintellect.ai/) - Building infrastructure for decentralized AI
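Since the README notes general compatibility with HuggingFace Transformers, a minimal loading sketch follows. It assumes the repo id `PrimeIntellect/glm4-moe-tiny` and a Transformers version that ships the `glm4_moe` architecture; the prompt is arbitrary.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "PrimeIntellect/glm4-moe-tiny"  # assumed repo id

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id)

# Build a chat-formatted prompt and generate a short completion.
input_ids = tokenizer.apply_chat_template(
    [{"role": "user", "content": "hello world"}],
    add_generation_prompt=True,
    return_tensors="pt",
)
output = model.generate(input_ids, max_new_tokens=32)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```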
STABLE ADDED
File without changes
chat_template.jinja ADDED
@@ -0,0 +1,41 @@
+ [gMASK]<sop>
+ {%- if tools -%}
+ <|system|>
+ # 可用工具
+ {% for tool in tools %}
+ {%- set function = tool.function if tool.get("function") else tool %}
+
+ ## {{ function.name }}
+
+ {{ function | tojson(indent=4, ensure_ascii=False) }}
+ 在调用上述函数时,请使用 Json 格式表示调用的参数。
+ {%- endfor %}
+ {%- endif -%}
+
+ {%- for msg in messages %}
+ {%- if msg.role == 'system' %}
+ <|system|>
+ {{ msg.content }}
+ {%- endif %}
+ {%- endfor %}
+
+ {%- for message in messages if message.role != 'system' %}
+ {%- set role = message['role'] %}
+ {%- set content = message['content'] %}
+ {%- set meta = message.get("metadata", "") %}
+
+ {%- if role == 'user' %}
+ <|user|>
+ {{ content }}
+ {%- elif role == 'assistant' and not meta %}
+ <|assistant|>
+ {{ content }}
+ {%- elif role == 'assistant' and meta %}
+ <|assistant|>{{ meta }}
+ {{ content }}
+ {%- elif role == 'observation' %}
+ <|observation|>
+ {{ content }}
+ {%- endif %}
+ {%- endfor %}
+ {% if add_generation_prompt %}<|assistant|>{% endif %}
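For readers who don't read Chinese: `# 可用工具` in the template is roughly "# Available tools", and the line after each tool's JSON schema says, roughly, "When calling the function above, express the call arguments in JSON format." A minimal sketch of rendering this template through the tokenizer (assumed repo id, illustrative messages):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("PrimeIntellect/glm4-moe-tiny")  # assumed repo id

messages = [
    {"role": "system", "content": "You reverse text."},
    {"role": "user", "content": "hello"},
]
# tokenize=False returns the rendered prompt string, which makes the
# [gMASK]<sop>, <|system|>, <|user|> and trailing <|assistant|> markers visible.
rendered = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(rendered)
```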
config.json ADDED
@@ -0,0 +1,46 @@
+ {
+   "architectures": [
+     "Glm4MoeForCausalLM"
+   ],
+   "attention_bias": true,
+   "attention_dropout": 0.0,
+   "dtype": "float32",
+   "eos_token_id": [
+     151329,
+     151336,
+     151338
+   ],
+   "first_k_dense_replace": 1,
+   "hidden_act": "silu",
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 2048,
+   "max_position_embeddings": 131072,
+   "model_type": "glm4_moe",
+   "moe_intermediate_size": 256,
+   "n_group": 1,
+   "n_routed_experts": 8,
+   "n_shared_experts": 1,
+   "norm_topk_prob": true,
+   "num_attention_heads": 16,
+   "num_experts_per_tok": 4,
+   "num_hidden_layers": 24,
+   "num_key_value_heads": 4,
+   "pad_token_id": 151329,
+   "partial_rotary_factor": 0.5,
+   "rms_norm_eps": 1e-05,
+   "rope_parameters": {
+     "partial_rotary_factor": 0.5,
+     "rope_theta": 1000000,
+     "rope_type": "default"
+   },
+   "rope_theta": 1000000,
+   "routed_scaling_factor": 1.0,
+   "tie_word_embeddings": false,
+   "topk_group": 1,
+   "transformers_version": "5.2.0.dev0",
+   "use_cache": false,
+   "use_grouped_mm": false,
+   "use_qk_norm": false,
+   "vocab_size": 151552
+ }
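As a back-of-the-envelope check on the ~543M figure in the README, the parameter count can be estimated from this config. The sketch below assumes the usual GLM-4 MoE layout (the first `first_k_dense_replace` layer uses the dense MLP, each MoE layer has the 8 routed experts plus 1 shared expert of `moe_intermediate_size`, untied LM head) and ignores small terms such as attention biases and norm weights, so treat it as an estimate rather than an exact count.

```python
# Rough parameter estimate from config.json; layout assumptions noted above.
hidden, vocab, layers = 1024, 151552, 24
kv_heads, head_dim = 4, 1024 // 16           # num_key_value_heads, hidden / num_attention_heads
dense_ffn, moe_ffn = 2048, 256               # intermediate_size, moe_intermediate_size
routed, shared, dense_layers = 8, 1, 1       # n_routed_experts, n_shared_experts, first_k_dense_replace

embeddings = vocab * hidden                  # input embeddings
lm_head = vocab * hidden                     # untied output head (tie_word_embeddings = false)

attention = (hidden * hidden                      # q_proj
             + 2 * hidden * kv_heads * head_dim   # k_proj + v_proj
             + hidden * hidden)                   # o_proj

dense_mlp = 3 * hidden * dense_ffn                                     # gate/up/down
moe_mlp = (routed + shared) * 3 * hidden * moe_ffn + hidden * routed   # experts + router

total = (embeddings + lm_head
         + layers * attention
         + dense_layers * dense_mlp
         + (layers - dense_layers) * moe_mlp)
print(f"~{total / 1e6:.0f}M parameters")  # ~543M, in line with the README
```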
generation_config.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "_from_model_config": true,
+   "eos_token_id": [
+     151329,
+     151336,
+     151338
+   ],
+   "output_attentions": false,
+   "output_hidden_states": false,
+   "pad_token_id": 151329,
+   "transformers_version": "5.2.0.dev0",
+   "use_cache": true
+ }
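The three `eos_token_id` entries above are plain integer ids. A small sketch (assumed repo id) for mapping them back to token strings with the bundled tokenizer:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("PrimeIntellect/glm4-moe-tiny")  # assumed repo id

# Map the configured stop/pad ids back to their token strings.
for token_id in (151329, 151336, 151338):
    print(token_id, tokenizer.convert_ids_to_tokens(token_id))
```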
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4e5f3d6bcba5415c9ab75c710c612f5e03e85e4b6f8c7ae33fa3fdf09cf8a5fc
+ size 1085409024
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:76ebeac0d8bd7879ead7b43c16b44981f277e47225de2bd7de9ae1a6cc664a8c
+ size 19966496
tokenizer_config.json ADDED
@@ -0,0 +1,16 @@
+ {
+   "backend": "tokenizers",
+   "clean_up_tokenization_spaces": false,
+   "do_lower_case": false,
+   "eos_token": "<|user|>",
+   "is_local": true,
+   "model_input_names": [
+     "input_ids",
+     "attention_mask"
+   ],
+   "model_max_length": 128000,
+   "pad_token": "<|user|>",
+   "padding_side": "left",
+   "remove_space": false,
+   "tokenizer_class": "TokenizersBackend"
+ }
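The tokenizer config above sets both `eos_token` and `pad_token` to `<|user|>` and pads on the left, which matters when batching prompts of unequal length for generation. A quick check (assumed repo id):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("PrimeIntellect/glm4-moe-tiny")  # assumed repo id

print(tokenizer.eos_token, tokenizer.pad_token, tokenizer.padding_side)

# Left padding with pad_token == "<|user|>" when batching prompts of unequal length.
batch = tokenizer(["hello", "a longer example prompt"], padding=True, return_tensors="pt")
print(batch["input_ids"].shape)
```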