Commit 31e8e0e (verified) · Parent(s): 4d6385b
dmayboroda committed: Upload folder using huggingface_hub

Files changed (50 shown; the commit contains more changes than this view displays):
  1. .gitattributes +2 -0
  2. README.md +337 -0
  3. chat_template.jinja +86 -0
  4. config.json +78 -0
  5. generation_config.json +10 -0
  6. hf_quant_config.json +14 -0
  7. model-00001-of-00041.safetensors +3 -0
  8. model-00002-of-00041.safetensors +3 -0
  9. model-00003-of-00041.safetensors +3 -0
  10. model-00004-of-00041.safetensors +3 -0
  11. model-00005-of-00041.safetensors +3 -0
  12. model-00006-of-00041.safetensors +3 -0
  13. model-00007-of-00041.safetensors +3 -0
  14. model-00008-of-00041.safetensors +3 -0
  15. model-00009-of-00041.safetensors +3 -0
  16. model-00010-of-00041.safetensors +3 -0
  17. model-00011-of-00041.safetensors +3 -0
  18. model-00012-of-00041.safetensors +3 -0
  19. model-00013-of-00041.safetensors +3 -0
  20. model-00014-of-00041.safetensors +3 -0
  21. model-00015-of-00041.safetensors +3 -0
  22. model-00016-of-00041.safetensors +3 -0
  23. model-00017-of-00041.safetensors +3 -0
  24. model-00018-of-00041.safetensors +3 -0
  25. model-00019-of-00041.safetensors +3 -0
  26. model-00020-of-00041.safetensors +3 -0
  27. model-00021-of-00041.safetensors +3 -0
  28. model-00022-of-00041.safetensors +3 -0
  29. model-00023-of-00041.safetensors +3 -0
  30. model-00024-of-00041.safetensors +3 -0
  31. model-00025-of-00041.safetensors +3 -0
  32. model-00026-of-00041.safetensors +3 -0
  33. model-00027-of-00041.safetensors +3 -0
  34. model-00028-of-00041.safetensors +3 -0
  35. model-00029-of-00041.safetensors +3 -0
  36. model-00030-of-00041.safetensors +3 -0
  37. model-00031-of-00041.safetensors +3 -0
  38. model-00032-of-00041.safetensors +3 -0
  39. model-00033-of-00041.safetensors +3 -0
  40. model-00034-of-00041.safetensors +3 -0
  41. model-00035-of-00041.safetensors +3 -0
  42. model-00036-of-00041.safetensors +3 -0
  43. model-00037-of-00041.safetensors +3 -0
  44. model-00038-of-00041.safetensors +3 -0
  45. model-00039-of-00041.safetensors +3 -0
  46. model-00040-of-00041.safetensors +3 -0
  47. model-00041-of-00041.safetensors +3 -0
  48. model.safetensors.index.json +3 -0
  49. special_tokens_map.json +34 -0
  50. tokenizer.json +3 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+model.safetensors.index.json filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,337 @@
---
datasets:
- abisee/cnn_dailymail
- nvidia/Nemotron-Post-Training-Dataset-v2
base_model:
- zai-org/GLM-4.7
base_model_relation: quantized
license: mit
pipeline_tag: text-generation
---
# GLM-4.7-NVFP4

**Format:** NVFP4 — optimal partial quantization of weights and activations.
**Base model:** `zai-org/GLM-4.7`
**How it was made:** [AutoQuantized](https://nvidia.github.io/Model-Optimizer/guides/_pytorch_quantization.html#optimal-partial-quantization-using-auto-quantize) with [NVIDIA Model-Optimizer](https://github.com/NVIDIA/Model-Optimizer/) (NVFP4), using the default calibration mix ([cnn_dailymail](https://huggingface.co/datasets/abisee/cnn_dailymail) and [Nemotron-Post-Training-Dataset-v2](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v2)).

See the [original model card](https://huggingface.co/zai-org/GLM-4.7) for information about the base model.
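NVFP4 stores each value in a 4-bit floating-point format (E2M1, representable magnitudes 0, 0.5, 1, 1.5, 2, 3, 4, 6) with one shared scale per group of 16 values, matching `"group_size": 16` in `config.json`. A toy pure-Python sketch of that group-wise scheme (illustrative only; the real format packs values in hardware and stores the per-group scales themselves in FP8-E4M3):

```python
# Toy NVFP4-style group quantization: shared per-group scale + FP4 (E2M1) codes.
E2M1 = [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0]  # representable FP4 magnitudes

def quantize_group(values):
    """Quantize one group of floats to FP4 with a shared scale, then dequantize."""
    scale = max(abs(v) for v in values) / 6.0 or 1.0  # map the largest magnitude to 6.0
    dequantized = []
    for v in values:
        magnitude = min(E2M1, key=lambda code: abs(abs(v) / scale - code))
        dequantized.append(magnitude * scale * (1.0 if v >= 0 else -1.0))
    return dequantized, scale

group = [0.11 * i - 0.8 for i in range(16)]  # one 16-value group
dequantized, scale = quantize_group(group)
max_err = max(abs(a - b) for a, b in zip(group, dequantized))
print(f"scale={scale:.4f} max reconstruction error={max_err:.4f}")
```

The shared scale keeps the relative error bounded per group even though each value carries only 4 bits.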

---

### **MMLU Benchmark Results: Salyut1/GLM-4.7-NVFP4**

#### **Summary Table**
| Groups | Version | Metric | Value | Stderr |
| --- | --- | --- | --- | --- |
| **MMLU (Total)** | 2 | acc ↑ | **0.8348** | ± 0.0030 |
| **Social Sciences** | 2 | acc ↑ | **0.9051** | ± 0.0052 |
| **Other** | 2 | acc ↑ | **0.8684** | ± 0.0058 |
| **STEM** | 2 | acc ↑ | **0.8351** | ± 0.0064 |
| **Humanities** | 2 | acc ↑ | **0.7664** | ± 0.0059 |

#### **STEM**
| Tasks | n-shot | Metric | Value | Stderr |
| --- | --- | --- | --- | --- |
| High School Biology | 0 | acc ↑ | 0.9516 | ± 0.0122 |
| College Biology | 0 | acc ↑ | 0.9514 | ± 0.0180 |
| Astronomy | 0 | acc ↑ | 0.9474 | ± 0.0182 |
| High School Computer Science | 0 | acc ↑ | 0.9300 | ± 0.0256 |
| Conceptual Physics | 0 | acc ↑ | 0.9064 | ± 0.0190 |
| Elementary Mathematics | 0 | acc ↑ | 0.8862 | ± 0.0164 |
| Electrical Engineering | 0 | acc ↑ | 0.8690 | ± 0.0281 |
| High School Statistics | 0 | acc ↑ | 0.8565 | ± 0.0239 |
| College Computer Science | 0 | acc ↑ | 0.8400 | ± 0.0368 |
| Anatomy | 0 | acc ↑ | 0.8296 | ± 0.0325 |
| High School Physics | 0 | acc ↑ | 0.7947 | ± 0.0330 |
| High School Chemistry | 0 | acc ↑ | 0.7882 | ± 0.0287 |
| Machine Learning | 0 | acc ↑ | 0.7679 | ± 0.0401 |
| College Physics | 0 | acc ↑ | 0.7647 | ± 0.0422 |
| Abstract Algebra | 0 | acc ↑ | 0.6800 | ± 0.0469 |
| College Chemistry | 0 | acc ↑ | 0.6800 | ± 0.0469 |
| College Mathematics | 0 | acc ↑ | 0.6800 | ± 0.0469 |
| High School Mathematics | 0 | acc ↑ | 0.6481 | ± 0.0291 |

#### **Social Sciences**
| Tasks | n-shot | Metric | Value | Stderr |
| --- | --- | --- | --- | --- |
| High School Government/Politics | 0 | acc ↑ | 0.9793 | ± 0.0103 |
| High School Microeconomics | 0 | acc ↑ | 0.9706 | ± 0.0110 |
| High School Psychology | 0 | acc ↑ | 0.9523 | ± 0.0091 |
| Human Sexuality | 0 | acc ↑ | 0.9313 | ± 0.0222 |
| Sociology | 0 | acc ↑ | 0.9204 | ± 0.0191 |
| High School Geography | 0 | acc ↑ | 0.9192 | ± 0.0194 |
| High School Macroeconomics | 0 | acc ↑ | 0.9000 | ± 0.0152 |
| US Foreign Policy | 0 | acc ↑ | 0.9000 | ± 0.0302 |
| Professional Psychology | 0 | acc ↑ | 0.8725 | ± 0.0135 |
| Security Studies | 0 | acc ↑ | 0.8653 | ± 0.0219 |
| Public Relations | 0 | acc ↑ | 0.7636 | ± 0.0407 |
| Econometrics | 0 | acc ↑ | 0.7544 | ± 0.0405 |

#### **Humanities**
| Tasks | n-shot | Metric | Value | Stderr |
| --- | --- | --- | --- | --- |
| High School US History | 0 | acc ↑ | 0.9461 | ± 0.0159 |
| High School World History | 0 | acc ↑ | 0.9367 | ± 0.0158 |
| World Religions | 0 | acc ↑ | 0.9064 | ± 0.0223 |
| Prehistory | 0 | acc ↑ | 0.8981 | ± 0.0168 |
| International Law | 0 | acc ↑ | 0.8926 | ± 0.0283 |
| Jurisprudence | 0 | acc ↑ | 0.8889 | ± 0.0304 |
| Logical Fallacies | 0 | acc ↑ | 0.8834 | ± 0.0252 |
| High School European History | 0 | acc ↑ | 0.8788 | ± 0.0255 |
| Moral Disputes | 0 | acc ↑ | 0.8699 | ± 0.0181 |
| Philosophy | 0 | acc ↑ | 0.8617 | ± 0.0196 |
| Formal Logic | 0 | acc ↑ | 0.7460 | ± 0.0389 |
| Professional Law | 0 | acc ↑ | 0.6610 | ± 0.0121 |
| Moral Scenarios | 0 | acc ↑ | 0.6425 | ± 0.0160 |

#### **Other**
| Tasks | n-shot | Metric | Value | Stderr |
| --- | --- | --- | --- | --- |
| Medical Genetics | 0 | acc ↑ | 0.9800 | ± 0.0141 |
| Marketing | 0 | acc ↑ | 0.9530 | ± 0.0139 |
| Miscellaneous | 0 | acc ↑ | 0.9374 | ± 0.0087 |
| Professional Medicine | 0 | acc ↑ | 0.9301 | ± 0.0155 |
| Clinical Knowledge | 0 | acc ↑ | 0.9057 | ± 0.0180 |
| Nutrition | 0 | acc ↑ | 0.9052 | ± 0.0168 |
| Management | 0 | acc ↑ | 0.8932 | ± 0.0306 |
| Business Ethics | 0 | acc ↑ | 0.8600 | ± 0.0349 |
| Computer Security | 0 | acc ↑ | 0.8600 | ± 0.0349 |
| Human Aging | 0 | acc ↑ | 0.8161 | ± 0.0260 |
| College Medicine | 0 | acc ↑ | 0.7977 | ± 0.0306 |
| Professional Accounting | 0 | acc ↑ | 0.7624 | ± 0.0254 |
| Global Facts | 0 | acc ↑ | 0.6500 | ± 0.0479 |
| Virology | 0 | acc ↑ | 0.5723 | ± 0.0385 |
---

**sglang inference note:** the stock NVFP4 MoE weight-scale validation in sglang can reject this checkpoint, so relax the assertions first. Open:

```
vim /sgl-workspace/sglang/python/sglang/srt/layers/quantization/modelopt_quant.py
```

and change the code around line 1637 so that the two assertions are skipped:

```python
# Validate weight scales: relaxed here, skipping the shape-divisibility
# and FP8-E4M3 dtype checks that fail for this checkpoint.
assert_dim = 2 if layer.moe_runner_config.is_gated else 1
for name, weight_scale in [
    ("w13", layer.w13_weight_scale),
    ("w2", layer.w2_weight_scale),
]:
    pass
    # assert (
    #     weight_scale.shape[assert_dim] % 16 == 0
    # ), f"Expected {name}_weight_scale.dim({assert_dim}) to be divisible by 16"
    # assert (
    #     weight_scale.dtype == torch.float8_e4m3fn
    # ), f"{name} Weight Blockscale must be represented as FP8-E4M3"
```

Deploy GLM-4.7-NVFP4 on sglang:
```
python3 -m sglang.launch_server --model-path GLM-4.7-NVFP4/ --quantization modelopt_fp4 --tp 8 --attention-backend flashinfer
```

# perf

We deployed the model on 8× 5090 GPUs; the evalscope stress-test results below use 512 input tokens and 1024 output tokens per request.
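As a quick consistency check, the headline throughput figures in the concurrency-1 run follow directly from the token counts and the measured latency (the TPOT line uses one common definition, excluding the time to first token):

```python
# Recompute the concurrency-1 summary figures from its raw inputs.
latency_s = 25.7834       # "Time taken for tests (s)"
ttft_s = 0.7891           # "Average time to first token (s)"
input_tokens = 512
output_tokens = 1024

output_tps = output_tokens / latency_s                   # reported: 39.7154
total_tps = (input_tokens + output_tokens) / latency_s   # reported: 59.5731
tpot = (latency_s - ttft_s) / (output_tokens - 1)        # reported: 0.0244

print(f"{output_tps:.4f} {total_tps:.4f} {tpot:.4f}")
```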

#### Concurrency 1 (evalscope, 2025-12-26 07:12:02)

| Key | Value |
| --- | --- |
| Time taken for tests (s) | 25.7834 |
| Number of concurrency | 1 |
| Total requests | 1 |
| Succeed requests | 1 |
| Failed requests | 0 |
| Output token throughput (tok/s) | 39.7154 |
| Total token throughput (tok/s) | 59.5731 |
| Request throughput (req/s) | 0.0388 |
| Average latency (s) | 25.7834 |
| Average time to first token (s) | 0.7891 |
| Average time per output token (s) | 0.0244 |
| Average inter-token latency (s) | 0.0244 |
| Average input tokens per request | 512 |
| Average output tokens per request | 1024 |

Percentile results:

| Percentiles | TTFT (s) | ITL (s) | TPOT (s) | Latency (s) | Input tokens | Output tokens | Output (tok/s) | Total (tok/s) |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 10% | 0.7891 | 0.024 | 0.0244 | 25.7834 | 512 | 1024 | 39.7154 | 59.5731 |
| 25% | 0.7891 | 0.0241 | 0.0244 | 25.7834 | 512 | 1024 | 39.7154 | 59.5731 |
| 50% | 0.7891 | 0.0243 | 0.0244 | 25.7834 | 512 | 1024 | 39.7154 | 59.5731 |
| 66% | 0.7891 | 0.0244 | 0.0244 | 25.7834 | 512 | 1024 | 39.7154 | 59.5731 |
| 75% | 0.7891 | 0.0244 | 0.0244 | 25.7834 | 512 | 1024 | 39.7154 | 59.5731 |
| 80% | 0.7891 | 0.0246 | 0.0244 | 25.7834 | 512 | 1024 | 39.7154 | 59.5731 |
| 90% | 0.7891 | 0.025 | 0.0244 | 25.7834 | 512 | 1024 | 39.7154 | 59.5731 |
| 95% | 0.7891 | 0.0257 | 0.0244 | 25.7834 | 512 | 1024 | 39.7154 | 59.5731 |
| 98% | 0.7891 | 0.0267 | 0.0244 | 25.7834 | 512 | 1024 | 39.7154 | 59.5731 |
| 99% | 0.7891 | 0.0273 | 0.0244 | 25.7834 | 512 | 1024 | 39.7154 | 59.5731 |

#### Concurrency 8 (evalscope, 2025-12-26 07:14:21)

| Key | Value |
| --- | --- |
| Time taken for tests (s) | 36.4068 |
| Number of concurrency | 8 |
| Total requests | 8 |
| Succeed requests | 8 |
| Failed requests | 0 |
| Output token throughput (tok/s) | 225.013 |
| Total token throughput (tok/s) | 337.519 |
| Request throughput (req/s) | 0.2197 |
| Average latency (s) | 36.3904 |
| Average time to first token (s) | 2.4183 |
| Average time per output token (s) | 0.0332 |
| Average inter-token latency (s) | 0.0332 |
| Average input tokens per request | 512 |
| Average output tokens per request | 1024 |

Percentile results:

| Percentiles | TTFT (s) | ITL (s) | TPOT (s) | Latency (s) | Input tokens | Output tokens | Output (tok/s) | Total (tok/s) |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 10% | 1.4982 | 0.0301 | 0.0326 | 36.2968 | 512 | 1024 | 28.1277 | 42.1915 |
| 25% | 2.1396 | 0.0322 | 0.0326 | 36.403 | 512 | 1024 | 28.1287 | 42.1931 |
| 50% | 2.141 | 0.0327 | 0.0335 | 36.4039 | 512 | 1024 | 28.1291 | 42.1936 |
| 66% | 3.0959 | 0.0329 | 0.0335 | 36.4041 | 512 | 1024 | 28.1295 | 42.1943 |
| 75% | 3.0961 | 0.033 | 0.0335 | 36.4045 | 512 | 1024 | 28.1305 | 42.1958 |
| 80% | 3.0961 | 0.0331 | 0.0335 | 36.4045 | 512 | 1024 | 28.1305 | 42.1958 |
| 90% | 3.0962 | 0.0336 | 0.0341 | 36.4054 | 512 | 1024 | 28.2119 | 42.3178 |
| 95% | 3.0962 | 0.0342 | 0.0341 | 36.4054 | 512 | 1024 | 28.2119 | 42.3178 |
| 98% | 3.0962 | 0.0355 | 0.0341 | 36.4054 | 512 | 1024 | 28.2119 | 42.3178 |
| 99% | 3.0962 | 0.0363 | 0.0341 | 36.4054 | 512 | 1024 | 28.2119 | 42.3178 |

#### Concurrency 16 (evalscope, 2025-12-26 07:17:55)

| Key | Value |
| --- | --- |
| Time taken for tests (s) | 43.0028 |
| Number of concurrency | 16 |
| Total requests | 16 |
| Succeed requests | 16 |
| Failed requests | 0 |
| Output token throughput (tok/s) | 380.998 |
| Total token throughput (tok/s) | 571.498 |
| Request throughput (req/s) | 0.3721 |
| Average latency (s) | 42.8878 |
| Average time to first token (s) | 2.933 |
| Average time per output token (s) | 0.0391 |
| Average inter-token latency (s) | 0.039 |
| Average input tokens per request | 512 |
| Average output tokens per request | 1024 |

Percentile results:

| Percentiles | TTFT (s) | ITL (s) | TPOT (s) | Latency (s) | Input tokens | Output tokens | Output (tok/s) | Total (tok/s) |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 10% | 2.1016 | 0.0354 | 0.0384 | 42.8554 | 512 | 1024 | 23.8153 | 35.7229 |
| 25% | 2.104 | 0.0358 | 0.0384 | 42.8585 | 512 | 1024 | 23.8899 | 35.8348 |
| 50% | 3.0384 | 0.0371 | 0.0389 | 42.8599 | 512 | 1024 | 23.892 | 35.8381 |
| 66% | 3.5629 | 0.04 | 0.0391 | 42.8631 | 512 | 1024 | 23.8925 | 35.8387 |
| 75% | 3.5643 | 0.0407 | 0.0398 | 42.9931 | 512 | 1024 | 23.8927 | 35.8391 |
| 80% | 3.5643 | 0.041 | 0.0398 | 42.9931 | 512 | 1024 | 23.8927 | 35.8391 |
| 90% | 3.65 | 0.0417 | 0.0398 | 42.9976 | 512 | 1024 | 23.8943 | 35.8415 |
| 95% | 3.6512 | 0.0425 | 0.0408 | 42.9981 | 512 | 1024 | 23.9443 | 35.9165 |
| 98% | 3.6512 | 0.0435 | 0.0408 | 42.9981 | 512 | 1024 | 23.9443 | 35.9165 |
| 99% | 3.6512 | 0.0449 | 0.0408 | 42.9981 | 512 | 1024 | 23.9443 | 35.9165 |

#### Concurrency 32 (evalscope, 2025-12-26 07:20:36)

| Key | Value |
| --- | --- |
| Time taken for tests (s) | 51.9487 |
| Number of concurrency | 32 |
| Total requests | 32 |
| Succeed requests | 32 |
| Failed requests | 0 |
| Output token throughput (tok/s) | 630.776 |
| Total token throughput (tok/s) | 946.164 |
| Request throughput (req/s) | 0.616 |
| Average latency (s) | 51.9342 |
| Average time to first token (s) | 3.4479 |
| Average time per output token (s) | 0.0474 |
| Average inter-token latency (s) | 0.0474 |
| Average input tokens per request | 512 |
| Average output tokens per request | 1024 |

Percentile results:

| Percentiles | TTFT (s) | ITL (s) | TPOT (s) | Latency (s) | Input tokens | Output tokens | Output (tok/s) | Total (tok/s) |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 10% | 1.5371 | 0.0434 | 0.0458 | 51.9326 | 512 | 1024 | 19.714 | 29.5709 |
| 25% | 2.1556 | 0.0445 | 0.0463 | 51.9352 | 512 | 1024 | 19.7147 | 29.5721 |
| 50% | 3.386 | 0.0456 | 0.0475 | 51.9383 | 512 | 1024 | 19.7158 | 29.5737 |
| 66% | 4.6167 | 0.0464 | 0.0481 | 51.9401 | 512 | 1024 | 19.7168 | 29.5752 |
| 75% | 4.618 | 0.0469 | 0.0487 | 51.9412 | 512 | 1024 | 19.7172 | 29.5757 |
| 80% | 5.0425 | 0.0472 | 0.0487 | 51.9414 | 512 | 1024 | 19.7172 | 29.5758 |
| 90% | 5.0448 | 0.0482 | 0.0493 | 51.9429 | 512 | 1024 | 19.7179 | 29.5768 |
| 95% | 5.125 | 0.0491 | 0.0493 | 51.9448 | 512 | 1024 | 19.7193 | 29.5789 |
| 98% | 5.1261 | 0.0503 | 0.0498 | 51.9463 | 512 | 1024 | 19.7633 | 29.645 |
| 99% | 5.1261 | 0.0511 | 0.0498 | 51.9463 | 512 | 1024 | 19.7633 | 29.645 |
chat_template.jinja ADDED
@@ -0,0 +1,86 @@
[gMASK]<sop>
{%- if tools -%}
<|system|>
# Tools

You may call one or more functions to assist with the user query.

You are provided with function signatures within <tools></tools> XML tags:
<tools>
{% for tool in tools %}
{{ tool | tojson(ensure_ascii=False) }}
{% endfor %}
</tools>

For each function call, output the function name and arguments within the following XML format:
<tool_call>{function-name}<arg_key>{arg-key-1}</arg_key><arg_value>{arg-value-1}</arg_value><arg_key>{arg-key-2}</arg_key><arg_value>{arg-value-2}</arg_value>...</tool_call>{%- endif -%}
{%- macro visible_text(content) -%}
{%- if content is string -%}
{{- content }}
{%- elif content is iterable and content is not mapping -%}
{%- for item in content -%}
{%- if item is mapping and item.type == 'text' -%}
{{- item.text }}
{%- elif item is string -%}
{{- item }}
{%- endif -%}
{%- endfor -%}
{%- else -%}
{{- content }}
{%- endif -%}
{%- endmacro -%}
{%- set ns = namespace(last_user_index=-1) %}
{%- for m in messages %}
{%- if m.role == 'user' %}
{% set ns.last_user_index = loop.index0 -%}
{%- endif %}
{%- endfor %}
{% for m in messages %}
{%- if m.role == 'user' -%}<|user|>{{ visible_text(m.content) }}
{%- elif m.role == 'assistant' -%}
<|assistant|>
{%- set reasoning_content = '' %}
{%- set content = visible_text(m.content) %}
{%- if m.reasoning_content is string %}
{%- set reasoning_content = m.reasoning_content %}
{%- else %}
{%- if '</think>' in content %}
{%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
{%- set content = content.split('</think>')[-1].lstrip('\n') %}
{%- endif %}
{%- endif %}
{%- if ((clear_thinking is defined and not clear_thinking) or loop.index0 > ns.last_user_index) and reasoning_content -%}
{{ '<think>' + reasoning_content.strip() + '</think>'}}
{%- else -%}
{{ '</think>' }}
{%- endif -%}
{%- if content.strip() -%}
{{ content.strip() }}
{%- endif -%}
{% if m.tool_calls %}
{% for tc in m.tool_calls %}
{%- if tc.function %}
{%- set tc = tc.function %}
{%- endif %}
{{- '<tool_call>' + tc.name -}}
{% set _args = tc.arguments %}{% for k, v in _args.items() %}<arg_key>{{ k }}</arg_key><arg_value>{{ v | tojson(ensure_ascii=False) if v is not string else v }}</arg_value>{% endfor %}</tool_call>{% endfor %}
{% endif %}
{%- elif m.role == 'tool' -%}
{%- if m.content is string -%}
{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
{{- '<|observation|>' }}
{%- endif %}
{{- '<tool_response>' }}
{{- m.content }}
{{- '</tool_response>' }}
{%- else -%}
<|observation|>{% for tr in m.content %}
<tool_response>{{ tr.output if tr.output is defined else tr }}</tool_response>{% endfor -%}
{% endif -%}
{%- elif m.role == 'system' -%}
<|system|>{{ visible_text(m.content) }}
{%- endif -%}
{%- endfor -%}
{%- if add_generation_prompt -%}
<|assistant|>{{- '</think>' if (enable_thinking is defined and not enable_thinking) else '<think>' -}}
{%- endif -%}
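The tool-call wire format this template emits (function name followed by alternating `<arg_key>`/`<arg_value>` pairs, with non-string values JSON-encoded) can be reproduced in plain Python for testing a parser. A sketch; `get_weather` and its arguments are hypothetical, not part of this repo:

```python
import json

def format_tool_call(name, arguments):
    """Render a tool call in the <tool_call> XML format used by the chat template."""
    parts = [f"<tool_call>{name}"]
    for key, value in arguments.items():
        # Mirror the template: strings pass through, other values are JSON-encoded.
        rendered = value if isinstance(value, str) else json.dumps(value, ensure_ascii=False)
        parts.append(f"<arg_key>{key}</arg_key><arg_value>{rendered}</arg_value>")
    parts.append("</tool_call>")
    return "".join(parts)

call = format_tool_call("get_weather", {"city": "Berlin", "days": 3})
print(call)
```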
config.json ADDED
@@ -0,0 +1,78 @@
{
  "architectures": [
    "Glm4MoeForCausalLM"
  ],
  "attention_bias": true,
  "attention_dropout": 0.0,
  "dtype": "bfloat16",
  "eos_token_id": [
    151329,
    151336,
    151338
  ],
  "first_k_dense_replace": 3,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 5120,
  "initializer_range": 0.02,
  "intermediate_size": 12288,
  "max_position_embeddings": 202752,
  "model_type": "glm4_moe",
  "moe_intermediate_size": 1536,
  "n_group": 1,
  "n_routed_experts": 160,
  "n_shared_experts": 1,
  "norm_topk_prob": true,
  "num_attention_heads": 96,
  "num_experts_per_tok": 8,
  "num_hidden_layers": 92,
  "num_key_value_heads": 8,
  "num_nextn_predict_layers": 1,
  "pad_token_id": 151329,
  "partial_rotary_factor": 0.5,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "routed_scaling_factor": 2.5,
  "tie_word_embeddings": false,
  "topk_group": 1,
  "transformers_version": "4.57.3",
  "use_cache": true,
  "use_qk_norm": true,
  "vocab_size": 151552,
  "quantization_config": {
    "config_groups": {
      "group_0": {
        "input_activations": {
          "dynamic": false,
          "num_bits": 4,
          "type": "float",
          "group_size": 16
        },
        "weights": {
          "dynamic": false,
          "num_bits": 4,
          "type": "float",
          "group_size": 16
        },
        "targets": [
          "Linear"
        ]
      }
    },
    "ignore": [
      "lm_head"
    ],
    "quant_algo": "NVFP4",
    "kv_cache_scheme": {
      "dynamic": false,
      "num_bits": 8,
      "type": "float"
    },
    "producer": {
      "name": "modelopt",
      "version": "0.40.0"
    },
    "quant_method": "modelopt"
  }
}
generation_config.json ADDED
@@ -0,0 +1,10 @@
{
  "_from_model_config": true,
  "eos_token_id": [
    151329,
    151336,
    151338
  ],
  "pad_token_id": 151329,
  "transformers_version": "4.57.3"
}
hf_quant_config.json ADDED
@@ -0,0 +1,14 @@
{
  "producer": {
    "name": "modelopt",
    "version": "0.40.0"
  },
  "quantization": {
    "quant_algo": "NVFP4",
    "kv_cache_quant_algo": "FP8",
    "group_size": 16,
    "exclude_modules": [
      "lm_head"
    ]
  }
}
model-00001-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ecdd1683256b7b40ff3fc6706a8d64679c817954a3f28c6376dd44a8bf2bde02
size 4998646904
model-00002-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7332ecbba03b1d3e2ed5867cb3feec8512e001801cfffa0ef18c2c2ac31f39e4
size 4996766232
model-00003-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1378802395eb36a726535689b1cd969a3c32d678312444330b20f05b9c9aceb7
size 4996766592
model-00004-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1a6ad22ee2096c223dea8f29962a5f3c19955875d413f1b49b94a80cdaf38b44
size 4999926688
model-00005-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0681ab255b7fc4d7af2a8ceea54a07efa2e222ebdcde964755427f644cf5f987
size 4996770616
model-00006-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a4d1ecb6cda3dbb313048ad88809ddf0e293aead5e4989e2d0db55d870ed347e
size 4996770664
model-00007-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7d7c8710d3d2cc45b99cf9c42543cb3801f8a57b85936a78e0da10c12645c182
size 4996771072
model-00008-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cca67b7ac1b34fc58f01da2fa9b9c09011ec88a95e616e49c51e29999dac6df5
size 4999929272
model-00009-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:aa844a2f98c79aff09b261e9172ef92f371af0279a38fa7be27b6bf40b0627eb
size 4996770656
model-00010-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6f5c230500588c7909a88aa9228a0a8105fb259b221d8579e54b70808c3d61cb
size 4996770664
model-00011-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3f55c35142f91600ab2542c8ae431214f88bb3046e3e27f2d74bab287065e974
size 4996771120
model-00012-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:360574e4ea8b00ac0fa8f777613f40836eaa58ffed210f8ac10718d3dc132fdb
size 4999929320
model-00013-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a419a12dade67dc55170911f3467979b65d28263cad5ba7232c75ff12b77046e
size 4996770656
model-00014-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b1b4f7c7a533dcafbaf08015c34d3e91d3d878b4b64ae4e3fd2cfb257b01aa46
size 4996770664
model-00015-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a6aba2420e1f643385e045972f351563b54091b3c80c07bf8af17a634f30b5d4
size 4996771168
model-00016-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b54246ec30f920d4d26707bb1a3cd06a5d5a0d3ed44c4d0a6ad54aa008ccafe4
size 4999929368
model-00017-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fbf0decdc2d36fb57929248c2a8dbfab6c76e972f5aea41f2e2ebff7949d1f07
size 4996770656
model-00018-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fa552fa21947d91ae2988f25993251dfe3843d4b0b81e1e73707b3cd0f0023e0
size 4996770712
model-00019-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3ccc47a26daf8413c09ae781029d872fe796ea38fee0eb9f464d97e5dd81517f
size 4996771168
model-00020-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:81a5dd01c6f29e751162f11286b3202790e2a0167930b21e74d38e23ae0c0e5c
size 4999929392
model-00021-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:694a1a2dba2e2e4157df4f769f194c20f382e07956456645c0a6db8c9e83afd1
size 4996770656
model-00022-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c5e6184d83be6bac104c735aa6d3f5be2b1ba331dfecafc0d87d5e723d1e278f
+ size 4996770760
model-00023-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef2cf07d0f47987b3a52cb6a990bc66317244ae3b5c53b70a5d37fb0ed5bd48e
+ size 4996771168
model-00024-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:90bc0197e86b4ca3b711ff399cd33442f49d454e5f15c27f1609e150ea68f9cf
+ size 4999929344
model-00025-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ed1d4820cbd00513cf03d447300ffa541ea3cad8803b2e0d0a74d7b632b6f0e0
+ size 4996770656
model-00026-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:15067f644a73f9f35d5c6caad44c607b4595dffc0bf75fc62a3330450254440b
+ size 4996770808
model-00027-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:212e9ee30b2d5056b6d01232c4bf03b3a4df058d7dd0c7e54c79398090650e32
+ size 4996771168
model-00028-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aee3ea7ab93c63dd5be663478d0a068e344350c2cadc81548ae267ee7d86c172
+ size 4999929296
model-00029-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:32ceb322b8e6c17c533efa32fc17af4281f672366ed70df0a2c5c0770a0855c9
+ size 4996770656
model-00030-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3cdf400d6d4d6e4067c6f82fb373a19335142e1c7db4d450a8cfbdcdb821d70e
+ size 4996770856
model-00031-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eb64026cca4492ef0dda5bb71e61d26713a4f16cc0dfecd41940d77cbb16da20
+ size 4996771168
model-00032-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c800bbb840f146c39963c14d43331c4a995e22ef71c7eb6a33b1a574a19e3eff
+ size 4999929288
model-00033-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:00e1c555bf6d05fc4e471cd9b88e38deb15bf92f1341caf2c94fc7691b0df499
+ size 4996770656
model-00034-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:44fb917f390670df7db92c945f77564e814e2f7fc6d485065feb205cc0e26e19
+ size 4996770904
model-00035-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aab7c2ac0ff4079843d4e907ea05f132957454ccb696dbd6e9f232d00739a568
+ size 4971886168
model-00036-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b218302bd7a6ece6ea466a8dce72c4a5db1ace94a4a7e389d88e584ab9b38db7
+ size 4998268584
model-00037-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:737aaee5fa92e6d87ce44bf9502fdeeefe721cd2e01a7becec33b35fa9148f0f
+ size 4996770656
model-00038-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e380e90aa7e65118491fdbb7c005c9a27ca6bd02d48a52ae3d58f961ea02647e
+ size 4996770928
model-00039-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4cbc3b41ebbc6325b22581f09845068852b77f7834349c511cf50a259672c236
+ size 4986659240
model-00040-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:76edabce20e8d04d46e8dd3cb0e835b8e59540908624d76d9d6d2570645cb398
+ size 4389170168
model-00041-of-00041.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c5344ed0a545f40db7029547e1e473c2a306c1cc2084c5f4c97808f7705a74aa
+ size 1551892608
model.safetensors.index.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8c72a8cbe89ba70d649e76ed386882145d24f70cc04e4a8da5a3b5adeb128843
+ size 16609598
special_tokens_map.json ADDED
@@ -0,0 +1,34 @@
+ {
+ "additional_special_tokens": [
+ "<|endoftext|>",
+ "[MASK]",
+ "[gMASK]",
+ "[sMASK]",
+ "<sop>",
+ "<eop>",
+ "<|system|>",
+ "<|user|>",
+ "<|assistant|>",
+ "<|observation|>",
+ "<|begin_of_image|>",
+ "<|end_of_image|>",
+ "<|begin_of_video|>",
+ "<|end_of_video|>",
+ "<|begin_of_audio|>",
+ "<|end_of_audio|>",
+ "<|begin_of_transcription|>",
+ "<|end_of_transcription|>",
+ "<|code_prefix|>",
+ "<|code_middle|>",
+ "<|code_suffix|>",
+ "/nothink"
+ ],
+ "eos_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "<|endoftext|>"
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bda8e2146c3bb7b7e0fc96dcc4f0aeff041c6c27952e3ace0665663ebff346ba
+ size 19970700