phaedawg committed on
Commit
6ad90cc
·
verified ·
1 Parent(s): 13f33bc

Upload NVFP4 quantized Behemoth-R1-123B-v2

Browse files
README.md ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ base_model: TheDrummer/Behemoth-R1-123B-v2
4
+ tags:
5
+ - nvfp4
6
+ - modelopt
7
+ - quantized
8
+ - blackwell
9
+ - b200
10
+ library_name: transformers
11
+ ---
12
+
13
+ # Behemoth-R1-V2 ModelOpt NVFP4
14
+
15
+ NVFP4 quantized version of [TheDrummer/Behemoth-R1-123B-v2](https://huggingface.co/TheDrummer/Behemoth-R1-123B-v2) using NVIDIA Model Optimizer.
16
+
17
+ ## Quantization Details
18
+
19
+ | Property | Value |
20
+ |----------|-------|
21
+ | **Original Model** | TheDrummer/Behemoth-R1-123B-v2 |
22
+ | **Quantization** | NVFP4 (FP4 weights, FP4 activations, group size 16) |
23
+ | **Method** | NVIDIA ModelOpt PTQ |
24
+ | **Calibration Samples** | 512 |
25
+ | **Max Sequence Length** | 4096 |
26
+
27
+ ## Hardware Requirements
28
+
29
+ - **Optimal**: NVIDIA Blackwell GPUs (B100, B200, RTX PRO 6000 Blackwell)
30
+ - **Compatible**: Hopper/Ampere (will use weight-only mode)
31
+
32
+ ## Usage with vLLM
33
+
34
+ ```python
35
+ from vllm import LLM, SamplingParams
36
+
37
+ llm = LLM(
38
+ model="TheHouseOfTheDude/Behemoth-R1-V2_ModelOpt-NVFP4",
39
+ quantization="modelopt",
40
+ trust_remote_code=True,
41
+ )
42
+
43
+ sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=512)
44
+ outputs = llm.generate(["Write a story about..."], sampling_params)
45
+ print(outputs[0].outputs[0].text)
46
+ ```
47
+
48
+ ## Chat Template
49
+
50
+ Uses Mistral v7 (Non-Tekken) format. See the original model card for usage details.
51
+
52
+ ## Credits
53
+
54
+ - Original Model: [TheDrummer](https://huggingface.co/TheDrummer)
55
+ - Quantization: TheHouseOfTheDude
56
+ - Quantization Framework: NVIDIA ModelOpt
chat_template.jinja ADDED
@@ -0,0 +1 @@
 
 
1
+ {{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + '[/INST]' }}{% elif message['role'] == 'system' %}{{ '[SYSTEM_PROMPT] ' + message['content'] + '[/SYSTEM_PROMPT]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token }}{% else %}{{ raise_exception('Only user, system and assistant roles are supported!') }}{% endif %}{% endfor %}
config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "MistralForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": 1,
7
+ "dtype": "bfloat16",
8
+ "eos_token_id": 2,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 12288,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 28672,
14
+ "max_position_embeddings": 131072,
15
+ "model_type": "mistral",
16
+ "num_attention_heads": 96,
17
+ "num_hidden_layers": 88,
18
+ "num_key_value_heads": 8,
19
+ "rms_norm_eps": 1e-05,
20
+ "rope_theta": 1000000.0,
21
+ "sliding_window": null,
22
+ "tie_word_embeddings": false,
23
+ "transformers_version": "4.57.6",
24
+ "use_cache": true,
25
+ "vocab_size": 32768,
26
+ "quantization_config": {
27
+ "config_groups": {
28
+ "group_0": {
29
+ "input_activations": {
30
+ "dynamic": false,
31
+ "num_bits": 4,
32
+ "type": "float",
33
+ "group_size": 16
34
+ },
35
+ "weights": {
36
+ "dynamic": false,
37
+ "num_bits": 4,
38
+ "type": "float",
39
+ "group_size": 16
40
+ },
41
+ "targets": [
42
+ "Linear"
43
+ ]
44
+ }
45
+ },
46
+ "ignore": [
47
+ "lm_head"
48
+ ],
49
+ "quant_algo": "NVFP4",
50
+ "producer": {
51
+ "name": "modelopt",
52
+ "version": "0.41.0"
53
+ },
54
+ "quant_method": "modelopt"
55
+ }
56
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.57.6"
6
+ }
hf_quant_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "producer": {
3
+ "name": "modelopt",
4
+ "version": "0.41.0"
5
+ },
6
+ "quantization": {
7
+ "quant_algo": "NVFP4",
8
+ "kv_cache_quant_algo": null,
9
+ "group_size": 16,
10
+ "exclude_modules": [
11
+ "lm_head"
12
+ ]
13
+ }
14
+ }
model-00001-of-00015.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d3a51efb15092f6412b66fd307880c896b646985a1489c05c5348ae6ce98102
3
+ size 4882434096
model-00002-of-00015.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:020ae3cc89ab5f131e93e207326d407f09a273a2d9f4bb320fb7232485c3e8be
3
+ size 4869902096
model-00003-of-00015.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42bc43c141a7c229a7be3af07ec16fb27c90c151f1e17df50129314d09e3255f
3
+ size 4869902232
model-00004-of-00015.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f96e7e22376c32f5305bdc3d7b992160ae2dd1324c823ed32c0d5172d261d405
3
+ size 4969043384
model-00005-of-00015.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:318e86dd68b60b5810da52f5f69d03e524b9a27dfa8238adc0d7f90ae5ad3813
3
+ size 4954837344
model-00006-of-00015.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82a68390d6c39e355eaafe8bcfe8d0c31344d1b22370ff907b6af5342a36c3b7
3
+ size 4869902232
model-00007-of-00015.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:327ee4c4cb3b3ee8beb6b1176c964f205aa20a3e179fe13281f48ccec32affb6
3
+ size 4969043384
model-00008-of-00015.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0a3d712ce2dd3c8915b14dccaa1b1f4013b0ef0a7c68bd23d24f53949c74d62
3
+ size 4954837344
model-00009-of-00015.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2087aeb96c0cf23ec466aa4911ce5ff03caae8ba1a5cddefb03da391d3b54bc0
3
+ size 4869902232
model-00010-of-00015.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:839ab031b14b27218311a15fd032aea0585bb5a6c6ee03e187944d2a550a4fa7
3
+ size 4969043384
model-00011-of-00015.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44b34d3d28ec84aa2efd527f601bdc62abb358133f7caa095570462e021bbc6f
3
+ size 4954837344
model-00012-of-00015.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:858adf31a2042971d09d3b1e3fca275d4fe346a1c8168db133df9ce6e20a9b93
3
+ size 4869902232
model-00013-of-00015.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff3a9e6c820c60432dfad94de70a5e7b92b12563ef448acf9e415ce906d59e27
3
+ size 4969043384
model-00014-of-00015.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b0139ab9cc4e85f5d5da4adbffad444cdb67bfd2927bc1ea71d129d8cbb67379
3
+ size 4954837344
model-00015-of-00015.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51445b6bf7b940e6c9134f28fef62fd02af1e4435f7d8a86c74aa91a92538bd7
3
+ size 1201743136
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "</s>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff