Firworks commited on
Commit
0f2e3b4
·
verified ·
1 Parent(s): 155e3ef

Add NVFP4 quantized checkpoint

Browse files
README.md ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ datasets:
3
+ - Rombo-Org/Optimized_Reasoning
4
+ base_model:
5
+ - dphn/Dolphin-Xgen-RL
6
+ ---
7
+ # Dolphin-Xgen-RL-nvfp4
8
+
9
+ **Format:** NVFP4 — weights & activations quantized to FP4 with dual scaling.
10
+ **Base model:** `dphn/Dolphin-Xgen-RL`
11
+ **How it was made:** One-shot calibration with LLM Compressor (NVFP4 recipe), long-seq calibration with Rombo-Org/Optimized_Reasoning.
12
+
13
+ > Notes: Keep `lm_head` in high precision; calibrate on long, domain-relevant sequences.
14
+
15
+ Check the original model card for information about this model.
16
+
17
+ # Running the model with VLLM in Docker
18
+ ```sh
19
+ sudo docker run --runtime nvidia --gpus all -p 8000:8000 --ipc=host vllm/vllm-openai:nightly --model Firworks/Dolphin-Xgen-RL-nvfp4 --dtype auto --max-model-len 32768
20
+ ```
21
+ This was tested on an RTX Pro 6000 Blackwell cloud instance.
22
+
23
+ If there are other models you're interested in seeing quantized to NVFP4 for use on the DGX Spark, or other modern Blackwell (or newer) cards let me know. I'm trying to make more NVFP4 models available to allow more people to try them out.
added_tokens.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "<|endofprompt|>": 100276,
3
+ "<|im_end|>": 100265,
4
+ "<|im_start|>": 100264
5
+ }
chat_template.jinja ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '
2
+ ' + message['content'] + '<|im_end|>' + '
3
+ '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
4
+ ' }}{% endif %}
config.json ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 100257,
8
+ "dtype": "float32",
9
+ "embd_pdrop": 0.0,
10
+ "eos_token_id": 100265,
11
+ "head_dim": 128,
12
+ "hidden_act": "silu",
13
+ "hidden_size": 4096,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 14336,
16
+ "max_position_embeddings": 262144,
17
+ "mlp_bias": false,
18
+ "model_type": "llama",
19
+ "num_attention_heads": 32,
20
+ "num_hidden_layers": 45,
21
+ "num_key_value_heads": 8,
22
+ "pad_token_id": 100257,
23
+ "pretraining_tp": 1,
24
+ "quantization_config": {
25
+ "config_groups": {
26
+ "group_0": {
27
+ "format": "nvfp4-pack-quantized",
28
+ "input_activations": {
29
+ "actorder": null,
30
+ "block_structure": null,
31
+ "dynamic": "local",
32
+ "group_size": 16,
33
+ "num_bits": 4,
34
+ "observer": "minmax",
35
+ "observer_kwargs": {},
36
+ "strategy": "tensor_group",
37
+ "symmetric": true,
38
+ "type": "float"
39
+ },
40
+ "output_activations": null,
41
+ "targets": [
42
+ "Linear"
43
+ ],
44
+ "weights": {
45
+ "actorder": null,
46
+ "block_structure": null,
47
+ "dynamic": false,
48
+ "group_size": 16,
49
+ "num_bits": 4,
50
+ "observer": "minmax",
51
+ "observer_kwargs": {},
52
+ "strategy": "tensor_group",
53
+ "symmetric": true,
54
+ "type": "float"
55
+ }
56
+ }
57
+ },
58
+ "format": "nvfp4-pack-quantized",
59
+ "global_compression_ratio": null,
60
+ "ignore": [
61
+ "lm_head"
62
+ ],
63
+ "kv_cache_scheme": null,
64
+ "quant_method": "compressed-tensors",
65
+ "quantization_status": "compressed",
66
+ "sparsity_config": {},
67
+ "transform_config": {},
68
+ "version": "0.12.2"
69
+ },
70
+ "resid_pdrop": 0.0,
71
+ "rms_norm_eps": 1e-06,
72
+ "rope_scaling": null,
73
+ "rope_theta": 128000000,
74
+ "tie_word_embeddings": false,
75
+ "transformers_version": "4.56.2",
76
+ "use_cache": false,
77
+ "vocab_size": 102400
78
+ }
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 100257,
4
+ "do_sample": true,
5
+ "eos_token_id": 100265,
6
+ "pad_token_id": 100257,
7
+ "transformers_version": "4.56.2",
8
+ "use_cache": false
9
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7170a97fc071906744f47e6288b6c7a0d9efbc8670d6130519959bbc75ee574
3
+ size 4999546400
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed4225d98b72b5a1e40b8bcb4dc76eff1c016e8bd0d40543d830d1cac76d9efa
3
+ size 3878306536
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
recipe.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ default_stage:
2
+ default_modifiers:
3
+ QuantizationModifier:
4
+ targets: [Linear]
5
+ ignore: [lm_head, 're:visual.*', 're:.*vision_tower.*', 're:.*video_tower.*', 're:.*audio_tower.*',
6
+ 're:.*multi_modal_projector.*']
7
+ scheme: NVFP4
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|im_end|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<|endoftext|>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "100257": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "100258": {
13
+ "content": "<|fim_prefix|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "100259": {
21
+ "content": "<|fim_middle|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "100260": {
29
+ "content": "<|fim_suffix|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "100264": {
37
+ "content": "<|im_start|>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": false
43
+ },
44
+ "100265": {
45
+ "content": "<|im_end|>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "100276": {
53
+ "content": "<|endofprompt|>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ }
60
+ },
61
+ "bos_token": "<|endoftext|>",
62
+ "clean_up_tokenization_spaces": false,
63
+ "eos_token": "<|im_end|>",
64
+ "extra_special_tokens": {},
65
+ "model_max_length": 262144,
66
+ "pad_token": "<|endoftext|>",
67
+ "tokenizer_class": "GPT2Tokenizer",
68
+ "unk_token": "<|endoftext|>"
69
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff