cloud19 committed on
Commit
2359572
·
verified ·
1 Parent(s): 7da1274

Upload FP4 Blackwell quantized model

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ library_name: tensorrt-llm
4
+ tags:
5
+ - quantization
6
+ - nvfp4
7
+ - blackwell
8
+ - nvidia-modelopt
9
+ - sainemo
10
+ ---
11
+
12
+ # SAINEMO-reMIX (FP4 Blackwell)
13
+
14
+ This model is an **FP4 (NVFP4)**-quantized version of [Moraliane/SAINEMO-reMIX](https://huggingface.co/Moraliane/SAINEMO-reMIX).
15
+
16
+ It was quantized with **NVIDIA ModelOpt** using the `NVFP4_DEFAULT_CFG` configuration, and is optimized specifically for the **NVIDIA Blackwell (B200)** architecture.
17
+
18
+ ## Calibration Details
19
+ - **Method:** Offline calibration
20
+ - **Dataset:** cnn_dailymail (512 samples)
21
+ - **Format:** Hugging Face Safetensors with ModelOpt quantization metadata (`_amax`)
22
+
23
+ ## Usage
24
+ This checkpoint is designed to be converted and built into a TensorRT-LLM engine.
25
+
26
+ ```bash
27
+ # Example conversion workflow
28
+ python convert_checkpoint.py --model_dir . --output_dir ./trt_ckpt --dtype bfloat16
29
+ trtllm-build --checkpoint_dir ./trt_ckpt ...
30
+ ```
chat_template.jinja ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] | trim + '
2
+
3
+ ' %}{% set messages = messages[1:] %}{% else %}{% set system_message = '' %}{% endif %}{{- bos_token + system_message}}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST]' + message['content'] | trim + '[/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] | trim + eos_token }}{% endif %}{% endfor %}
config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architecture": "MistralForCausalLM",
3
+ "architectures": [
4
+ "MistralForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 2,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 5120,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 14336,
15
+ "max_position_embeddings": 1024000,
16
+ "model_type": "mistral",
17
+ "num_attention_heads": 32,
18
+ "num_hidden_layers": 40,
19
+ "num_key_value_heads": 8,
20
+ "pad_token_id": 10,
21
+ "rms_norm_eps": 1e-05,
22
+ "rope_parameters": {
23
+ "rope_theta": 1000000.0,
24
+ "rope_type": "default"
25
+ },
26
+ "sliding_window": null,
27
+ "tie_word_embeddings": false,
28
+ "transformers_version": "5.0.0",
29
+ "unsloth_version": "2024.8",
30
+ "use_cache": true,
31
+ "vocab_size": 131072,
32
+ "quantization": {
33
+ "quant_algo": "NVFP4",
34
+ "exclude_modules": ["lm_head"]
35
+ }
36
+ }
generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 10,
6
+ "transformers_version": "5.0.0",
7
+ "use_cache": true
8
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72282be4fb23f2334efd8a28b4f24dad496eb24e5dbaaae3373f6d292c7b72ae
3
+ size 24495678128
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8771d3c14b4206fd6b54c60d037eeafce2f37382046c2369c36cd2edd2f099d7
3
+ size 17078391
tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<s>",
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "</s>",
7
+ "is_local": false,
8
+ "model_max_length": 1024000,
9
+ "pad_token": "<pad>",
10
+ "padding_side": "left",
11
+ "tokenizer_class": "TokenizersBackend",
12
+ "unk_token": "<unk>"
13
+ }