Upload folder using huggingface_hub
Browse files
- README.md +10 -10
- config.json +5 -5
- generation_config.json +1 -1
- model.safetensors +2 -2
- recipe.yaml +1 -1
README.md
CHANGED
|
@@ -34,10 +34,10 @@ The model utilizes **static FP8 quantization** for optimal inference performance
|
|
| 34 |
|
| 35 |
- **Original Model**: [HuggingFaceTB/SmolLM-135M](https://huggingface.co/HuggingFaceTB/SmolLM-135M)
|
| 36 |
- **Source Model**: HuggingFaceTB/SmolLM-135M
|
| 37 |
-
- **Quantized Model**: InternVL3-38B-FP8-
|
| 38 |
-
- **Quantization Method**: FP8
|
| 39 |
-
- **Quantization Library**: [LLM Compressor](https://github.com/vllm-project/llm-compressor) v0.6.
|
| 40 |
-
- **Calibration Dataset**:
|
| 41 |
- **Attention Implementation**: Flash Attention 2 (memory efficient, fastest)
|
| 42 |
- **Quantized by**: [JustJaro](https://huggingface.co/JustJaro)
|
| 43 |
|
|
@@ -50,7 +50,7 @@ from vllm import LLM, SamplingParams
|
|
| 50 |
|
| 51 |
# Load the quantized model
|
| 52 |
model = LLM(
|
| 53 |
-
model="JustJaro/InternVL3-38B-FP8-
|
| 54 |
trust_remote_code=True,
|
| 55 |
max_model_len=8192,
|
| 56 |
tensor_parallel_size=1, # Adjust based on your GPU setup
|
|
@@ -68,7 +68,7 @@ print(response[0].outputs[0].text)
|
|
| 68 |
from transformers import AutoTokenizer, AutoProcessor
|
| 69 |
from llmcompressor import LLM
|
| 70 |
|
| 71 |
-
model_id = "JustJaro/InternVL3-38B-FP8-
|
| 72 |
model = LLM.load(model_id, device="cuda")
|
| 73 |
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
|
| 74 |
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
|
|
@@ -93,7 +93,7 @@ print(response)
|
|
| 93 |
- **Weights**: FP8 E4M3 with static per-tensor scales
|
| 94 |
- **Activations**: FP8 E4M3 with static per-tensor scales
|
| 95 |
- **Preserved Components**: Vision tower, embeddings, normalization layers
|
| 96 |
-
- **Calibration**:
|
| 97 |
|
| 98 |
## 📈 Performance Benchmarks
|
| 99 |
|
|
@@ -109,8 +109,8 @@ Expected performance improvements over FP16 baseline:
|
|
| 109 |
This model was created using:
|
| 110 |
|
| 111 |
```
|
| 112 |
-
llmcompressor==0.6.
|
| 113 |
-
transformers==4.
|
| 114 |
torch==2.7.1
|
| 115 |
vllm==not installed
|
| 116 |
```
|
|
@@ -1031,7 +1031,7 @@ Original model: [HuggingFaceTB/SmolLM-135M](https://huggingface.co/HuggingFaceTB
|
|
| 1031 |
## 📞 Contact
|
| 1032 |
|
| 1033 |
For questions about this quantized model:
|
| 1034 |
-
- **Issues**: [Create an issue](https://huggingface.co/JustJaro/InternVL3-38B-FP8-
|
| 1035 |
- **Original Model**: Refer to [HuggingFaceTB/SmolLM-135M](https://huggingface.co/HuggingFaceTB/SmolLM-135M)
|
| 1036 |
|
| 1037 |
---
|
|
|
|
| 34 |
|
| 35 |
- **Original Model**: [HuggingFaceTB/SmolLM-135M](https://huggingface.co/HuggingFaceTB/SmolLM-135M)
|
| 36 |
- **Source Model**: HuggingFaceTB/SmolLM-135M
|
| 37 |
+
- **Quantized Model**: InternVL3-38B-FP8-Static
|
| 38 |
+
- **Quantization Method**: FP8 Static (W8A8)
|
| 39 |
+
- **Quantization Library**: [LLM Compressor](https://github.com/vllm-project/llm-compressor) v0.6.1.dev18+g090baff5
|
| 40 |
+
- **Calibration Dataset**: open_platypus (256 samples, seq_len=2048)
|
| 41 |
- **Attention Implementation**: Flash Attention 2 (memory efficient, fastest)
|
| 42 |
- **Quantized by**: [JustJaro](https://huggingface.co/JustJaro)
|
| 43 |
|
|
|
|
| 50 |
|
| 51 |
# Load the quantized model
|
| 52 |
model = LLM(
|
| 53 |
+
model="JustJaro/InternVL3-38B-FP8-Static",
|
| 54 |
trust_remote_code=True,
|
| 55 |
max_model_len=8192,
|
| 56 |
tensor_parallel_size=1, # Adjust based on your GPU setup
|
|
|
|
| 68 |
from transformers import AutoTokenizer, AutoProcessor
|
| 69 |
from llmcompressor import LLM
|
| 70 |
|
| 71 |
+
model_id = "JustJaro/InternVL3-38B-FP8-Static"
|
| 72 |
model = LLM.load(model_id, device="cuda")
|
| 73 |
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
|
| 74 |
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
|
|
|
|
| 93 |
- **Weights**: FP8 E4M3 with static per-tensor scales
|
| 94 |
- **Activations**: FP8 E4M3 with static per-tensor scales
|
| 95 |
- **Preserved Components**: Vision tower, embeddings, normalization layers
|
| 96 |
+
- **Calibration**: 256 samples from open_platypus (seq_len=2048)
|
| 97 |
|
| 98 |
## 📈 Performance Benchmarks
|
| 99 |
|
|
|
|
| 109 |
This model was created using:
|
| 110 |
|
| 111 |
```
|
| 112 |
+
llmcompressor==0.6.1.dev18+g090baff5
|
| 113 |
+
transformers==4.52.4
|
| 114 |
torch==2.7.1
|
| 115 |
vllm==not installed
|
| 116 |
```
|
|
|
|
| 1031 |
## 📞 Contact
|
| 1032 |
|
| 1033 |
For questions about this quantized model:
|
| 1034 |
+
- **Issues**: [Create an issue](https://huggingface.co/JustJaro/InternVL3-38B-FP8-Static/discussions)
|
| 1035 |
- **Original Model**: Refer to [HuggingFaceTB/SmolLM-135M](https://huggingface.co/HuggingFaceTB/SmolLM-135M)
|
| 1036 |
|
| 1037 |
---
|
config.json
CHANGED
|
@@ -24,12 +24,12 @@
|
|
| 24 |
"input_activations": {
|
| 25 |
"actorder": null,
|
| 26 |
"block_structure": null,
|
| 27 |
-
"dynamic":
|
| 28 |
"group_size": null,
|
| 29 |
"num_bits": 8,
|
| 30 |
-
"observer":
|
| 31 |
"observer_kwargs": {},
|
| 32 |
-
"strategy": "
|
| 33 |
"symmetric": true,
|
| 34 |
"type": "float"
|
| 35 |
},
|
|
@@ -45,7 +45,7 @@
|
|
| 45 |
"num_bits": 8,
|
| 46 |
"observer": "minmax",
|
| 47 |
"observer_kwargs": {},
|
| 48 |
-
"strategy": "
|
| 49 |
"symmetric": true,
|
| 50 |
"type": "float"
|
| 51 |
}
|
|
@@ -65,7 +65,7 @@
|
|
| 65 |
"rope_theta": 10000.0,
|
| 66 |
"tie_word_embeddings": true,
|
| 67 |
"torch_dtype": "bfloat16",
|
| 68 |
-
"transformers_version": "4.
|
| 69 |
"use_cache": true,
|
| 70 |
"vocab_size": 49152
|
| 71 |
}
|
|
|
|
| 24 |
"input_activations": {
|
| 25 |
"actorder": null,
|
| 26 |
"block_structure": null,
|
| 27 |
+
"dynamic": false,
|
| 28 |
"group_size": null,
|
| 29 |
"num_bits": 8,
|
| 30 |
+
"observer": "minmax",
|
| 31 |
"observer_kwargs": {},
|
| 32 |
+
"strategy": "tensor",
|
| 33 |
"symmetric": true,
|
| 34 |
"type": "float"
|
| 35 |
},
|
|
|
|
| 45 |
"num_bits": 8,
|
| 46 |
"observer": "minmax",
|
| 47 |
"observer_kwargs": {},
|
| 48 |
+
"strategy": "tensor",
|
| 49 |
"symmetric": true,
|
| 50 |
"type": "float"
|
| 51 |
}
|
|
|
|
| 65 |
"rope_theta": 10000.0,
|
| 66 |
"tie_word_embeddings": true,
|
| 67 |
"torch_dtype": "bfloat16",
|
| 68 |
+
"transformers_version": "4.52.4",
|
| 69 |
"use_cache": true,
|
| 70 |
"vocab_size": 49152
|
| 71 |
}
|
generation_config.json
CHANGED
|
@@ -2,5 +2,5 @@
|
|
| 2 |
"_from_model_config": true,
|
| 3 |
"bos_token_id": 0,
|
| 4 |
"eos_token_id": 0,
|
| 5 |
-
"transformers_version": "4.
|
| 6 |
}
|
|
|
|
| 2 |
"_from_model_config": true,
|
| 3 |
"bos_token_id": 0,
|
| 4 |
"eos_token_id": 0,
|
| 5 |
+
"transformers_version": "4.52.4"
|
| 6 |
}
|
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:071bb4b9ad3b3f7ea5aafeb481e5b5d7f486df3c81006fd643b651328a1367ea
|
| 3 |
+
size 219563872
|
recipe.yaml
CHANGED
|
@@ -4,4 +4,4 @@ default_stage:
|
|
| 4 |
targets: [Linear]
|
| 5 |
ignore: ['re:.*lm_head', 're:.*vision.*', 're:.*visual.*', 're:.*image.*', 're:.*patch_embed.*',
|
| 6 |
're:.*pos_embed.*', 're:.*norm.*', 're:.*layernorm.*']
|
| 7 |
-
scheme:
|
|
|
|
| 4 |
targets: [Linear]
|
| 5 |
ignore: ['re:.*lm_head', 're:.*vision.*', 're:.*visual.*', 're:.*image.*', 're:.*patch_embed.*',
|
| 6 |
're:.*pos_embed.*', 're:.*norm.*', 're:.*layernorm.*']
|
| 7 |
+
scheme: FP8
|