JustJaro committed on
Commit
f73ccc7
·
verified ·
1 Parent(s): ebd659b

Upload folder using huggingface_hub

Browse files
Files changed (5) hide show
  1. README.md +10 -10
  2. config.json +5 -5
  3. generation_config.json +1 -1
  4. model.safetensors +2 -2
  5. recipe.yaml +1 -1
README.md CHANGED
@@ -34,10 +34,10 @@ The model utilizes **static FP8 quantization** for optimal inference performance
34
 
35
  - **Original Model**: [HuggingFaceTB/SmolLM-135M](https://huggingface.co/HuggingFaceTB/SmolLM-135M)
36
  - **Source Model**: HuggingFaceTB/SmolLM-135M
37
- - **Quantized Model**: InternVL3-38B-FP8-Dynamic
38
- - **Quantization Method**: FP8 Dynamic (W8A8)
39
- - **Quantization Library**: [LLM Compressor](https://github.com/vllm-project/llm-compressor) v0.6.0
40
- - **Calibration Dataset**: N/A
41
  - **Attention Implementation**: Flash Attention 2 (memory efficient, fastest)
42
  - **Quantized by**: [JustJaro](https://huggingface.co/JustJaro)
43
 
@@ -50,7 +50,7 @@ from vllm import LLM, SamplingParams
50
 
51
  # Load the quantized model
52
  model = LLM(
53
- model="JustJaro/InternVL3-38B-FP8-Dynamic",
54
  trust_remote_code=True,
55
  max_model_len=8192,
56
  tensor_parallel_size=1, # Adjust based on your GPU setup
@@ -68,7 +68,7 @@ print(response[0].outputs[0].text)
68
  from transformers import AutoTokenizer, AutoProcessor
69
  from llmcompressor import LLM
70
 
71
- model_id = "JustJaro/InternVL3-38B-FP8-Dynamic"
72
  model = LLM.load(model_id, device="cuda")
73
  tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
74
  processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
@@ -93,7 +93,7 @@ print(response)
93
  - **Weights**: FP8 E4M3 with static per-tensor scales
94
  - **Activations**: FP8 E4M3 with static per-tensor scales
95
  - **Preserved Components**: Vision tower, embeddings, normalization layers
96
- - **Calibration**: 0 samples from multimodal dataset
97
 
98
  ## 📈 Performance Benchmarks
99
 
@@ -109,8 +109,8 @@ Expected performance improvements over FP16 baseline:
109
  This model was created using:
110
 
111
  ```
112
- llmcompressor==0.6.0
113
- transformers==4.53.0
114
  torch==2.7.1
115
  vllm==not installed
116
  ```
@@ -1031,7 +1031,7 @@ Original model: [HuggingFaceTB/SmolLM-135M](https://huggingface.co/HuggingFaceTB
1031
  ## 📞 Contact
1032
 
1033
  For questions about this quantized model:
1034
- - **Issues**: [Create an issue](https://huggingface.co/JustJaro/InternVL3-38B-FP8-Dynamic/discussions)
1035
  - **Original Model**: Refer to [HuggingFaceTB/SmolLM-135M](https://huggingface.co/HuggingFaceTB/SmolLM-135M)
1036
 
1037
  ---
 
34
 
35
  - **Original Model**: [HuggingFaceTB/SmolLM-135M](https://huggingface.co/HuggingFaceTB/SmolLM-135M)
36
  - **Source Model**: HuggingFaceTB/SmolLM-135M
37
+ - **Quantized Model**: InternVL3-38B-FP8-Static
38
+ - **Quantization Method**: FP8 Static (W8A8)
39
+ - **Quantization Library**: [LLM Compressor](https://github.com/vllm-project/llm-compressor) v0.6.1.dev18+g090baff5
40
+ - **Calibration Dataset**: open_platypus (256 samples, seq_len=2048)
41
  - **Attention Implementation**: Flash Attention 2 (memory efficient, fastest)
42
  - **Quantized by**: [JustJaro](https://huggingface.co/JustJaro)
43
 
 
50
 
51
  # Load the quantized model
52
  model = LLM(
53
+ model="JustJaro/InternVL3-38B-FP8-Static",
54
  trust_remote_code=True,
55
  max_model_len=8192,
56
  tensor_parallel_size=1, # Adjust based on your GPU setup
 
68
  from transformers import AutoTokenizer, AutoProcessor
69
  from llmcompressor import LLM
70
 
71
+ model_id = "JustJaro/InternVL3-38B-FP8-Static"
72
  model = LLM.load(model_id, device="cuda")
73
  tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
74
  processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
 
93
  - **Weights**: FP8 E4M3 with static per-tensor scales
94
  - **Activations**: FP8 E4M3 with static per-tensor scales
95
  - **Preserved Components**: Vision tower, embeddings, normalization layers
96
+ - **Calibration**: 256 samples from the open_platypus dataset (seq_len=2048)
97
 
98
  ## 📈 Performance Benchmarks
99
 
 
109
  This model was created using:
110
 
111
  ```
112
+ llmcompressor==0.6.1.dev18+g090baff5
113
+ transformers==4.52.4
114
  torch==2.7.1
115
  vllm==not installed
116
  ```
 
1031
  ## 📞 Contact
1032
 
1033
  For questions about this quantized model:
1034
+ - **Issues**: [Create an issue](https://huggingface.co/JustJaro/InternVL3-38B-FP8-Static/discussions)
1035
  - **Original Model**: Refer to [HuggingFaceTB/SmolLM-135M](https://huggingface.co/HuggingFaceTB/SmolLM-135M)
1036
 
1037
  ---
config.json CHANGED
@@ -24,12 +24,12 @@
24
  "input_activations": {
25
  "actorder": null,
26
  "block_structure": null,
27
- "dynamic": true,
28
  "group_size": null,
29
  "num_bits": 8,
30
- "observer": null,
31
  "observer_kwargs": {},
32
- "strategy": "token",
33
  "symmetric": true,
34
  "type": "float"
35
  },
@@ -45,7 +45,7 @@
45
  "num_bits": 8,
46
  "observer": "minmax",
47
  "observer_kwargs": {},
48
- "strategy": "channel",
49
  "symmetric": true,
50
  "type": "float"
51
  }
@@ -65,7 +65,7 @@
65
  "rope_theta": 10000.0,
66
  "tie_word_embeddings": true,
67
  "torch_dtype": "bfloat16",
68
- "transformers_version": "4.53.0",
69
  "use_cache": true,
70
  "vocab_size": 49152
71
  }
 
24
  "input_activations": {
25
  "actorder": null,
26
  "block_structure": null,
27
+ "dynamic": false,
28
  "group_size": null,
29
  "num_bits": 8,
30
+ "observer": "minmax",
31
  "observer_kwargs": {},
32
+ "strategy": "tensor",
33
  "symmetric": true,
34
  "type": "float"
35
  },
 
45
  "num_bits": 8,
46
  "observer": "minmax",
47
  "observer_kwargs": {},
48
+ "strategy": "tensor",
49
  "symmetric": true,
50
  "type": "float"
51
  }
 
65
  "rope_theta": 10000.0,
66
  "tie_word_embeddings": true,
67
  "torch_dtype": "bfloat16",
68
+ "transformers_version": "4.52.4",
69
  "use_cache": true,
70
  "vocab_size": 49152
71
  }
generation_config.json CHANGED
@@ -2,5 +2,5 @@
2
  "_from_model_config": true,
3
  "bos_token_id": 0,
4
  "eos_token_id": 0,
5
- "transformers_version": "4.53.0"
6
  }
 
2
  "_from_model_config": true,
3
  "bos_token_id": 0,
4
  "eos_token_id": 0,
5
+ "transformers_version": "4.52.4"
6
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3b29852221e8b0fb7ce5364816d029fc8cd9fbdc0b790efad61cc32ce4dc2f36
3
- size 163227736
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:071bb4b9ad3b3f7ea5aafeb481e5b5d7f486df3c81006fd643b651328a1367ea
3
+ size 219563872
recipe.yaml CHANGED
@@ -4,4 +4,4 @@ default_stage:
4
  targets: [Linear]
5
  ignore: ['re:.*lm_head', 're:.*vision.*', 're:.*visual.*', 're:.*image.*', 're:.*patch_embed.*',
6
  're:.*pos_embed.*', 're:.*norm.*', 're:.*layernorm.*']
7
- scheme: FP8_DYNAMIC
 
4
  targets: [Linear]
5
  ignore: ['re:.*lm_head', 're:.*vision.*', 're:.*visual.*', 're:.*image.*', 're:.*patch_embed.*',
6
  're:.*pos_embed.*', 're:.*norm.*', 're:.*layernorm.*']
7
+ scheme: FP8