ekurtic commited on
Commit
1ac8d87
·
verified ·
1 Parent(s): ffd7cae

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +77 -1
README.md CHANGED
@@ -44,4 +44,80 @@ If you are running `vllm > 0.15.0`, you will likely have the bug fixes already a
44
  | Toxic Chat | 0.433 | 0.425 | 98.15 | 0.519 | 0.519 | 100 |
45
  | ToxiGen | 0.46 | 0.47 | 102.17 | 0.315 | 0.325 | 103.17 |
46
  | XSTest | 0.834 | 0.833 | 99.88 | 0.78 | 0.775 | 99.36 |
47
- | Average Score | 0.6711282051 | 0.6729230769 | 100.5220513 | 0.5706410256 | 0.5725641026 | 100.8784615 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  | Toxic Chat | 0.433 | 0.425 | 98.15 | 0.519 | 0.519 | 100 |
45
  | ToxiGen | 0.46 | 0.47 | 102.17 | 0.315 | 0.325 | 103.17 |
46
  | XSTest | 0.834 | 0.833 | 99.88 | 0.78 | 0.775 | 99.36 |
47
+ | Average Score | 0.6711282051 | 0.6729230769 | 100.5220513 | 0.5706410256 | 0.5725641026 | 100.8784615 |
48
+
49
+
50
+ ## Model creation
51
+
52
+ This model is created with `compressed-tensors==0.13.0` and `llmcompressor==0.9.0.1`, and the following LLM-Compressor quantization script:
53
+
54
+ ```bash
55
+ CUDA_VISIBLE_DEVICES=0 python quantize.py --model_path meta-llama/Llama-Guard-4-12B --quant_path RedHatAI/Llama-Guard-4-12B-FP8-dynamic --pipeline datafree
56
+ ```
57
+
58
+ ```python
59
+ import argparse
60
+ import torch
61
+ from transformers import AutoTokenizer, AutoModelForCausalLM, Llama4ForConditionalGeneration
62
+ from llmcompressor.modifiers.quantization import QuantizationModifier
63
+ from llmcompressor import oneshot
64
+ from compressed_tensors.quantization import (
65
+ QuantizationScheme,
66
+ QuantizationArgs,
67
+ QuantizationType,
68
+ QuantizationStrategy,
69
+ )
70
+
71
+
72
+ def main():
73
+ parser = argparse.ArgumentParser(description="Quantize a causal language model")
74
+ parser.add_argument(
75
+ "--model_path",
76
+ type=str,
77
+ required=True,
78
+ help="Path to the pre-trained model",
79
+ )
80
+ parser.add_argument(
81
+ "--quant_path",
82
+ type=str,
83
+ required=True,
84
+ help="Output path for the quantized model",
85
+ )
86
+ parser.add_argument(
87
+ "--pipeline", #['basic', 'datafree', 'sequential', independent]
88
+ type=str,
89
+ required=True,
90
+ )
91
+ args = parser.parse_args()
92
+ print(f"Loading model from {args.model_path}...")
93
+ model = Llama4ForConditionalGeneration.from_pretrained(
94
+ args.model_path,
95
+ torch_dtype="auto",
96
+ trust_remote_code=True,
97
+ )
98
+
99
+ recipe = QuantizationModifier(
100
+ targets="Linear",
101
+ scheme="FP8_dynamic",
102
+ ignore=[
103
+ 're:.*lm_head',
104
+ 're:.*multi_modal_projector',
105
+ 're:.*vision_model',
106
+ ]
107
+ )
108
+
109
+ print("Applying quantization...")
110
+ oneshot(
111
+ model=model,
112
+ recipe=recipe,
113
+ trust_remote_code_model=True,
114
+ pipeline=args.pipeline,
115
+ )
116
+
117
+ model.save_pretrained(args.quant_path, save_compressed=True, skip_compression_stats=True, disable_sparse_compression=True)
118
+ print(f"Quantized model saved to {args.quant_path}")
119
+
120
+
121
+ if __name__ == "__main__":
122
+ main()
123
+ ```