ekurtic commited on
Commit
2cd88cb
·
verified ·
1 Parent(s): 54f8f96

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +93 -1
README.md CHANGED
@@ -45,4 +45,96 @@ If you are running `vllm > 0.15.0`, you will likely have the bug fixes already a
45
  | Toxic Chat | 0.433 | 0.436 | 100.69 | 0.519 | 0.525 | 101.16 |
46
  | ToxiGen | 0.46 | 0.465 | 101.09 | 0.315 | 0.32 | 101.59 |
47
  | XSTest | 0.834 | 0.836 | 100.24 | 0.78 | 0.78 | 100 |
48
- | Average Score | 0.6711282051 | 0.6687692308 | 99.42051282 | 0.5706410256 | 0.5682051282 | 99.30282051 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  | Toxic Chat | 0.433 | 0.436 | 100.69 | 0.519 | 0.525 | 101.16 |
46
  | ToxiGen | 0.46 | 0.465 | 101.09 | 0.315 | 0.32 | 101.59 |
47
  | XSTest | 0.834 | 0.836 | 100.24 | 0.78 | 0.78 | 100 |
48
+ | Average Score | 0.6711282051 | 0.6687692308 | 99.42051282 | 0.5706410256 | 0.5682051282 | 99.30282051 |
49
+
50
+
51
+ ## Model creation
52
+
53
+ This model is created with `compressed-tensors==0.13.0` and `llmcompressor==0.9.0.1`, and the following LLM-Compressor quantization script:
54
+
55
+ ```bash
56
+ CUDA_VISIBLE_DEVICES=0 python quantize.py --model_path meta-llama/Llama-Guard-4-12B --quant_path RedHatAI/Llama-Guard-4-12B-quantized.w8a8 --calib_size 512 --dampening_frac 0.03 --pipeline independent
57
+ ```
58
+
59
+ ```python
60
+ from datasets import load_dataset
61
+ from transformers import AutoProcessor, Llama4ForConditionalGeneration
62
+ from llmcompressor.modifiers.quantization import GPTQModifier
63
+ from llmcompressor import oneshot
64
+ import argparse
65
+ from compressed_tensors.quantization import QuantizationScheme, QuantizationArgs, QuantizationType, QuantizationStrategy
66
+
67
+
68
+ parser = argparse.ArgumentParser()
69
+ parser.add_argument('--model_path', type=str, required=True)
70
+ parser.add_argument('--quant_path', type=str, required=True)
71
+ parser.add_argument('--calib_size', type=int, required=True)
72
+ parser.add_argument('--dampening_frac', type=float, required=True)
73
+ parser.add_argument('--pipeline', type=str, default="basic") #['basic', 'datafree', 'sequential', 'independent']
74
+
75
+ args = parser.parse_args()
76
+ print(f"[DEBUGGING ARGS] {args}")
77
+
78
+ model = Llama4ForConditionalGeneration.from_pretrained(
79
+ args.model_path,
80
+ torch_dtype="auto",
81
+ trust_remote_code=True,
82
+ )
83
+ processor = AutoProcessor.from_pretrained(args.model_path, trust_remote_code=True)
84
+
85
+ def preprocess_fn(example):
86
+ # prepare for multimodal processor
87
+ for msg in example["messages"]:
88
+ msg["content"] = [{'type': 'text', 'text': msg['content']}]
89
+
90
+ return {"text": processor.apply_chat_template(example["messages"], add_generation_prompt=False, tokenize=False)}
91
+
92
+ ds = load_dataset("neuralmagic/LLM_compression_calibration", split="train")
93
+ ds = ds.map(preprocess_fn)
94
+
95
+ print(f"================================================================================")
96
+ print(f"[For debugging] Calibration data sample is:\n{repr(ds[0]['text'])}")
97
+ print(f"================================================================================")
98
+
99
+ quant_scheme = QuantizationScheme(
100
+ targets=["Linear"],
101
+ weights=QuantizationArgs(
102
+ num_bits=8,
103
+ type=QuantizationType.INT,
104
+ symmetric=True,
105
+ strategy=QuantizationStrategy.CHANNEL,
106
+ ),
107
+ input_activations=QuantizationArgs(
108
+ dynamic=True,
109
+ num_bits=8,
110
+ strategy=QuantizationStrategy.TOKEN,
111
+ symmetric=True,
112
+ type=QuantizationType.INT,
113
+ ),
114
+ output_activations=None,
115
+ )
116
+
117
+ recipe = [
118
+ GPTQModifier(
119
+ targets=["Linear"],
120
+ ignore=[
121
+ "re:.*lm_head",
122
+ "re:.*multi_modal_projector",
123
+ "re:.*vision_model",
124
+ ],
125
+ dampening_frac=args.dampening_frac,
126
+ config_groups={"group_0": quant_scheme},
127
+ )
128
+ ]
129
+ oneshot(
130
+ model=model,
131
+ dataset=ds,
132
+ recipe=recipe,
133
+ num_calibration_samples=args.calib_size,
134
+ max_seq_length=2048,
135
+ pipeline=args.pipeline,
136
+ )
137
+
138
+ SAVE_DIR = args.quant_path
139
+ model.save_pretrained(SAVE_DIR)
140
+ ```