ekurtic commited on
Commit
9e9f5d0
·
verified ·
1 Parent(s): c8004ad

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +113 -1
README.md CHANGED
@@ -44,4 +44,116 @@ Evaluations are obtained with `vllm==0.15.0` and bug fixes from this [PR](https:
44
  | Toxic Chat | 0.433 | 0.433 | 100 | 0.519 | 0.508 | 97.88 |
45
  | ToxiGen | 0.46 | 0.444 | 96.52 | 0.315 | 0.3 | 95.24 |
46
  | XSTest | 0.834 | 0.832 | 99.76 | 0.78 | 0.765 | 98.08 |
47
- | Average Score | 0.6711282051 | 0.6654871795 | 99.12538462 | 0.5706410256 | 0.5629487179 | 98.45897436 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  | Toxic Chat | 0.433 | 0.433 | 100 | 0.519 | 0.508 | 97.88 |
45
  | ToxiGen | 0.46 | 0.444 | 96.52 | 0.315 | 0.3 | 95.24 |
46
  | XSTest | 0.834 | 0.832 | 99.76 | 0.78 | 0.765 | 98.08 |
47
+ | Average Score | 0.6711282051 | 0.6654871795 | 99.12538462 | 0.5706410256 | 0.5629487179 | 98.45897436 |
48
+
49
+
50
+ ## Model creation
51
+
52
+ This model is created with `compressed-tensors==0.13.0` and `llmcompressor==0.9.0.1`, and the following LLM-Compressor quantization script:
53
+
54
+ ```bash
55
+ CUDA_VISIBLE_DEVICES=0 python quantize.py --model_path meta-llama/Llama-Guard-4-12B --quant_path RedHatAI/Llama-Guard-4-12B-quantized.w4a16 --group_size 128 --calib_size 1024 --dampening_frac 0.01 --observer minmax --sym True --actorder False --pipeline independent
56
+ ```
57
+
58
+ ```python
59
+ from datasets import load_dataset
60
+ from transformers import AutoProcessor, Llama4ForConditionalGeneration
61
+ from llmcompressor.modifiers.quantization import GPTQModifier
62
+ from llmcompressor import oneshot
63
+ import argparse
64
+ from compressed_tensors.quantization import QuantizationScheme, QuantizationArgs, QuantizationType, QuantizationStrategy
65
+
66
+ def parse_actorder(value):
67
+ # Interpret the input value for --actorder
68
+ if value.lower() == "false":
69
+ return False
70
+ elif value.lower() == "group":
71
+ return "group"
72
+ elif value.lower() == "weight":
73
+ return "weight"
74
+ else:
75
+ raise argparse.ArgumentTypeError("Invalid value for --actorder. Use 'group', 'weight', or 'False'.")
76
+
77
+ def parse_sym(value):
78
+ if value.lower() == "false":
79
+ return False
80
+ elif value.lower() == "true":
81
+ return True
82
+ else:
83
+ raise argparse.ArgumentTypeError(f"Invalid value for --sym. Use false or true, but got {value}")
84
+
85
+ parser = argparse.ArgumentParser()
86
+ parser.add_argument('--model_path', type=str, required=True)
87
+ parser.add_argument('--quant_path', type=str, required=True)
88
+ parser.add_argument('--group_size', type=int, required=True)
89
+ parser.add_argument('--calib_size', type=int, required=True)
90
+ parser.add_argument('--dampening_frac', type=float, required=True)
91
+ parser.add_argument('--observer', type=str, required=True) # mse or minmax
92
+ parser.add_argument('--sym', type=parse_sym, required=True) # true or false
93
+ parser.add_argument('--actorder', type=parse_actorder, required=True) # group or weight or false
94
+ parser.add_argument('--pipeline', type=str, default="basic") # ['basic', 'datafree', 'sequential', independent]
95
+
96
+ args = parser.parse_args()
97
+
98
+ model = Llama4ForConditionalGeneration.from_pretrained(
99
+ args.model_path,
100
+ torch_dtype="auto",
101
+ trust_remote_code=True,
102
+ )
103
+ processor = AutoProcessor.from_pretrained(args.model_path, trust_remote_code=True)
104
+
105
+ def preprocess_fn(example):
106
+ # prepare for multimodal processor
107
+ for msg in example["messages"]:
108
+ msg["content"] = [{'type': 'text', 'text': msg['content']}]
109
+
110
+ return {"text": processor.apply_chat_template(example["messages"], add_generation_prompt=False, tokenize=False)}
111
+
112
+ ds = load_dataset("neuralmagic/LLM_compression_calibration", split="train")
113
+ ds = ds.map(preprocess_fn)
114
+
115
+ print(f"================================================================================")
116
+ print(f"[For debugging] Calibration data sample is:\n{repr(ds[0]['text'])}")
117
+ print(f"================================================================================")
118
+
119
+ quant_scheme = QuantizationScheme(
120
+ targets=["Linear"],
121
+ weights=QuantizationArgs(
122
+ num_bits=4,
123
+ type=QuantizationType.INT,
124
+ symmetric=args.sym,
125
+ group_size=args.group_size,
126
+ strategy=QuantizationStrategy.GROUP,
127
+ observer=args.observer,
128
+ actorder=args.actorder
129
+ ),
130
+ input_activations=None,
131
+ output_activations=None,
132
+ )
133
+
134
+ recipe = [
135
+ GPTQModifier(
136
+ targets=["Linear"],
137
+ ignore=[
138
+ "re:.*lm_head",
139
+ "re:.*multi_modal_projector",
140
+ "re:.*vision_model",
141
+ ],
142
+ dampening_frac=args.dampening_frac,
143
+ config_groups={"group_0": quant_scheme},
144
+ )
145
+ ]
146
+ oneshot(
147
+ model=model,
148
+ dataset=ds,
149
+ recipe=recipe,
150
+ num_calibration_samples=args.calib_size,
151
+ max_seq_length=4096,
152
+ pipeline=args.pipeline,
153
+ )
154
+
155
+ SAVE_DIR = args.quant_path
156
+ model.save_pretrained(SAVE_DIR)
157
+ print(f"Model saved to {SAVE_DIR}. Please manually copy other files like tokenizer, preprocessors, etc.")
158
+ ```
159
+