Update README.md
Browse files
README.md
CHANGED
|
@@ -49,10 +49,29 @@ This model has been 4-bit quantized Llada-8B-Base model with [GPTQModel](https:/
|
|
| 49 |
|
| 50 |
## Example:
|
| 51 |
```python
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
import torch
|
| 53 |
from datasets import load_dataset
|
| 54 |
-
from gptqmodel import
|
| 55 |
-
from
|
|
|
|
|
|
|
| 56 |
import torch.nn.functional as F
|
| 57 |
import numpy as np
|
| 58 |
|
|
@@ -60,6 +79,64 @@ import numpy as np
|
|
| 60 |
|
| 61 |
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
def add_gumbel_noise(logits, temperature):
|
| 64 |
'''
|
| 65 |
The Gumbel max is a method for sampling categorical distributions.
|
|
@@ -92,6 +169,82 @@ def get_num_transfer_tokens(mask_index, steps):
|
|
| 92 |
|
| 93 |
return num_transfer_tokens
|
| 94 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
|
| 97 |
|
|
@@ -166,32 +319,45 @@ def generate(model, prompt, steps=128, gen_length=128, block_length=128, tempera
|
|
| 166 |
return x
|
| 167 |
|
| 168 |
def main():
|
| 169 |
-
|
| 170 |
-
tokenizer = AutoTokenizer.from_pretrained(quantized_model_id ,use_fast=False)
|
| 171 |
-
|
| 172 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
| 174 |
-
prompt = "
|
| 175 |
|
| 176 |
-
# #
|
| 177 |
-
m = [{"role": "user", "content": prompt}, ]
|
| 178 |
-
prompt = tokenizer.apply_chat_template(m, add_generation_prompt=True, tokenize=False)
|
| 179 |
|
| 180 |
input_ids = tokenizer(prompt)['input_ids']
|
| 181 |
input_ids = torch.tensor(input_ids).to(device).unsqueeze(0)
|
| 182 |
|
| 183 |
-
|
| 184 |
|
| 185 |
|
|
|
|
| 186 |
model = GPTQModel.load(quantized_model_id, device=device , trust_remote_code=True )
|
| 187 |
|
| 188 |
-
|
| 189 |
-
steps=
|
| 190 |
-
|
| 191 |
-
print("*"*30+ f"Steps {steps}"+ "*"*30)
|
| 192 |
print(input_ids.shape)
|
| 193 |
print( tokenizer.batch_decode(out[:, input_ids.shape[1]:], skip_special_tokens=True)[0])
|
|
|
|
|
|
|
|
|
|
| 194 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 195 |
|
| 196 |
|
| 197 |
if __name__ == "__main__":
|
|
@@ -205,4 +371,6 @@ if __name__ == "__main__":
|
|
| 205 |
|
| 206 |
main()
|
| 207 |
|
|
|
|
|
|
|
| 208 |
```
|
|
|
|
| 49 |
|
| 50 |
## Example:
|
| 51 |
```python
|
| 52 |
+
|
| 53 |
+
# Copyright 2024-2025 ModelCloud.ai
|
| 54 |
+
# Copyright 2024-2025 qubitium@modelcloud.ai
|
| 55 |
+
# Contact: qubitium@modelcloud.ai, x.com/qubitium
|
| 56 |
+
#
|
| 57 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 58 |
+
# you may not use this file except in compliance with the License.
|
| 59 |
+
# You may obtain a copy of the License at
|
| 60 |
+
#
|
| 61 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 62 |
+
#
|
| 63 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 64 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 65 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 66 |
+
# See the License for the specific language governing permissions and
|
| 67 |
+
# limitations under the License.
|
| 68 |
+
|
| 69 |
import torch
|
| 70 |
from datasets import load_dataset
|
| 71 |
+
from gptqmodel import GPTQModel, QuantizeConfig, BACKEND
|
| 72 |
+
from gptqmodel.models.base import BaseGPTQModel
|
| 73 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
|
| 74 |
+
from gptqmodel.models.auto import MODEL_MAP
|
| 75 |
import torch.nn.functional as F
|
| 76 |
import numpy as np
|
| 77 |
|
|
|
|
| 79 |
|
| 80 |
|
| 81 |
|
| 82 |
+
# Hugging Face Hub id of the unquantized base checkpoint. main() uses it for
# the tokenizer and for the full-precision comparison run. Replace it with a
# local directory path to load from disk instead.
pretrained_model_id = "GSAI-ML/LLaDA-8B-Base"
# Hub id of the GPTQ-quantized checkpoint that main() loads via GPTQModel.load.
quantized_model_id = "FunAGI/LLaDA-8B-Base-gptqmodel-4bit"
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
class LladaGPTQ(BaseGPTQModel):
    """Module-layout description of the LLaDA architecture for GPTQModel."""

    # Non-repeating modules at the root level: siblings of `layers_node`,
    # not including `layers_node` itself.
    base_modules = ["model.transformer.wte", "model.transformer.ln_f"]
    pre_lm_head_norm_module = "model.transformer.ln_f"
    lm_head = "model.transformer.ff_out"

    # `model.transformer.blocks` is the parent node that holds all of the
    # repeating transformer blocks.
    layers_node = "model.transformer.blocks"
    # Each repeating block under `layers_node` is of type `LLaDALlamaBlock`.
    layer_type = "LLaDALlamaBlock"

    # Internal modules of one `LLaDALlamaBlock`, meant to be listed in the
    # order they run in the model's forward() code: attention projections
    # first, then the MLP projections.
    # NOTE(review): `attn_out` shares a group with q/k/v here — confirm this
    # grouping is intended.
    layer_modules = [
        ["attn_out", "k_proj", "v_proj", "q_proj"],
        ["ff_proj", "up_proj"],
        ["ff_out"],
    ]


# Register the layout under the "llada" key of GPTQModel's model map
# (presumably looked up by model type when loading — verify against gptqmodel).
MODEL_MAP["llada"] = LladaGPTQ
|
| 107 |
+
|
| 108 |
+
# os.makedirs(quantized_model_dir, exist_ok=True)
|
| 109 |
+
def get_wikitext2(tokenizer, nsamples, seqlen):
    """Tokenize the first `nsamples` sufficiently long WikiText-2 training rows.

    Rows whose raw text is shorter than `seqlen` characters are filtered out
    before selection (the length check is on characters, not tokens).

    Returns:
        A list with one `tokenizer(text)` result per selected row.
    """
    train_split = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
    long_rows = train_split.filter(lambda row: len(row["text"]) >= seqlen)
    chosen = long_rows.select(range(nsamples))
    return [tokenizer(row["text"]) for row in chosen]
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
@torch.no_grad()
def calculate_avg_ppl(model, tokenizer):
    """Return the mean of the perplexity values measured on WikiText-2 (train split).

    Args:
        model: Model handed to gptqmodel's `Perplexity` helper.
        tokenizer: Tokenizer handed to the same helper.

    Returns:
        The arithmetic mean of the values returned by `Perplexity.calculate`.
    """
    # Imported lazily so the dependency is only needed when this is called.
    from gptqmodel.utils import Perplexity

    ppl = Perplexity(
        model=model,
        tokenizer=tokenizer,
        dataset_path="wikitext",
        dataset_name="wikitext-2-raw-v1",
        split="train",
        text_column="text",
    )

    # Named `scores` rather than `all` so the builtin is not shadowed.
    scores = ppl.calculate(n_ctx=512, n_batch=512)

    return sum(scores) / len(scores)
|
| 135 |
+
|
| 136 |
+
# Per-module quantization overrides passed to QuantizeConfig(dynamic=...) in
# main(); empty means no overrides.
dynamic = {}
|
| 139 |
+
|
| 140 |
def add_gumbel_noise(logits, temperature):
|
| 141 |
'''
|
| 142 |
The Gumbel max is a method for sampling categorical distributions.
|
|
|
|
| 169 |
|
| 170 |
return num_transfer_tokens
|
| 171 |
|
| 172 |
+
def forward_process(batch, prompt_index, mask_id):
    """Randomly mask non-prompt tokens in every row of `batch`.

    The number of masked tokens per row is spread evenly over
    [1, target_len], starting from a random offset, and the masked positions
    within each row are chosen by a random permutation.

    Args:
        batch: 2-D integer tensor of shape (b, l).
        prompt_index: Boolean tensor marking prompt positions. The mask is
            built assuming the prompt occupies the leading
            `prompt_index.sum()` columns, which are never masked.
        mask_id: Token id written at masked positions.

    Returns:
        A tuple `(noisy_batch, p_mask)`: the masked copy of `batch`, and a
        (b, l) tensor holding each row's mask ratio repeated across columns.

    Raises:
        ValueError: If there is no non-prompt position to mask.
    """
    b, l = batch.shape

    # Compute the prompt length once; it is needed for both target_len and
    # the unmasked prefix below.
    prompt_len = int(prompt_index.sum().item())
    target_len = l - prompt_len
    if target_len < 1:
        raise ValueError("forward_process needs at least one non-prompt token to mask")

    # Random starting count, then evenly spaced counts wrapped into [1, target_len].
    k = torch.randint(1, target_len + 1, (), device=batch.device)

    x = torch.round(torch.linspace(float(k), k + (b - 1) * (target_len / b), steps=b, device=batch.device)).long()
    x = ((x - 1) % target_len) + 1
    assert x.min() >= 1 and x.max() <= target_len

    # Row i starts with its first x[i] answer slots masked, then is shuffled.
    indices = torch.arange(target_len, device=batch.device).repeat(b, 1)
    is_mask = indices < x.unsqueeze(1)
    for i in range(b):
        is_mask[i] = is_mask[i][torch.randperm(target_len)]

    # Prepend an all-False prefix so prompt columns are left untouched.
    prompt_mask = torch.zeros(b, prompt_len, dtype=torch.bool, device=batch.device)
    is_mask = torch.cat((prompt_mask, is_mask), dim=1)
    noisy_batch = torch.where(is_mask, mask_id, batch)

    # Return the masked batch and the mask ratio
    return noisy_batch, (x / target_len).unsqueeze(1).repeat(1, l)
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
def get_logits(model, batch, prompt_index, cfg_scale, mask_id):
    """Run `model` on `batch`, optionally applying classifier-free guidance.

    Args:
        model: Callable whose result exposes a `.logits` attribute.
        batch: 2-D tensor of token ids.
        prompt_index: 1-D boolean tensor with one entry per column of `batch`,
            marking prompt positions.
        cfg_scale: Guidance scale; guidance is applied only when > 0.
        mask_id: Token id that replaces the prompt in the unconditional copy.

    Returns:
        The model logits, or when `cfg_scale > 0` the guided combination
        `un_logits + (cfg_scale + 1) * (logits - un_logits)`.

    Raises:
        ValueError: If guidance is requested and `prompt_index` does not have
            one entry per column of `batch`.
    """
    use_cfg = cfg_scale > 0.

    if use_cfg:
        if len(prompt_index) != batch.shape[1]:
            raise ValueError("prompt_index must have one entry per column of batch")
        prompt_index = prompt_index.unsqueeze(0).repeat(batch.shape[0], 1)
        # Unconditional copy: same rows with every prompt token masked out.
        un_batch = batch.clone()
        un_batch[prompt_index] = mask_id
        # One forward pass covers the conditional and unconditional halves.
        batch = torch.cat([batch, un_batch])

    logits = model(batch).logits

    if use_cfg:
        logits, un_logits = torch.chunk(logits, 2, dim=0)
        logits = un_logits + (cfg_scale + 1) * (logits - un_logits)
    return logits
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
@torch.no_grad()
def get_log_likelihood(model, prompt, answer, mc_num=128, batch_size=32, cfg_scale=0., mask_id=126336):
    '''
    Monte Carlo estimate of the log-likelihood of `answer` given `prompt`.

    Args:
        model: Mask predictor.
        prompt: A tensor of shape (l1).
        answer: A tensor of shape (l2).
        mc_num: Monte Carlo estimation times.
                As detailed in Appendix B.5. Since MMLU, CMMLU, and C-EVAL only require the likelihood of a single token, a
                single Monte Carlo estimate is sufficient for these benchmarks. For all other benchmarks, we find that 128
                Monte Carlo samples are adequate to produce stable results.
                Only `mc_num // batch_size` full mini-batches are evaluated.
        batch_size: Mini batch size.
        cfg_scale: Unsupervised classifier-free guidance scale.
        mask_id: The token id of [MASK] is 126336.

    Returns:
        The negated mean of the per-mini-batch losses, as a Python float.

    Raises:
        ValueError: If `batch_size` is not positive or `mc_num` is smaller
            than `batch_size` (no mini-batch would be evaluated).
    '''
    # Validate first: with zero mini-batches the final average would divide
    # by zero.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    num_batches = mc_num // batch_size
    if num_batches < 1:
        raise ValueError("mc_num must be at least batch_size")

    # Replicate the prompt+answer sequence once per mini-batch row.
    seq = torch.concatenate([prompt, answer])[None, :]
    seq = seq.repeat((batch_size, 1)).to(model.device)
    prompt_index = torch.arange(seq.shape[1], device=model.device) < len(prompt)

    losses = []
    for _ in range(num_batches):
        perturbed_seq, p_mask = forward_process(seq, prompt_index, mask_id)
        mask_index = perturbed_seq == mask_id

        logits = get_logits(model, perturbed_seq, prompt_index, cfg_scale, mask_id)

        # Cross-entropy on masked positions only, weighted by 1 / mask ratio.
        loss = F.cross_entropy(logits[mask_index], seq[mask_index], reduction='none') / p_mask[mask_index]
        loss = loss.sum() / batch_size

        losses.append(loss.item())

    return -sum(losses) / len(losses)
|
| 246 |
+
|
| 247 |
+
|
| 248 |
|
| 249 |
|
| 250 |
|
|
|
|
| 319 |
return x
|
| 320 |
|
| 321 |
def main():
    """Generate from the GPTQ checkpoint and from the base model on one prompt.

    Loads the tokenizer, runs `generate` with the quantized model and prints
    the decoded continuation, then repeats with the unquantized model so the
    two outputs can be compared.
    """
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_id, use_fast=False)

    # Calibration data and quantization settings. Neither is consumed below,
    # because the quantized checkpoint is loaded ready-made; presumably kept
    # to show how it was produced — confirm.
    traindataset = get_wikitext2(tokenizer, nsamples=128, seqlen=1024)

    quantize_config = QuantizeConfig(
        dynamic=dynamic,
        bits=4,  # quantize model to 4-bit (matches the published checkpoint name)
        group_size=128,  # it is recommended to set the value to 128,
        desc_act=True,
        sym=False,
    )

    # Resolved once; both models and the input ids are placed on this device.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    prompt = "Question: Lily can run 12 kilometers per hour for 4 hours. After that, she runs 6 kilometers per hour. How many kilometers can she run in 8 hours? The answer: "

    # Add special tokens for the Instruct model. The Base model does not require the following two lines.
    # m = [{"role": "user", "content": prompt}, ]
    # prompt = tokenizer.apply_chat_template(m, add_generation_prompt=True, tokenize=False)

    input_ids = tokenizer(prompt)['input_ids']
    input_ids = torch.tensor(input_ids).to(device).unsqueeze(0)

    steps = 128

    def run_and_report(model, label):
        # Generate, then print a banner, the prompt shape and only the newly
        # generated tokens (everything after the prompt).
        out = generate(model, input_ids, steps=steps, gen_length=128, block_length=32, temperature=0., cfg_scale=0., remasking='low_confidence')
        print("*" * 30 + f"{label} Steps {steps}" + "*" * 30)
        print(input_ids.shape)
        print(tokenizer.batch_decode(out[:, input_ids.shape[1]:], skip_special_tokens=True)[0])

    model = GPTQModel.load(quantized_model_id, device=device, trust_remote_code=True)
    run_and_report(model, "GPTQ-4bit")

    # Drop the quantized model before loading the full-size one, and hand
    # cached GPU memory back to the allocator.
    del model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # .to(device) instead of .cuda() so the CPU fallback chosen above is honored.
    model = AutoModel.from_pretrained(pretrained_model_id, trust_remote_code=True).to(device)
    run_and_report(model, "FP16")
|
| 361 |
|
| 362 |
|
| 363 |
if __name__ == "__main__":
|
|
|
|
| 371 |
|
| 372 |
main()
|
| 373 |
|
| 374 |
+
|
| 375 |
+
|
| 376 |
```
|