import re
from pathlib import Path
from typing import Dict, Optional, Union
import torch
from datasets import load_dataset
from ..quantization import QuantAlgo


def split(v, tp_size, idx, dim=0):
    """Returns the `idx`-th of `tp_size` equal chunks of `v` along `dim`.

    1-D tensors are always chunked along dimension 0; with `tp_size == 1`
    the tensor is returned unchanged.
    """
    if tp_size == 1:
        return v
    if len(v.shape) == 1:
        return torch.chunk(v, tp_size)[idx].contiguous()
    else:
        return torch.chunk(v, tp_size, dim=dim)[idx].clone()
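
# A minimal usage sketch (shapes are illustrative): splitting a (4, 6)
# matrix across two ranks along dim=1 gives each rank a (4, 3) shard.
#
#   w = torch.arange(24.).reshape(4, 6)
#   shard0 = split(w, tp_size=2, idx=0, dim=1)  # columns 0..2
#   shard1 = split(w, tp_size=2, idx=1, dim=1)  # columns 3..5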


def split_qkv_tp(v, n_head, n_hidden, tensor_parallel, rank):
    """
    Splits the fused QKV weight matrix according to tensor parallelism.

    `v` is expected to have shape (3 * n_hidden, n_hidden). Each of the
    Q, K and V sub-matrices is split along its output dimension, so each
    rank keeps a (3 * n_hidden // tensor_parallel, n_hidden) shard.
    `n_head` is currently unused.
    """
    v = v.reshape(3, n_hidden, n_hidden)
    split_v = split(v, tensor_parallel, rank, dim=1)
    split_v = split_v.reshape(3 * (n_hidden // tensor_parallel), n_hidden)
    return split_v.clone()


def split_qkv_bias_tp(v, n_head, n_hidden, tensor_parallel, rank):
    """
    Splits the fused QKV bias according to tensor parallelism.

    `v` is expected to have shape (3 * n_hidden,). Each of the Q, K and V
    bias vectors is split, so each rank keeps a
    (3 * n_hidden // tensor_parallel,) shard. `n_head` is currently unused.
    """
    v = v.reshape(3, n_hidden)
    split_v = split(v, tensor_parallel, rank, dim=1)
    split_v = split_v.reshape(3 * (n_hidden // tensor_parallel))
    return split_v.clone()
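
# A minimal shape sketch (illustrative values): with n_hidden=8 and
# tensor_parallel=2, a fused QKV weight of shape (24, 8) is split into
# per-rank shards of shape (12, 8), and a fused QKV bias of shape (24,)
# into shards of shape (12,), so each rank keeps its slice of Q, K and V.
#
#   qkv_w = torch.randn(24, 8)
#   qkv_b = torch.randn(24)
#   w0 = split_qkv_tp(qkv_w, n_head=2, n_hidden=8, tensor_parallel=2, rank=0)
#   b0 = split_qkv_bias_tp(qkv_b, n_head=2, n_hidden=8, tensor_parallel=2, rank=0)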


def split_matrix_tp(v, tensor_parallel, rank, dim):
    """Splits a weight matrix along `dim` according to tensor parallelism."""
    return split(v, tensor_parallel, rank, dim=dim)


def weight_only_quantize(weight: torch.Tensor,
                         quant_algo: str,
                         plugin: bool = True):
    """Symmetrically quantizes a weight tensor to INT4 (W4A16) or INT8 (W8A16).

    Returns a (quantized weight, per-channel scale) pair. With
    `plugin=False` (only valid for W8A16), the transposed full-precision
    weight is returned together with the scales instead of the packed
    weights.
    """
    assert quant_algo in [QuantAlgo.W4A16, QuantAlgo.W8A16
                          ], f'unsupported quant algo: {quant_algo}'
    if quant_algo == QuantAlgo.W4A16:
        assert plugin, 'W4A16 is only supported with plugin'
    # The custom op quantizes along the last axis, so transpose to put the
    # output-channel axis last.
    if weight.dim() > 2:
        v = weight.transpose(-1, -2)
    else:
        v = weight.t()
    t = torch.quint4x2 if quant_algo == QuantAlgo.W4A16 else torch.int8
    processed_torch_weights, torch_weight_scales = \
        torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix(
            v.contiguous(), t)
    if plugin:
        return processed_torch_weights, torch_weight_scales
    else:
        return v, torch_weight_scales
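
# A minimal usage sketch (assumes the TensorRT-LLM custom op is registered;
# shapes are illustrative):
#
#   w = torch.randn(4096, 4096, dtype=torch.float16)
#   qw, scales = weight_only_quantize(w, QuantAlgo.W8A16)
#   # qw holds the packed INT8 weights, scales one value per output channel.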


def weight_only_quantize_dict(weights: Dict[str, torch.Tensor],
                              quant_algo: str,
                              quant_weights=[
                                  'qkv.weight', 'dense.weight', 'fc.weight',
                                  'proj.weight', 'gate.weight'
                              ],
                              exclude_weights=['shared_expert_gate.weight'],
                              plugin: bool = True):
    """Applies weight-only quantization to matching entries of a state dict.

    Every weight whose name contains one of `quant_weights` (and none of
    `exclude_weights`) is quantized in place; its per-channel scales are
    stored under the same name with `.weight` replaced by
    `.per_channel_scale`.
    """
    if quant_algo not in [QuantAlgo.W4A16, QuantAlgo.W8A16]:
        return weights
    for name in list(weights):
        if any([_name in name for _name in exclude_weights]):
            continue
        if any([_name in name for _name in quant_weights
                ]) and weights[name].dtype != torch.int8:
            quant_weight, quant_scale = weight_only_quantize(
                weight=weights[name], quant_algo=quant_algo, plugin=plugin)
            weights[name] = quant_weight
            weights[name.replace('.weight', '.per_channel_scale')] = quant_scale
    return weights
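
# For example (illustrative key name), 'transformer.layers.0.attention.qkv.weight'
# would be replaced by its quantized tensor, and a new entry
# 'transformer.layers.0.attention.qkv.per_channel_scale' would be added.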


def load_state_dict(
    file_path: Union[str, Path],
    dtype: Optional[torch.dtype] = None,
    device: Optional[Union[str, torch.device]] = None,
) -> Dict[str, torch.Tensor]:
    """ Load weights from a model file.

    `safetensors` and PyTorch binary formats are supported.

    Args:
        file_path: model file path, ends with .bin or .safetensors.
        dtype: torch.dtype, data type.
        device: torch device like, optional. If None, load to cpu.

    Returns:
        Weights as a state dict.
    """
    file_path = Path(file_path)
    if dtype is not None:
        assert isinstance(dtype, torch.dtype)
    if device is None:
        device = 'cpu'

    model_params = {}
    if file_path.suffix == '.safetensors':
        # Load from a safetensors file.
        from safetensors import safe_open
        with safe_open(file_path, framework='pt', device=device) as f:
            for name in f.keys():
                tensor = f.get_tensor(name)
                if dtype is not None:
                    tensor = tensor.to(dtype)
                model_params[name] = tensor
    elif file_path.suffix == '.bin':
        # Load from a PyTorch bin file.
        state_dict = torch.load(file_path, map_location=device)
        for name in state_dict:
            tensor = state_dict[name]
            if dtype is not None:
                tensor = tensor.to(dtype)
            model_params[name] = tensor
    else:
        raise NotImplementedError(
            f'Expected a .safetensors or .bin file, but got {str(file_path)}')
    return model_params
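
# A minimal usage sketch (the file path is hypothetical):
#
#   weights = load_state_dict('model.safetensors', dtype=torch.float16)
#   print(list(weights)[:5])  # first few parameter names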


def get_model_path(
    model_dir: Union[str, Path],
    name: Optional[str] = None,
) -> Optional[str]:
    """ Get the model file path from a model directory.

    `safetensors` and PyTorch binary formats are supported.

    Args:
        model_dir: model directory.
        name: model file name without suffix.

    Returns:
        Full model path, or None if no model file is found.
    """
    model_dir = Path(model_dir)
    if name is not None:
        if (model_dir / f"{name}.safetensors").exists():
            return str(model_dir / f"{name}.safetensors")
        elif (model_dir / f"{name}.bin").exists():
            return str(model_dir / f"{name}.bin")
        else:
            return None
    else:
        model_files = list(model_dir.glob('*.safetensors'))
        if len(model_files) > 0:
            assert len(
                model_files
            ) == 1, f"found multiple safetensors files in {model_dir}, please specify one"
            return str(model_files[0])

        model_files = list(model_dir.glob('*.bin'))
        if len(model_files) > 0:
            assert len(
                model_files
            ) == 1, f"found multiple bin files in {model_dir}, please specify one"
            return str(model_files[0])

        return None
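
# Usage sketch (hypothetical directory layout): given 'ckpt/model.safetensors',
# both get_model_path('ckpt') and get_model_path('ckpt', name='model') return
# 'ckpt/model.safetensors'; a directory with no model files returns None.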


def retrieved_layer_index_from_name(name: str) -> Optional[int]:
    """Retrieves the layer index from a HF parameter name.

    This is a heuristic: it returns the first integer found in the name.
    Most HF models follow a similar naming convention, but check carefully
    that this works on your target model before relying on it.
    """
    res = re.search(r'\d+', name)
    return int(res.group()) if res is not None else res
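
# For example (illustrative name), 'transformer.layers.15.attention.dense.weight'
# yields 15, while 'lm_head.weight' yields None.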


def iterate_shard_files(model_dir: Union[Path, str],
                        rank: int,
                        progress_bar: bool = True):
    model_dir = Path(model_dir)

    # Shards are stored as '.safetensors' or '.bin' files. If both exist,
    # the '.safetensors' files are preferred.
    shard_files = list(model_dir.glob('*.safetensors'))
    if not shard_files:
        # The model checkpoint is stored in .bin files.
        shard_files = list(model_dir.glob('*.bin'))
    if not shard_files:
        raise RuntimeError(
            f"Could not find any .safetensors or .bin files in {model_dir}")

    try:
        import tqdm
        if progress_bar:
            # Show a progress bar per rank.
            desc = f'Rank [{rank}] Loading weights'
            shard_files = tqdm.tqdm(shard_files, desc=desc, position=rank)
    except ImportError:
        pass

    for shard_file in shard_files:
        yield shard_file
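
# A minimal usage sketch combining the helpers above (the path is
# hypothetical; rank 0):
#
#   for shard_file in iterate_shard_files('/path/to/model', rank=0):
#       weights = load_state_dict(shard_file, dtype=torch.float16)
#       # ... convert and shard the tensors ...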


def has_safetensors(model_dir: str):
    return len(list(Path(model_dir).glob('*.safetensors'))) > 0


# Maps a dataset name to its default (config name, split, text column).
DEFAULT_HF_DATASET_META = {
    'ccdv/cnn_dailymail': ('3.0.0', 'train', 'article'),
    'cnn_dailymail': ('3.0.0', 'train', 'article'),
    'lambada': (None, 'validation', 'text'),
}


def load_calib_dataset(dataset_name_or_dir: str,
                       config_name: Optional[str] = None,
                       split: Optional[str] = None,
                       key: Optional[str] = None,
                       trust_remote_code=True,
                       **kwargs):
    """Loads a calibration dataset and returns its text column.

    For known datasets, missing `config_name`, `split` and `key` values
    are filled in from DEFAULT_HF_DATASET_META.
    """
    if config_name is None:
        for name, meta in DEFAULT_HF_DATASET_META.items():
            if name in dataset_name_or_dir:
                config_name = meta[0]
                if split is None:
                    split = meta[1]
                if key is None:
                    key = meta[2]
                break

    dataset = load_dataset(dataset_name_or_dir,
                           name=config_name,
                           split=split,
                           trust_remote_code=trust_remote_code,
                           **kwargs)
    return dataset[key]
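
# A minimal usage sketch (downloads from the HF Hub on first use):
#
#   texts = load_calib_dataset('ccdv/cnn_dailymail')
#   # -> the 'article' column of the 'train' split of config '3.0.0'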