# Utility helpers for GPU memory reporting, weight-sparsity checks, and
# calibration-input capture for layer-dropped causal LMs.
import torch
from torch import nn as nn, cuda
import os
def print_gpu_memory(accelerator):
    """
    Print the allocated CUDA memory of every visible GPU, in MB.

    Only the local main process prints, so multi-process (accelerate) runs
    report each GPU once instead of once per process.

    Args:
        accelerator: HF Accelerate ``Accelerator``-like object; only its
            ``is_local_main_process`` attribute is read.
    """
    if accelerator.is_local_main_process:
        for i in range(cuda.device_count()):
            # BUG FIX: the original queried device 0 on every iteration
            # (cuda.memory_allocated(0)); query device ``i`` instead.
            used_memory = cuda.memory_allocated(i) // 1024 ** 2
            print(f"GPU {i} Used Memory: {used_memory}MB")
def print_gpu_memory_device():
    """Print the allocated CUDA memory (in MB) of the current CUDA device."""
    current = cuda.current_device()
    allocated_mb = cuda.memory_allocated(current) // 1024 ** 2
    print(f"GPU {current} Used Memory: {allocated_mb}MB")
def find_modules(module, layers=None, name='') -> dict:
    """
    Recursively find the layers of a certain type in a module.

    Args:
        module (nn.Module): PyTorch module to search.
        layers (list | None): List of layer types to find. ``None`` behaves
            like the old ``[]`` default (matches nothing) while avoiding the
            mutable-default-argument pitfall.
        name (str): Dotted-path name accumulated during recursion.

    Returns:
        dict: Mapping of dotted module names to matching submodules.
    """
    if layers is None:
        layers = []
    # Exact type match (not isinstance), preserving the original semantics:
    # subclasses of a listed type are NOT matched.
    if type(module) in layers:
        return {name: module}
    res = {}
    for child_name, child in module.named_children():
        child_path = f"{name}.{child_name}" if name else child_name
        res.update(find_modules(child, layers=layers, name=child_path))
    return res
def find_linears(module) -> dict:
    """Return every nn.Linear submodule of *module*, keyed by dotted name."""
    # Original note said "find only the expert weights"; in practice this
    # matches every nn.Linear in the module tree.
    return find_modules(module, [nn.Linear])
@torch.no_grad()
def check_sparsity(model):
    """
    Report the fraction of exactly-zero weights in each decoder layer and
    return the overall sparsity across all layers.

    Args:
        model: Causal LM exposing ``model.config.use_cache`` and
            ``model.model.layers``; each layer's nn.Linear weights are scanned.

    Returns:
        float: Overall fraction of zero-valued weight entries (0.0 if no
        Linear weights are found).
    """
    use_cache = model.config.use_cache
    model.config.use_cache = False

    layers = model.model.layers
    zero_count = 0
    total_params = 0
    for i, layer in enumerate(layers):
        # BUG FIX: the original called find_modules(layer) with the default
        # empty type list, which always returned {} and made the per-layer
        # division below raise ZeroDivisionError. Scan the Linear weight
        # matrices explicitly instead.
        subset = {n: m for n, m in layer.named_modules() if isinstance(m, nn.Linear)}
        sub_count = 0
        sub_params = 0
        for name, mod in subset.items():
            W = mod.weight.data
            zeros = (W == 0).sum().item()
            numel = W.numel()
            zero_count += zeros
            total_params += numel
            sub_count += zeros
            sub_params += numel
        if sub_params:  # skip layers with no Linear weights
            print(f"layer {i} sparsity {float(sub_count) / sub_params:.6f}")

    model.config.use_cache = use_cache
    return float(zero_count) / total_params if total_params else 0.0
@torch.no_grad()
def check_sparsity_from_state_dict(state_dict):
    """
    Calculate weight sparsity (fraction of exactly-zero entries) directly
    from a state_dict, printing a per-layer breakdown.

    Only keys containing "layers" are considered; the layer index is taken
    from the third dot-separated component, i.e. keys shaped like
    ``model.layers.<idx>.<...>`` (TODO confirm against the checkpoints used).

    Args:
        state_dict (dict): Parameter name -> tensor mapping.

    Returns:
        float: Overall fraction of zero entries across all layer tensors
        (0.0 if no layer keys are present).
    """
    # Group parameter names by decoder-layer index.
    layer_params = {}
    for name in sorted(state_dict):
        if "layers" in name:
            layer_id = int(name.split(".")[2])
            layer_params.setdefault(layer_id, []).append(name)

    count = 0
    total_params = 0
    # Iterate the layer ids actually present: the original used
    # range(max(ids) + 1), which raised KeyError for non-contiguous ids
    # (e.g. after layers have been dropped from the model).
    for i in sorted(layer_params):
        sub_count = 0
        sub_params = 0
        for name in layer_params[i]:
            tensor = state_dict[name]
            zeros = (tensor == 0).sum().item()
            count += zeros
            total_params += tensor.numel()
            sub_count += zeros
            sub_params += tensor.numel()
        print(f"layer {i} sparsity {float(sub_count) / sub_params:.6f}")
    return float(count) / total_params if total_params else 0.0
@torch.no_grad()
def prepare_calibration_input(model, dataloader, num_samples=16):
    """
    Capture the inputs flowing into the first decoder layer for up to
    ``num_samples`` calibration batches.

    Layer 0 is temporarily wrapped in a ``Catcher`` module that records its
    positional input plus the attention/position kwargs, then aborts the
    forward pass with ``ValueError`` so nothing beyond layer 0 executes.

    Args:
        model: Causal LM exposing ``model.model.layers``.
        dataloader: Iterable of keyword-argument batches (batch_size must be 1).
        num_samples (int): Maximum number of batches to capture per device.

    Returns:
        tuple: ``(inputs, outputs, attention_masks, position_ids,
        cache_positions)`` where ``outputs`` is a same-length list of ``None``
        placeholders.
    """
    layers = model.model.layers
    # NOTE: the original dict literal listed "position_ids" twice; the
    # redundant duplicate key has been removed.
    cache = {'inputs': [], 'attention_mask': [], 'position_ids': [], 'cache_position': []}

    class Catcher(nn.Module):
        """Records layer-0 inputs, then aborts the forward pass."""

        def __init__(self, module):
            super().__init__()
            self.module = module
            self.self_attn = None

        def forward(self, input, **kwargs):
            cache['inputs'].append(input)
            cache['attention_mask'].append(kwargs['attention_mask'])
            cache['position_ids'].append(kwargs['position_ids'])
            # cache_position is only supplied by newer transformers versions.
            cache['cache_position'].append(kwargs.get('cache_position'))
            raise ValueError  # deliberate: stop the forward pass after layer 0

    layers[0] = Catcher(layers[0])
    try:
        for index, batch in enumerate(dataloader):
            if index >= num_samples:  # limit samples per device; batch_size must be 1
                break
            try:
                model(**batch)
            except ValueError:
                pass  # expected: raised by Catcher.forward above
    finally:
        # Always unwrap layer 0, even if the model raised something else.
        layers[0] = layers[0].module

    outputs = [None] * len(cache['inputs'])
    return cache['inputs'], outputs, cache['attention_mask'], cache['position_ids'], cache['cache_position']
# Maps each supported model family to HuggingFace ``auto_map`` entries
# ("module_file.ClassName" strings) pointing at the custom "dropped"
# config/model implementations shipped alongside this file.
auto_map = {
    "llama": {
        "AutoConfig": "configuration_dropped_llama.LlamaConfig",
        "AutoModelForCausalLM": "modeling_dropped_llama.LlamaForCausalLM"
    },
    "mistral": {
        "AutoConfig": "configuration_dropped_mistral.MistralConfig",
        "AutoModelForCausalLM": "modeling_dropped_mistral.MistralForCausalLM"
    },
    "deepseek":
        {
            "AutoConfig": "configuration_deepseek.DeepseekConfig",
            "AutoModelForCausalLM": "modeling_dropped_deepseek.DeepseekForCausalLM"
        },
    "gemma2":
        {
            "AutoConfig": "configuration_dropped_gemma2.Gemma2Config",
            "AutoModelForCausalLM": "modeling_dropped_gemma2.Gemma2ForCausalLM"
        },
    "baichuan":
        {
            "AutoConfig": "configuration_dropped_baichuan.BaichuanConfig",
            "AutoModelForCausalLM": "modeling_dropped_baichuan.BaichuanForCausalLM"
        }
}
# Directory holding the custom model implementation files; computed once
# instead of repeating os.path.dirname(__file__) for every entry.
_MODELS_DIR = os.path.join(os.path.dirname(__file__), "models")

# Maps each supported model family to the absolute paths of its custom
# "dropped" configuration and modeling source files.
CUSTOM_FILE = {
    "llama": {
        "config": os.path.join(_MODELS_DIR, "configuration_dropped_llama.py"),
        "model": os.path.join(_MODELS_DIR, "modeling_dropped_llama.py")
    },
    "mistral": {
        "config": os.path.join(_MODELS_DIR, "configuration_dropped_mistral.py"),
        "model": os.path.join(_MODELS_DIR, "modeling_dropped_mistral.py")
    },
    "deepseek": {
        "config": os.path.join(_MODELS_DIR, "configuration_deepseek.py"),
        "model": os.path.join(_MODELS_DIR, "modeling_dropped_deepseek.py")
    },
    "gemma2": {
        "config": os.path.join(_MODELS_DIR, "configuration_dropped_gemma2.py"),
        "model": os.path.join(_MODELS_DIR, "modeling_dropped_gemma2.py")
    },
    "baichuan": {
        "config": os.path.join(_MODELS_DIR, "configuration_dropped_baichuan.py"),
        "model": os.path.join(_MODELS_DIR, "modeling_dropped_baichuan.py")
    }
}