# NOTE: removed scraper artifact (file-size / commit-hash / line-number dump)
# that is not valid Python and would break the module at import time.
import math
import numpy as np
import torch
import argparse
import os
from functools import partial
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm import tqdm # For progress bars
def arg_parser():
    """Parse command-line arguments for inference benchmarking.

    Returns:
        argparse.Namespace: The parsed arguments.
    """
    def str2bool(value):
        # argparse's `type=bool` is a trap: bool("False") == True because any
        # non-empty string is truthy. Parse common textual booleans instead.
        if isinstance(value, bool):
            return value
        lowered = value.lower()
        if lowered in ("yes", "true", "t", "y", "1"):
            return True
        if lowered in ("no", "false", "f", "n", "0"):
            return False
        raise argparse.ArgumentTypeError(f"Boolean value expected, got {value!r}")

    parser = argparse.ArgumentParser(description='Inference benchmarking')
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--hidden_features', type=int, default=32768)
    parser.add_argument('--in_features', type=int, default=8192)
    parser.add_argument('--model_index', type=int, default=5)
    parser.add_argument('--seq_len', type=int, default=512)
    parser.add_argument('--index_size', type=int, default=8192)
    parser.add_argument('--head_density', type=float, default=0.25)
    parser.add_argument('--attn_topk', type=float, default=0.5)
    parser.add_argument('--print_results', type=str2bool, default=True)
    parser.add_argument('--iterations', type=int, default=10)
    parser.add_argument('--check_results', type=str2bool, default=False)
    parser.add_argument('--results_dir', type=str, default='results')
    parser.add_argument('--max_batch_size', type=int, default=32)
    parser.add_argument('--max_seqlen', type=int, default=2048)
    parser.add_argument('--bias', type=int, default=0)
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--mode', type=str, default='row', choices=['row', 'col', 'auto'])
    return parser.parse_args()
def initialize_distributed_environment():
    """Join the NCCL process group and pin this process to its rank's GPU.

    Returns:
        tuple: (device string such as "cuda:0", world size).
    """
    # Configure NCCL behavior before the process group spins up.
    os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "0"
    os.environ["NCCL_GRAPH_MIXING_SUPPORT"] = "0"

    torch.distributed.init_process_group(backend="nccl", init_method="env://")

    rank = torch.distributed.get_rank()
    world_size = torch.distributed.get_world_size()
    device = f"cuda:{rank}"
    # Pin the default CUDA device so kernels run on this rank's own GPU.
    torch.cuda.set_device(device)
    return device, world_size
def get_gpu_name():
    """Return a filename-friendly name for the current GPU, or "CPU"."""
    if not torch.cuda.is_available():
        return "CPU"
    raw_name = torch.cuda.get_device_name(torch.cuda.current_device())
    # Replace spaces and path separators so the name is safe in file names.
    return raw_name.translate(str.maketrans({" ": "_", "/": "_", "\\": "_"}))
def _get_device(device_id):
if torch.cuda.is_available():
device = torch.device(f"cuda:{device_id}")
else:
device = torch.device("cpu")
return device
def extract_model_name(model_path: str) -> str:
    """Return the final '/'-separated component of *model_path*."""
    *_, name = model_path.split("/")
    return name
def create_results_directory(results_dir):
    """
    Creates the results directory if it does not exist.

    Parameters:
    - results_dir (str or Path): The path to the results directory.

    Returns:
    - Path: The Path object representing the results directory.
    """
    path = Path(results_dir).resolve()
    already_there = path.exists()
    # exist_ok=True makes this safe even if another process creates the
    # directory between the exists() check and mkdir() (TOCTOU race); the
    # exists() result is kept only to decide whether to announce creation.
    path.mkdir(parents=True, exist_ok=True)
    if not already_there:
        print(f"Created results directory at: {path}")
    return path
def ZeroIndex(index_vec, max_value):
    """Return the indices in [0, max_value) that are NOT present in index_vec.

    Parameters:
    - index_vec (torch.Tensor): 1-D integer tensor of selected indices.
    - max_value (int): Exclusive upper bound of the index range.

    Returns:
    - torch.Tensor: int32 tensor of the complementary indices in ascending
      order, on the same device as index_vec. (Previously the result was
      hardcoded to CUDA; following the input's device keeps CUDA callers
      unchanged while also working on CPU tensors.)
    """
    # Mask-based complement: avoids the GPU->CPU round-trip and O(max_value)
    # Python set arithmetic of the original implementation.
    present = torch.zeros(max_value, dtype=torch.bool, device=index_vec.device)
    present[index_vec.long()] = True
    zero_index = torch.nonzero(~present, as_tuple=True)[0].to(torch.int32)
    return zero_index
def sparse_index(index_size, max_value, return_zero_index=False, device='cuda'):
    """Draw index_size distinct indices from [0, max_value), sorted ascending.

    Parameters:
    - index_size (int): Number of indices to sample (must be <= max_value).
    - max_value (int): Exclusive upper bound of the index range.
    - return_zero_index (bool): When True, also compute the complement.
    - device (str or torch.device): Where to allocate the indices.
      Defaults to 'cuda', matching the previously hardcoded behavior.

    Returns:
    - tuple: (index_vec, zero_index) — sorted int32 index tensor, and either
      the complementary indices (via ZeroIndex) or None.
    """
    index_vec = torch.randperm(max_value, dtype=torch.int32, device=device)[:index_size]
    index_vec, _ = torch.sort(index_vec)
    zero_index = ZeroIndex(index_vec, max_value) if return_zero_index else None
    return index_vec, zero_index
# utility function to study activations
def create_random_batches(labels, batch_size=32):
    """
    Shuffles the labels and splits them into random batches.

    Parameters:
    - labels (np.ndarray): 2-D labels matrix (rows are samples).
    - batch_size (int): The number of samples per batch.

    Returns:
    - List[np.ndarray]: Batches of `batch_size` rows each; any leftover rows
      that do not fill a complete batch are dropped.
    """
    total = labels.shape[0]
    num_batches = total // batch_size
    # Shuffle the rows, then keep only as many as fill complete batches.
    order = np.random.permutation(total)
    shuffled = labels[order][:num_batches * batch_size]
    return np.split(shuffled, num_batches)
def generate_BH_index(batch_size: int, heads: int, selected_heads: int, device = 'cuda'):
    '''
    Generates a random list of selected heads for each batch.

    Args:
    - batch_size (int): Number of batches.
    - heads (int): Total number of heads.
    - selected_heads (int): Number of heads to select for each batch.
    - device: Device for the returned tensor (default 'cuda').

    Returns:
    - bh_index (torch.Tensor): int32 tensor of shape
      (batch_size * selected_heads, 2) where each row is (batch_idx, head_idx);
      head indices are sorted ascending within each batch.
    '''
    # Sample and sort each batch's heads, then assemble all (batch, head)
    # pairs in bulk ops instead of one tiny tensor write per row (the old
    # nested loop allocated a fresh 2-element tensor for every row).
    head_col = torch.stack([
        torch.sort(torch.randperm(heads)[:selected_heads]).values
        for _ in range(batch_size)
    ]).reshape(-1)
    batch_col = torch.arange(batch_size).repeat_interleave(selected_heads)
    bh_index = torch.stack([batch_col, head_col], dim=1).to(dtype=torch.int32, device=device)
    return bh_index
def generate_random_BH_index(batch_size: int, heads: int, selected_heads: int, device = 'cuda'):
    '''
    Generates a random list of selected heads for each batch.

    Args:
    - batch_size (int): Number of batches.
    - heads (int): Total number of heads.
    - selected_heads (int): Number of heads to select for each batch.

    Returns:
    - bh_index (torch.Tensor): int32 tensor of shape (batch_size, selected_heads),
      each row sorted ascending.
    '''
    rows = []
    for _ in range(batch_size):
        picked = torch.randperm(heads)[:selected_heads]
        # Keep each batch's selection in ascending order.
        rows.append(torch.sort(picked).values)
    return torch.stack(rows).to(dtype=torch.int32, device=device)
def generate_random_BG_index(batch_size: int, groups: int, selected_groups: int, device = 'cuda'):
    '''
    Picks a random sorted subset of group indices for every batch element.

    Args:
    - batch_size (int): Number of batches.
    - groups (int): Total number of groups.
    - selected_groups (int): Number of groups to select for each batch.

    Returns:
    - bg_index (torch.Tensor): int32 tensor of shape (batch_size, selected_groups),
      each row sorted ascending.
    '''
    chosen = [torch.randperm(groups)[:selected_groups].sort().values
              for _ in range(batch_size)]
    return torch.stack(chosen).to(dtype=torch.int32, device=device)
def activation_stats_layer(test_batches, total_neurons, device):
    """
    Computes the mean and standard deviation of the per-batch count of
    neurons that fired at least once.

    Parameters:
    - test_batches (List[np.ndarray] or List[torch.Tensor]): Batches of label data.
    - total_neurons (int): Total neuron count (used for the percentage printout).
    - device (torch.device): Device to perform computations on.

    Returns:
    - avg_act (float): Average number of activated neurons per batch.
    - std_dev (float): Population standard deviation of that count across batches.
    """
    n_batches = len(test_batches)
    per_batch_counts = []
    for batch in test_batches:
        # Move to the target device as float32, converting from numpy if needed.
        if isinstance(batch, torch.Tensor):
            data = batch.to(device=device, dtype=torch.float32)
        else:
            data = torch.tensor(batch, dtype=torch.float32, device=device)
        # A neuron counts as "activated" if any sample in the batch drives it > 0.
        fired = ((data > 0).int().sum(dim=0) > 0).int()
        per_batch_counts.append(fired.sum().item())
    avg_act = sum(per_batch_counts) / n_batches
    # Population variance: E[x^2] - (E[x])^2.
    variance = sum(c ** 2 for c in per_batch_counts) / n_batches - avg_act ** 2
    std_dev = variance ** 0.5
    print(f"\nAverage activation: {avg_act:.2f} "
          f"({(avg_act / total_neurons) * 100:.2f}% of total neurons)")
    print(f"Standard deviation of activation: {std_dev:.2f}")
    return avg_act, std_dev
def calculate_index_sizes(in_features):
    """
    Calculate index sizes based on the given in_features.

    Sizes run from 5% to 100% of the total neuron count (4 * in_features)
    in 5% steps, each rounded up to the nearest multiple of 1024.

    Args:
        in_features (int): The number of input features.

    Returns:
        List[int]: 20 index sizes, each a multiple of 1024.
    """
    total_neurons = in_features * 4

    def _round_up_1024(n):
        # Round up to the next multiple of 1024.
        return math.ceil(n / 1024) * 1024

    return [_round_up_1024(int((p / 100) * total_neurons))
            for p in range(5, 105, 5)]
def compute_perplexity(model, dataloader, device):
    """
    Computes corpus-level perplexity of a causal language model.

    Parameters:
    - model: model whose forward(input_ids=..., attention_mask=..., labels=...)
      returns an object with a .loss averaged over non-masked tokens
      (HF-style). NOTE(review): caller is expected to have put the model in
      eval mode — this function does not call model.eval().
    - dataloader: yields dicts with 'input_ids' and 'attention_mask' tensors.
    - device: device to move each batch to.

    Returns:
    - float: exp(total_loss / total_tokens) over all non-padding tokens.

    Raises:
    - ValueError: if the dataloader yields no countable (non-padding) tokens.
    """
    total_loss = 0.0
    total_tokens = 0
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Calculating Perplexity"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Causal LM: the model shifts labels internally, so labels == inputs,
            # with padding positions set to -100 so the loss ignores them.
            labels = input_ids.clone()
            labels[attention_mask == 0] = -100
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            # loss is averaged over the non-masked tokens; un-average it so
            # batches of different sizes are weighted correctly. Count the
            # tokens once (the original reduced the same mask twice per batch).
            n_tokens = int((labels != -100).sum().item())
            total_loss += outputs.loss.item() * n_tokens
            total_tokens += n_tokens
    if total_tokens == 0:
        raise ValueError("dataloader produced no non-padding tokens")
    return math.exp(total_loss / total_tokens)