PolarSparsity / HybridTensor /utils /utils.py

Upload folder using huggingface_hub

b3a3b15 verified 8 months ago

12.1 kB

	import math
	import numpy as np
	import torch
	import argparse
	import os
	from functools import partial
	import pandas as pd
	import matplotlib.pyplot as plt
	from pathlib import Path
	from tqdm import tqdm # For progress bars

	def arg_parser():
	parser = argparse.ArgumentParser(description='Inference benchmarking')
	parser.add_argument('--batch_size', type=int, default=16)
	parser.add_argument('--hidden_features', type=int, default=32768)
	parser.add_argument('--in_features', type=int, default=8192)
	parser.add_argument('--model_index', type=int, default=5)
	parser.add_argument('--seq_len', type=int, default=512)
	parser.add_argument('--index_size', type=int, default=8192)
	parser.add_argument('--head_density', type=float, default=0.25)
	parser.add_argument('--attn_topk', type=float, default=0.5)
	parser.add_argument('--print_results', type=bool, default=True)
	parser.add_argument('--iterations', type=int, default=10)
	parser.add_argument('--check_results', type=bool, default=False)
	parser.add_argument('--results_dir', type=str, default='results')
	parser.add_argument('--max_batch_size', type=int, default=32)
	parser.add_argument('--max_seqlen', type=int, default=2048)
	parser.add_argument('--bias', type=int, default=0)
	parser.add_argument('--device', type=int, default=0)
	parser.add_argument('--mode', type=str, default='row', choices=['row', 'col', 'auto'])

	return parser.parse_args()

	def initialize_distributed_environment():
	# Set environment variables for NCCL
	os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "0"
	os.environ["NCCL_GRAPH_MIXING_SUPPORT"] = "0"

	# Initialize the distributed process group
	torch.distributed.init_process_group(backend="nccl", init_method="env://")

	# Set the device based on the rank of the current process
	device = f"cuda:{torch.distributed.get_rank()}"
	world_size = torch.distributed.get_world_size()

	# Set the current CUDA device to avoid operations being executed on the wrong GPU
	torch.cuda.set_device(device)

	# You can return device, world_size, and any other relevant information
	return device, world_size

	def get_gpu_name():
	if torch.cuda.is_available():
	gpu_name = torch.cuda.get_device_name(torch.cuda.current_device())
	# Clean the GPU name to make it filename-friendly
	gpu_name_clean = gpu_name.replace(" ", "_").replace("/", "_").replace("\\", "_")
	return gpu_name_clean
	else:
	return "CPU"

	def _get_device(device_id):
	if torch.cuda.is_available():
	device = torch.device(f"cuda:{device_id}")
	else:
	device = torch.device("cpu")

	return device

	def extract_model_name(model_path: str) -> str:
	return model_path.split("/")[-1]

	def create_results_directory(results_dir):
	"""
	Creates the results directory if it does not exist.

	Parameters:
	- results_dir (str or Path): The path to the results directory.

	Returns:
	- Path: The Path object representing the results directory.
	"""
	path = Path(results_dir).resolve()

	if not path.exists():
	path.mkdir(parents=True, exist_ok=True)
	print(f"Created results directory at: {path}")
	else:
	# print(f"Results directory already exists at: {path}")
	pass

	return path

	def ZeroIndex(index_vec, max_value):
	# Create a set of all integers from 0 to max_value - 1
	all_integers = set(range(max_value))

	# Convert index_vec to a set
	index_set = set(index_vec.cpu().numpy())

	# Subtract index_set from all_integers
	remaining_integers = all_integers - index_set

	# Convert the result back to a tensor
	zero_index = torch.tensor(list(remaining_integers), dtype=torch.int32, device='cuda')

	return zero_index

	def sparse_index(index_size, max_value, return_zero_index = False):
	index_vec = torch.randperm(max_value, dtype=torch.int32, device='cuda')[:index_size]
	index_vec, _ = torch.sort(index_vec)
	if return_zero_index:
	zero_index = ZeroIndex(index_vec, max_value)
	else:
	zero_index = None
	return index_vec, zero_index

	# utility function to study activations

	def create_random_batches(labels, batch_size=32):
	"""
	Shuffles the labels and splits them into random batches.

	Parameters:
	- labels (np.ndarray): The labels matrix of shape (212646, 16384).
	- batch_size (int): The number of samples per batch.

	Returns:
	- List[np.ndarray]: A list of batches, each containing `batch_size` rows.
	"""
	num_samples = labels.shape[0]

	# Generate a permutation of indices
	shuffled_indices = np.random.permutation(num_samples)

	# Shuffle the labels matrix
	shuffled_labels = labels[shuffled_indices]

	# Calculate the number of complete batches
	num_batches = num_samples // batch_size

	# Split the shuffled labels into batches
	batches = np.split(shuffled_labels[:num_batches * batch_size], num_batches)

	return batches

	def generate_BH_index(batch_size: int, heads: int, selected_heads: int, device = 'cuda'):
	'''
	Generates a random list of selected heads for each batch.

	Args:
	- batch_size (int): Number of batches.
	- heads (int): Total number of heads.
	- selected_heads (int): Number of heads to select for each batch.

	Returns:
	- bh_index (torch.Tensor): Tensor of shape (batch_size * selected_heads, 2) where each row is (batch_idx, head_idx).
	'''
	N_selected = batch_size * selected_heads
	bh_index = torch.zeros((N_selected, 2), dtype=torch.int32, device=device)

	for batch_idx in range(batch_size):
	selected_head_indices = torch.randperm(heads)[:selected_heads]
	sorted_head_indices = torch.sort(selected_head_indices).values
	for i, head_idx in enumerate(sorted_head_indices):
	bh_index[batch_idx * selected_heads + i] = torch.tensor([batch_idx, head_idx], dtype=torch.int32)

	return bh_index

	def generate_random_BH_index(batch_size: int, heads: int, selected_heads: int, device = 'cuda'):
	'''
	Generates a random list of selected heads for each batch.

	Args:
	- batch_size (int): Number of batches.
	- heads (int): Total number of heads.
	- selected_heads (int): Number of heads to select for each batch.

	Returns:
	- bh_index (torch.Tensor): Tensor of shape (batch_size, selected_heads)
	'''
	bh_index = torch.zeros((batch_size, selected_heads), dtype=torch.int32, device=device)

	for batch_idx in range(batch_size):
	selected_head_indices = torch.randperm(heads)[:selected_heads]
	# sort the selected head indices
	sorted_head_indices = torch.sort(selected_head_indices).values
	bh_index[batch_idx] = sorted_head_indices

	return bh_index

	def generate_random_BG_index(batch_size: int, groups: int, selected_groups: int, device = 'cuda'):
	'''
	Generates a random list of selected heads for each batch.

	Args:
	- batch_size (int): Number of batches.
	- heads (int): Total number of heads.
	- selected_heads (int): Number of heads to select for each batch.

	Returns:
	- bh_index (torch.Tensor): Tensor of shape (batch_size, selected_heads)
	'''
	bg_index = torch.zeros((batch_size, selected_groups), dtype=torch.int32, device=device)

	for batch_idx in range(batch_size):
	selected_group_indices = torch.randperm(groups)[:selected_groups]
	# sort the selected head indices
	sorted_group_indices = torch.sort(selected_group_indices).values
	bg_index[batch_idx] = sorted_group_indices

	return bg_index



	def activation_stats_layer(test_batches, total_neurons, device):
	"""
	Calculates the average and standard deviation of activations across batches.

	Parameters:
	- test_batches (List[np.ndarray] or List[torch.Tensor]): List of batches containing label data.
	- total_neurons (int): Total number of neurons.
	- device (torch.device): The device to perform computations on (e.g., 'cpu' or 'cuda').

	Returns:
	- avg_act (float): The average number of activations per batch.
	- std_dev (float): The standard deviation of activations across batches.
	"""
	sum_activation = 0.0 # To accumulate the total activations
	sum_activation_sq = 0.0 # To accumulate the squared activations
	num_batches = len(test_batches)

	for i, batch in enumerate(test_batches):
	# Convert batch to a PyTorch tensor if it's not already
	if not isinstance(batch, torch.Tensor):
	torch_labels = torch.tensor(batch, dtype=torch.float32, device=device)
	else:
	torch_labels = batch.to(device=device, dtype=torch.float32)

	# Binarize the labels: 1 if activation > 0, else 0
	binary_labels = (torch_labels > 0).int()

	# Sum activations per neuron across the batch
	activation_counts = binary_labels.sum(dim=0)

	# Convert counts to binary: 1 if neuron is activated in the batch, else 0
	activated_neurons = (activation_counts > 0).int()

	# Total number of activated neurons in this batch
	total_activations = activated_neurons.sum().item()

	# Accumulate sum and sum of squares
	sum_activation += total_activations
	sum_activation_sq += total_activations ** 2

	# Optional: Print progress every 1000 batches
	# if (i + 1) % 1000 == 0 or (i + 1) == num_batches:
	# print(f"Processed {i + 1}/{num_batches} batches")

	# Calculate average activation
	avg_act = sum_activation / num_batches

	# Calculate variance and standard deviation
	variance = (sum_activation_sq / num_batches) - (avg_act ** 2)
	std_dev = variance ** 0.5

	# Display results
	print(f"\nAverage activation: {avg_act:.2f} "
	f"({(avg_act / total_neurons) * 100:.2f}% of total neurons)")
	print(f"Standard deviation of activation: {std_dev:.2f}")

	return avg_act, std_dev

	def calculate_index_sizes(in_features):
	"""
	Calculate index sizes based on the given in_features.
	The sizes are rounded up to the nearest multiple of 1024
	and generated in 5% increments up to 100% of total neurons.

	Args:
	in_features (int): The number of input features.

	Returns:
	List[int]: A list of index sizes rounded up to the nearest multiple of 1024.
	"""
	index_sizes = []
	total_neurons = in_features * 4

	# Generate 20 percentages from 5% to 100% in increments of 5%
	percentages = [i for i in range(5, 105, 5)]

	# Calculate index sizes and round up to the nearest multiple of 1024
	for p in percentages:
	index_size = int((p / 100) * total_neurons)
	index_size = math.ceil(index_size / 1024) * 1024
	index_sizes.append(index_size)

	return index_sizes


	def compute_perplexity(model, dataloader, device):
	total_loss = 0.0
	total_tokens = 0

	with torch.no_grad():
	for batch in tqdm(dataloader, desc="Calculating Perplexity"):
	input_ids = batch['input_ids'].to(device)
	attention_mask = batch['attention_mask'].to(device)

	# Shift input_ids and labels for causal language modeling
	labels = input_ids.clone()
	# Replace padding tokens in labels by -100 so they are ignored in loss computation
	labels[attention_mask == 0] = -100

	outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
	loss = outputs.loss
	# Multiply loss by number of tokens in the batch
	# The loss is averaged over the number of non-masked tokens
	# To get the total loss, multiply by the number of non-masked tokens
	total_loss += loss.item() * torch.sum(labels != -100).item()
	total_tokens += torch.sum(labels != -100).item()

	# Calculate perplexity
	perplexity = math.exp(total_loss / total_tokens)
	return perplexity