File size: 12,051 Bytes
b3a3b15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
import math
import numpy as np
import torch
import argparse
import os
from functools import partial
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm import tqdm  # For progress bars

def arg_parser():
    """
    Parse command-line arguments for the inference benchmark.

    Returns:
    - argparse.Namespace: Parsed arguments with benchmark configuration.
    """
    def _str_to_bool(value):
        # argparse's `type=bool` is broken: bool("False") is True, so any
        # non-empty string enables the flag. Parse the string explicitly.
        if isinstance(value, bool):
            return value
        lowered = value.lower()
        if lowered in ("true", "1", "yes", "y"):
            return True
        if lowered in ("false", "0", "no", "n"):
            return False
        raise argparse.ArgumentTypeError(f"Expected a boolean value, got: {value!r}")

    parser = argparse.ArgumentParser(description='Inference benchmarking')
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--hidden_features', type=int, default=32768)
    parser.add_argument('--in_features', type=int, default=8192)
    parser.add_argument('--model_index', type=int, default=5)
    parser.add_argument('--seq_len', type=int, default=512)
    parser.add_argument('--index_size', type=int, default=8192)
    parser.add_argument('--head_density', type=float, default=0.25)
    parser.add_argument('--attn_topk', type=float, default=0.5)
    parser.add_argument('--print_results', type=_str_to_bool, default=True)
    parser.add_argument('--iterations', type=int, default=10)
    parser.add_argument('--check_results', type=_str_to_bool, default=False)
    parser.add_argument('--results_dir', type=str, default='results')
    parser.add_argument('--max_batch_size', type=int, default=32)
    parser.add_argument('--max_seqlen', type=int, default=2048)
    parser.add_argument('--bias', type=int, default=0)
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--mode', type=str, default='row', choices=['row', 'col', 'auto'])

    return parser.parse_args()

def initialize_distributed_environment():
    """
    Initialize the NCCL process group and bind this process to its GPU.

    Returns:
    - tuple[str, int]: The device string ("cuda:<rank>") for this process
      and the total world size of the process group.
    """
    # NCCL tuning knobs must be set before the process group is created.
    os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "0"
    os.environ["NCCL_GRAPH_MIXING_SUPPORT"] = "0"

    # Rendezvous configuration (MASTER_ADDR, etc.) is taken from the environment.
    torch.distributed.init_process_group(backend="nccl", init_method="env://")

    # One GPU per rank: pin this process to its own device so collective ops
    # and allocations don't land on the wrong GPU.
    rank = torch.distributed.get_rank()
    device = f"cuda:{rank}"
    torch.cuda.set_device(device)

    return device, torch.distributed.get_world_size()

def get_gpu_name():
    """
    Return a filename-friendly name for the current GPU, or "CPU" if no
    CUDA device is available.

    Spaces and path separators in the device name are replaced with
    underscores so the result can be embedded in result filenames.
    """
    if not torch.cuda.is_available():
        return "CPU"

    raw_name = torch.cuda.get_device_name(torch.cuda.current_device())
    sanitized = raw_name
    for forbidden in (" ", "/", "\\"):
        sanitized = sanitized.replace(forbidden, "_")
    return sanitized
    
def _get_device(device_id):
    if torch.cuda.is_available():
        device = torch.device(f"cuda:{device_id}")
    else:
        device = torch.device("cpu")
    
    return device

def extract_model_name(model_path: str) -> str:
    """Return the final slash-separated component of *model_path* (e.g. "org/model" -> "model")."""
    return model_path.rsplit("/", 1)[-1]

def create_results_directory(results_dir):
    """
    Ensure the results directory exists, creating it (with parents) if needed.

    Parameters:
    - results_dir (str or Path): The path to the results directory.

    Returns:
    - Path: The resolved Path object for the results directory.
    """
    target = Path(results_dir).resolve()

    # Only announce creation; stay quiet when the directory already exists.
    if not target.exists():
        target.mkdir(parents=True, exist_ok=True)
        print(f"Created results directory at: {target}")

    return target

def ZeroIndex(index_vec, max_value, device='cuda'):
    """
    Return the complement of `index_vec` within [0, max_value).

    Parameters:
    - index_vec (torch.Tensor): 1-D tensor of selected indices.
    - max_value (int): Exclusive upper bound of the index range.
    - device (str or torch.device): Device for the returned tensor
      (default 'cuda', matching the previous hard-coded behavior).

    Returns:
    - torch.Tensor: 1-D int32 tensor of indices in [0, max_value) that are
      NOT present in `index_vec`, in ascending order.
    """
    # Vectorized complement: mark selected positions False in a boolean mask,
    # then collect the surviving positions. Avoids the CPU round-trip through
    # Python sets and gives a deterministic (sorted) result.
    mask = torch.ones(max_value, dtype=torch.bool, device=index_vec.device)
    mask[index_vec.long()] = False
    zero_index = torch.nonzero(mask, as_tuple=False).flatten()
    return zero_index.to(dtype=torch.int32, device=device)

def sparse_index(index_size, max_value, return_zero_index=False, device='cuda'):
    """
    Draw a sorted random subset of `index_size` indices from [0, max_value).

    Parameters:
    - index_size (int): Number of indices to select.
    - max_value (int): Exclusive upper bound of the index range.
    - return_zero_index (bool): If True, also return the complement indices.
    - device (str or torch.device): Device for the returned tensors
      (default 'cuda', matching the previous hard-coded behavior).

    Returns:
    - tuple[torch.Tensor, torch.Tensor | None]: The sorted int32 index vector
      and, when requested, the sorted int32 complement (else None).
    """
    index_vec = torch.randperm(max_value, dtype=torch.int32, device=device)[:index_size]
    index_vec, _ = torch.sort(index_vec)
    if return_zero_index:
        # Complement via boolean mask: every position not selected above.
        mask = torch.ones(max_value, dtype=torch.bool, device=index_vec.device)
        mask[index_vec.long()] = False
        zero_index = torch.nonzero(mask, as_tuple=False).flatten().to(torch.int32)
    else:
        zero_index = None
    return index_vec, zero_index

# utility function to study activations

def create_random_batches(labels, batch_size=32):
    """
    Shuffles the labels and splits them into random batches.

    Parameters:
    - labels (np.ndarray): The labels matrix (rows are samples).
    - batch_size (int): The number of samples per batch.

    Returns:
    - List[np.ndarray]: A list of batches, each containing `batch_size` rows.
      Leftover rows that don't fill a complete batch are dropped.
    """
    total_rows = labels.shape[0]

    # Shuffle rows with a random permutation of the row indices.
    shuffled = labels[np.random.permutation(total_rows)]

    # Only complete batches are kept; the trailing remainder is discarded.
    full_batches = total_rows // batch_size
    usable_rows = full_batches * batch_size

    return np.split(shuffled[:usable_rows], full_batches)

def generate_BH_index(batch_size: int, heads: int, selected_heads: int, device = 'cuda'):
    '''
    Generates a random list of selected heads for each batch.

    Args:
    - batch_size (int): Number of batches.
    - heads (int): Total number of heads.
    - selected_heads (int): Number of heads to select for each batch.
    - device: Device for the returned tensor (default 'cuda').

    Returns:
    - bh_index (torch.Tensor): Tensor of shape (batch_size * selected_heads, 2)
      where each row is (batch_idx, head_idx) and head indices are sorted
      ascending within each batch.
    '''
    N_selected = batch_size * selected_heads
    bh_index = torch.zeros((N_selected, 2), dtype=torch.int32, device=device)

    for batch_idx in range(batch_size):
        # Sample `selected_heads` distinct heads, sorted ascending.
        selected = torch.sort(torch.randperm(heads)[:selected_heads]).values
        start = batch_idx * selected_heads
        # Fill the whole slice at once instead of one (batch, head) row at a
        # time — avoids building a tiny CPU tensor per element.
        bh_index[start:start + selected_heads, 0] = batch_idx
        bh_index[start:start + selected_heads, 1] = selected.to(device=device, dtype=torch.int32)

    return bh_index

def generate_random_BH_index(batch_size: int, heads: int, selected_heads: int, device = 'cuda'):
    '''
    Generates a random list of selected heads for each batch.

    Args:
    - batch_size (int): Number of batches.
    - heads (int): Total number of heads.
    - selected_heads (int): Number of heads to select for each batch.

    Returns:
    - bh_index (torch.Tensor): Tensor of shape (batch_size, selected_heads),
      int32, head indices sorted ascending within each row.
    '''
    per_batch_rows = []
    for _ in range(batch_size):
        # Distinct heads via a random permutation, then sort for the row.
        picks = torch.randperm(heads)[:selected_heads]
        per_batch_rows.append(torch.sort(picks).values)

    stacked = torch.stack(per_batch_rows)
    return stacked.to(dtype=torch.int32, device=device)

def generate_random_BG_index(batch_size: int, groups: int, selected_groups: int, device = 'cuda'):
    '''
    Generates a random list of selected groups for each batch.

    Args:
    - batch_size (int): Number of batches.
    - groups (int): Total number of groups.
    - selected_groups (int): Number of groups to select for each batch.

    Returns:
    - bg_index (torch.Tensor): Tensor of shape (batch_size, selected_groups),
      int32, group indices sorted ascending within each row.
    '''
    # One sorted random draw per batch; stack and cast in a single pass.
    rows = [
        torch.sort(torch.randperm(groups)[:selected_groups]).values
        for _ in range(batch_size)
    ]
    return torch.stack(rows).to(dtype=torch.int32, device=device)



def activation_stats_layer(test_batches, total_neurons, device):
    """
    Calculates the average and standard deviation of activations across batches.

    A neuron counts as "activated" in a batch if any sample in that batch has
    a value > 0 for it; the statistic is over the per-batch activated-neuron
    counts.

    Parameters:
    - test_batches (List[np.ndarray] or List[torch.Tensor]): List of batches containing label data.
    - total_neurons (int): Total number of neurons (used for the percentage printout).
    - device (torch.device): The device to perform computations on (e.g., 'cpu' or 'cuda').

    Returns:
    - avg_act (float): The average number of activated neurons per batch.
    - std_dev (float): The (population) standard deviation across batches.

    Raises:
    - ValueError: If `test_batches` is empty.
    """
    num_batches = len(test_batches)
    if num_batches == 0:
        # Previously this fell through to a ZeroDivisionError; fail clearly.
        raise ValueError("test_batches must contain at least one batch")

    sum_activation = 0.0       # Accumulates per-batch activated-neuron counts
    sum_activation_sq = 0.0    # Accumulates their squares (for the variance)

    for batch in test_batches:
        # Normalize to a float32 tensor on the requested device.
        if isinstance(batch, torch.Tensor):
            torch_labels = batch.to(device=device, dtype=torch.float32)
        else:
            torch_labels = torch.tensor(batch, dtype=torch.float32, device=device)

        # Per-neuron activation counts across the batch, then binarize:
        # a neuron is "activated" if it fired for at least one sample.
        activation_counts = (torch_labels > 0).int().sum(dim=0)
        total_activations = (activation_counts > 0).int().sum().item()

        sum_activation += total_activations
        sum_activation_sq += total_activations ** 2

    avg_act = sum_activation / num_batches

    # Population variance E[x^2] - E[x]^2; clamp at 0 because floating-point
    # error can push this slightly negative when all counts are equal.
    variance = max((sum_activation_sq / num_batches) - (avg_act ** 2), 0.0)
    std_dev = variance ** 0.5

    print(f"\nAverage activation: {avg_act:.2f} "
          f"({(avg_act / total_neurons) * 100:.2f}% of total neurons)")
    print(f"Standard deviation of activation: {std_dev:.2f}")

    return avg_act, std_dev

def calculate_index_sizes(in_features):
    """
    Calculate index sizes based on the given in_features.
    The sizes are rounded up to the nearest multiple of 1024
    and generated in 5% increments up to 100% of total neurons
    (where total neurons = in_features * 4).

    Args:
        in_features (int): The number of input features.

    Returns:
        List[int]: 20 index sizes, each rounded up to a multiple of 1024.
    """
    total_neurons = in_features * 4

    # 5%, 10%, ..., 100% of the neuron count, each rounded up to the next
    # 1024 boundary (same truncation/ceil order as before).
    return [
        math.ceil(int((percent / 100) * total_neurons) / 1024) * 1024
        for percent in range(5, 105, 5)
    ]


def compute_perplexity(model, dataloader, device):
    """
    Compute corpus-level perplexity of a causal language model over a dataloader.

    Parameters:
    - model: A model whose forward pass accepts `input_ids`, `attention_mask`,
      and `labels` and returns an object with a `.loss` attribute that is the
      mean cross-entropy over non-masked tokens (HF-style interface —
      assumed from usage here; confirm against the caller).
    - dataloader: Iterable of batches, each a dict with 'input_ids' and
      'attention_mask' tensors.
    - device: Device to move the batch tensors to.

    Returns:
    - float: exp(total loss / total non-padding tokens).

    NOTE(review): `model.eval()` is not called here — the caller is presumably
    responsible for setting eval mode before invoking this; verify.
    """
    total_loss = 0.0    # sum of per-token losses across all batches
    total_tokens = 0    # count of non-padding (label != -100) tokens

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Calculating Perplexity"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Shift input_ids and labels for causal language modeling
            labels = input_ids.clone()
            # Replace padding tokens in labels by -100 so they are ignored in loss computation
            labels[attention_mask == 0] = -100

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            # Multiply loss by number of tokens in the batch
            # The loss is averaged over the number of non-masked tokens
            # To get the total loss, multiply by the number of non-masked tokens
            total_loss += loss.item() * torch.sum(labels != -100).item()
            total_tokens += torch.sum(labels != -100).item()

    # Calculate perplexity
    perplexity = math.exp(total_loss / total_tokens)
    return perplexity