File size: 9,934 Bytes

b3a3b15

import math
import torch
from functools import partial
import pandas as pd
import matplotlib.pyplot as plt

from HybridTensor.utils.utils import get_gpu_name, create_results_directory

from tqdm import tqdm  # For progress bars

# from HybridTensor.modules.MLP import StandardMLPBlock, SelectiveMLP, SelectiveMLPTriton
# from HybridTensor.utils.utils import sparse_index

def benchmark_mlp_fwd(x, model, index_vec=None, iterations=100, print_result=False):
    """
    Measure the average forward-pass time of ``model(x)`` with CUDA events.

    Parameters:
    - x: Input passed to the model on every call.
    - model: Callable to benchmark. Its class name is used in the printed report.
    - index_vec: Optional value; when not None it is bound to the model call as
      the keyword argument ``index_vec``.
    - iterations (int): Number of timed forward passes (must be positive).
    - print_result (bool): When True, print the average time.

    Returns:
    - tuple: ``(out, elapsed_time)`` where ``out`` is the result of the last
      timed call and ``elapsed_time`` is the average time per call in milliseconds.

    Raises:
    - ValueError: If ``iterations`` is not positive.
    """
    # Fail fast: without this the function would run the warm-up and then die
    # with a ZeroDivisionError when averaging.
    if iterations <= 0:
        raise ValueError(f"iterations must be a positive integer, got {iterations}")

    class_name_ = model.__class__.__name__
    if index_vec is not None:
        model = partial(model, index_vec=index_vec)

    # warm up, this also compiles the triton kernel before measuring the execution time
    for _ in range(10):
        out = model(x)

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    # Drain the warm-up work so it is not attributed to the timed region.
    torch.cuda.synchronize()
    start.record()
    for _ in range(iterations):
        out = model(x)
    # Record the end event right behind the last launch and only then block the
    # host; synchronizing before recording would add host-side sync latency to
    # the measured interval.
    end.record()
    torch.cuda.synchronize()

    elapsed_time = start.elapsed_time(end) / iterations

    if print_result:
        print(f"{class_name_} Execution time: {elapsed_time} ms")

    return out, elapsed_time

def generate_index_sizes(hidden_features, step=512):
    """
    Build the list of index sizes to sweep, from ``step`` up to ``hidden_features``.

    The sizes are the multiples of ``step`` strictly below ``hidden_features``,
    followed by ``hidden_features`` itself, e.g. ``1000 -> [512, 1000]`` and
    ``1024 -> [512, 1024]`` with the default step.

    Parameters:
    - hidden_features: Upper bound of the sweep. Non-positive values yield ``[]``.
    - step: Distance between consecutive sizes (default: 512).

    Returns:
    - list: Increasing index sizes ending at ``hidden_features``.

    Raises:
    - ValueError: If ``step`` is not positive (the sweep would never terminate).
    """
    if step <= 0:
        raise ValueError(f"step must be positive, got {step}")

    index_sizes = []
    size = step
    while size < hidden_features:
        index_sizes.append(size)
        size += step
    # The final entry is always the full width, whether or not it is a multiple of step.
    if hidden_features > 0:
        index_sizes.append(hidden_features)
    return index_sizes


def save_results_to_csv(df, filename_prefix='mlp_profiling_results', results_dir=None):
    """
    Saves the profiling results DataFrame to a CSV file within the specified results directory.

    The file is named ``<filename_prefix>_<gpu_name>.csv``, where ``gpu_name`` is
    the value returned by ``get_gpu_name()``.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing profiling results.
    - filename_prefix (str): The prefix for the CSV filename.
    - results_dir (Path | None): Directory to write into; it is joined with the
      filename using the ``/`` operator. When None (default), the directory is
      obtained from ``create_results_directory("results")`` at call time.
    """
    # Resolve the default lazily: putting the call in the signature would run
    # it once at import time, even if this function is never used.
    if results_dir is None:
        results_dir = create_results_directory("results")

    # Retrieve the GPU name
    gpu_name = get_gpu_name()

    # Define the filename with GPU name
    filename = f"{filename_prefix}_{gpu_name}.csv"

    # Define the full path for the CSV file
    csv_path = results_dir / filename

    # Save the DataFrame to the CSV file
    df.to_csv(csv_path, index=False)
    print(f"Results saved to {csv_path}")
    
def plot_results(df, output_prefix='mlp_profiling', results_dir=None):
    """
    Plots the profiling results and saves the plot image within the specified results directory.

    Two side-by-side panels are drawn against ``df['index_size']``: execution
    time of the standard, Cutlass and Triton variants, and the Cutlass/Triton
    speedups. The image is saved as ``<output_prefix>_<gpu_name>.png`` and then
    shown with ``plt.show()``.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing profiling results. The columns
      read are ``index_size``, ``standard_time``, ``selective_cutlass_time``,
      ``selective_triton_time``, ``cutlass_speedup`` and ``triton_speedup``.
    - output_prefix (str): The prefix for the plot filename.
    - results_dir (Path | None): Directory to write into; it is joined with the
      filename using the ``/`` operator. When None (default), the directory is
      obtained from ``create_results_directory("results")`` at call time.
    """
    # Resolve the default lazily: putting the call in the signature would run
    # it once at import time, even if this function is never used.
    if results_dir is None:
        results_dir = create_results_directory("results")

    plt.figure(figsize=(12, 6))

    # Plot Execution Time
    plt.subplot(1, 2, 1)
    plt.plot(df['index_size'], df['standard_time'], label='Standard MLP', marker='o')
    plt.plot(df['index_size'], df['selective_cutlass_time'], label='Selective MLP Cutlass', marker='o')
    plt.plot(df['index_size'], df['selective_triton_time'], label='Selective MLP Triton', marker='o')
    plt.xlabel('Index Size')
    plt.ylabel('Execution Time (ms)')
    plt.title('Execution Time vs. Index Size')
    plt.legend()
    plt.grid(True)

    # Plot Speedup
    plt.subplot(1, 2, 2)
    plt.plot(df['index_size'], df['cutlass_speedup'], label='Cutlass Speedup', marker='o')
    plt.plot(df['index_size'], df['triton_speedup'], label='Triton Speedup', marker='o')
    plt.xlabel('Index Size')
    plt.ylabel('Speedup')
    plt.title('Speedup vs. Index Size')
    plt.legend()
    plt.grid(True)

    plt.tight_layout()

    # Retrieve the GPU name
    gpu_name = get_gpu_name()

    # Define the filename with GPU name
    plot_filename = f"{output_prefix}_{gpu_name}.png"

    # Define the full path for the plot image
    plot_path = results_dir / plot_filename

    # Save before showing so the file is written from the fully drawn figure.
    plt.savefig(plot_path)
    plt.show()
    print(f"Plots saved as {plot_path}")

def cuda_profiler(func, *args, warmup_runs=10, timed_runs=1000, **kwargs):
    """
    Generic profiler function to measure execution time of a given function.

    The function is called ``warmup_runs`` times untimed, then ``timed_runs``
    times between two CUDA events; the elapsed event time is averaged over the
    timed runs.

    Parameters:
    - func: The function to be profiled.
    - *args: Positional arguments to be passed to the function.
    - warmup_runs: Number of warm-up runs (default: 10).
    - timed_runs: Number of timed iterations (default: 1000). Must be positive.
    - **kwargs: Keyword arguments to be passed to the function. Note that
      ``warmup_runs`` and ``timed_runs`` are consumed by the profiler and are
      never forwarded to ``func``.

    Returns:
    - tuple: ``(out, avg_time)`` where ``out`` is the result of the last timed
      call and ``avg_time`` is the average execution time in milliseconds.

    Raises:
    - ValueError: If ``timed_runs`` is not positive.
    """
    # Fail fast: zero timed runs would otherwise surface as a ZeroDivisionError
    # (or an unbound ``out`` when there are no warm-up runs either).
    if timed_runs <= 0:
        raise ValueError(f"timed_runs must be a positive integer, got {timed_runs}")

    # Warm-up phase (results are discarded)
    for _ in range(warmup_runs):
        func(*args, **kwargs)

    # Synchronize before starting the timer to ensure accurate measurements
    torch.cuda.synchronize()

    # Create CUDA events for measuring execution time
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

    # Time all runs as a single interval between the two events
    start_event.record()
    for _ in range(timed_runs):
        out = func(*args, **kwargs)

    # Record the end event first, then block the host until it has completed;
    # elapsed_time() requires both events to have been reached.
    end_event.record()
    torch.cuda.synchronize()

    # Total elapsed time (ms) across all timed runs
    elapsed_time = start_event.elapsed_time(end_event)

    # Calculate average time per run
    avg_time = elapsed_time / timed_runs

    return out, avg_time


from HybridTensor.triton.select_attn_v1 import select_attn
from HybridTensor.utils.utils import generate_BH_index
from HybridTensor.triton.references.attention_proj_sparse import qkv_proj_sparse, out_proj_sparse

def _sim_cache_update(k, v, qkv, seq_len):
    """
    Overwrite the last position (index -1 along dim 1) of the key and value
    caches in place with entries taken from ``qkv``.

    Index 1 along dim 2 of ``qkv`` is written into ``k`` and index 2 into ``v``.
    ``seq_len`` is accepted for interface compatibility but is not used.
    """
    new_key = qkv[:, :, 1]
    new_value = qkv[:, :, 2]
    k[:, -1, ...] = new_key
    v[:, -1, ...] = new_value

def mha_inference_simulation(B, in_features, seq_len, head_density, active_density):
    '''
    Simulates the execution time of a standard MHA layer and a selective MHA layer with sparse projection and select_attn.

    Each stage (dense/sparse QKV projection, KV-cache update, full attention,
    selective attention, output projection) is timed with ``cuda_profiler`` and
    printed. The reported totals are the sum of QKV projection, attention and
    output projection only; the KV-cache update time is printed but not
    included in the totals.

    Parameters:
    -   B: batch size
    -   in_features: number of features; must be a positive multiple of the
        head dimension (128), the number of heads is ``in_features // 128``
    -   seq_len: sequence length (length of the key/value cache)
    -   head_density: the percentage of heads that are active per batch
        (``ceil(H * head_density)`` heads are passed to ``generate_BH_index``)
    -   active_density: the percentage of active heads per layer (aggregate active heads in all batches);
        ``ceil(H * active_density)`` heads are used for the sparse QKV projection

    Returns:
    -   None. Results are printed.

    Raises:
    -   ValueError: if ``in_features`` is not a positive multiple of 128.
    '''
    Kq = 128  # Embedding size for queries
    Kkv = 128  # Embedding size for keys/values

    # The fused QKV output and the attention output are reshaped per head
    # below, which only works when the features split evenly into heads.
    if in_features <= 0 or in_features % Kq != 0:
        raise ValueError(
            f"in_features must be a positive multiple of {Kq}, got {in_features}"
        )

    # Test parameters
    H = in_features // Kq  # Number of heads
    G = 1  # Group size
    M = 1  # Sequence length for queries
    Mk = seq_len  # Sequence length for keys/values

    # Timing relies on CUDA events, so the simulation always runs on the first GPU.
    device = 'cuda:0'
    dtype = torch.float16

    x = torch.rand(B, in_features, dtype=dtype, device=device)
    proj_dim = 3 * in_features

    # Define the Linear layer
    qkv_project = torch.nn.Linear(in_features, proj_dim, dtype=dtype).to(device)
    out_project = torch.nn.Linear(in_features, in_features, dtype=dtype).to(device)

    # Per-head weights/bias for the sparse projection reference kernel
    weight = torch.randn(3, H, Kkv, in_features, device=device, dtype=dtype)
    bias = torch.randn(3, H, Kkv, device=device, dtype=dtype)

    n_active_heads = math.ceil(H * active_density)
    head_idx = torch.randperm(H, device=device, dtype=torch.int32)[:n_active_heads]

    # One row of batch indices (0..B-1) per active head
    batch_idx = torch.stack([
        torch.arange(B, dtype=torch.int32, device=device)
        for _ in range(n_active_heads)
    ])

    print(f"Batch size: {B}, Total heads: {H}, Features: {in_features}, Seq len: {seq_len}")
    print(f"Total active heads: {n_active_heads}")
    print(f"Head density in SelectAttn: {head_density}")

    print("====================================")

    # Inference simulation
    qkv, qkv_project_time = cuda_profiler(qkv_project, x)
    print(f"qkv projection time: {qkv_project_time:.3f} ms")

    qkv_sel, qkv_sel_proj_time = cuda_profiler(qkv_proj_sparse, x, weight, head_idx, batch_idx, bias)
    print(f"sparse qkv projection time: {qkv_sel_proj_time:.3f} ms")

    # Generate random tensors for q, k, v
    q = torch.randn(B, M, G, H, Kq, dtype=dtype, device=device)
    k = torch.randn(B, Mk, G, H, Kkv, dtype=dtype, device=device)
    v = torch.randn(B, Mk, G, H, Kkv, dtype=dtype, device=device)

    # The dense projection yields a flat (B, 3 * in_features) tensor, but the
    # cache update indexes q/k/v along dim 2 and writes (B, G, H, Kkv) slices.
    # Expose that layout first; indexing the 2-D tensor would raise IndexError.
    # NOTE(review): assumes the fused output is ordered (three, head, dim) —
    # only the shapes matter for the timing measured here.
    qkv = qkv.view(B, G, 3, H, Kkv)

    # need to update kv cache with the new k, v
    _, kv_cache_update_time = cuda_profiler(_sim_cache_update, k, v, qkv, seq_len)
    print(f"KV cache update time: {kv_cache_update_time:.3f} ms")

    scale = 1 / (Kq ** 0.5)

    # Baseline: attention over every head
    batch_head_index_1 = generate_BH_index(B, H, math.ceil(H * 1))
    triton_sel_output, attn_time = cuda_profiler(select_attn, q, k, v, scale, batch_head_index_1)
    print(f"Attention time: {attn_time:.3f} ms")

    # Selective: attention over the requested fraction of heads
    batch_head_index_2 = generate_BH_index(B, H, math.ceil(H * head_density))
    triton_sel_output_2, select_attn_time = cuda_profiler(select_attn, q, k, v, scale, batch_head_index_2)
    print(f"SelectAttn time: {select_attn_time:.3f} ms")

    # Flatten the attention output to (B, in_features) for the output projection
    triton_sel_output_2, _ = cuda_profiler(triton_sel_output_2.view, B, in_features)

    # Out projection
    out, out_project_time = cuda_profiler(out_project, triton_sel_output_2)
    print(f"out projection time: {out_project_time:.3f} ms")

    standard_time = qkv_project_time + attn_time + out_project_time
    select_time = qkv_project_time + select_attn_time + out_project_time
    select_time_sparse_project = qkv_sel_proj_time + select_attn_time + out_project_time

    print("====================================")
    print(f"Standard time: {standard_time:.3f} ms")
    print(f"Select time: {select_time:.3f} ms")
    print(f"Select time with sparse project: {select_time_sparse_project:.3f} ms")
    print("====================================")
    print(f"Selective Speedup: {standard_time / select_time:.3f}")
    print(f"Selective Speedup with sparse project: {standard_time / select_time_sparse_project:.3f}")

    # free cuda memory
    del qkv, qkv_sel, q, k, v, triton_sel_output, triton_sel_output_2, out
    torch.cuda.empty_cache()