# Susav's picture
# Upload folder using huggingface_hub
# b3a3b15 verified
import math
import torch
from functools import partial
import pandas as pd
import matplotlib.pyplot as plt
from HybridTensor.utils.utils import get_gpu_name, create_results_directory
from tqdm import tqdm # For progress bars
# from HybridTensor.modules.MLP import StandardMLPBlock, SelectiveMLP, SelectiveMLPTriton
# from HybridTensor.utils.utils import sparse_index
def benchmark_mlp_fwd(x, model, index_vec = None, iterations = 100, print_result = False):
    """
    Measure the average forward-pass latency of `model` on input `x` with CUDA events.

    Parameters:
    - x: input tensor for the forward pass.
    - model: callable module to benchmark; its class name is used when printing.
    - index_vec: optional index tensor; when given it is bound to the call as the
      `index_vec` keyword argument (for the selective MLP variants).
    - iterations: number of timed forward passes to average over.
    - print_result: when True, print the measured average latency.

    Returns:
    - (out, elapsed_time): output of the last forward pass and the average time
      per iteration in milliseconds.
    """
    # Capture the class name before a possible functools.partial wrap below.
    class_name_ = model.__class__.__name__
    if index_vec is not None:
        model = partial(model, index_vec=index_vec)
    # Warm up; this also compiles the triton kernel before measuring the execution time.
    for _ in range(10):
        out = model(x)
    torch.cuda.synchronize()
    # Create both events up front and record `end` immediately after the loop,
    # BEFORE synchronizing, so that host-side sync latency and event-creation
    # overhead are excluded from the measurement (same pattern as cuda_profiler).
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(iterations):
        out = model(x)
    end.record()
    torch.cuda.synchronize()
    elapsed_time = start.elapsed_time(end) / iterations
    if print_result:
        print(f"{class_name_} Execution time: {elapsed_time} ms")
    return out, elapsed_time
def generate_index_sizes(hidden_features):
    """Build the sweep of index sizes: multiples of 512 capped at
    hidden_features, so hidden_features itself is always the final entry
    (empty list when hidden_features is 0)."""
    step = 512
    return [min(size, hidden_features)
            for size in range(step, hidden_features + step, step)]
def save_results_to_csv(df, filename_prefix='mlp_profiling_results', results_dir=None):
    """
    Saves the profiling results DataFrame to a CSV file within the specified results directory.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing profiling results.
    - filename_prefix (str): The prefix for the CSV filename; the GPU name is appended.
    - results_dir (Path | None): The Path object for the results directory.
      Defaults to create_results_directory("results"), resolved at call time.
      (The original default was evaluated once at import time, creating the
      directory as an import side effect and sharing it across all calls.)
    """
    if results_dir is None:
        results_dir = create_results_directory("results")
    # Retrieve the GPU name
    gpu_name = get_gpu_name()
    # Define the filename with GPU name
    filename = f"{filename_prefix}_{gpu_name}.csv"
    # Define the full path for the CSV file
    csv_path = results_dir / filename
    # Save the DataFrame to the CSV file
    df.to_csv(csv_path, index=False)
    print(f"Results saved to {csv_path}")
def plot_results(df, output_prefix='mlp_profiling', results_dir=None):
    """
    Plots the profiling results and saves the plot image within the specified results directory.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing profiling results; expects
      columns index_size, standard_time, selective_cutlass_time,
      selective_triton_time, cutlass_speedup, triton_speedup.
    - output_prefix (str): The prefix for the plot filename; the GPU name is appended.
    - results_dir (Path | None): The Path object for the results directory.
      Defaults to create_results_directory("results"), resolved at call time.
      (The original default was evaluated once at import time, creating the
      directory as an import side effect and sharing it across all calls.)
    """
    if results_dir is None:
        results_dir = create_results_directory("results")
    plt.figure(figsize=(12, 6))
    # Plot Execution Time
    plt.subplot(1, 2, 1)
    plt.plot(df['index_size'], df['standard_time'], label='Standard MLP', marker='o')
    plt.plot(df['index_size'], df['selective_cutlass_time'], label='Selective MLP Cutlass', marker='o')
    plt.plot(df['index_size'], df['selective_triton_time'], label='Selective MLP Triton', marker='o')
    plt.xlabel('Index Size')
    plt.ylabel('Execution Time (ms)')
    plt.title('Execution Time vs. Index Size')
    plt.legend()
    plt.grid(True)
    # Plot Speedup
    plt.subplot(1, 2, 2)
    plt.plot(df['index_size'], df['cutlass_speedup'], label='Cutlass Speedup', marker='o')
    plt.plot(df['index_size'], df['triton_speedup'], label='Triton Speedup', marker='o')
    plt.xlabel('Index Size')
    plt.ylabel('Speedup')
    plt.title('Speedup vs. Index Size')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    # Retrieve the GPU name
    gpu_name = get_gpu_name()
    # Define the filename with GPU name
    plot_filename = f"{output_prefix}_{gpu_name}.png"
    # Define the full path for the plot image
    plot_path = results_dir / plot_filename
    # Save the plot
    plt.savefig(plot_path)
    plt.show()
    print(f"Plots saved as {plot_path}")
def cuda_profiler(func, *args, warmup_runs=10, timed_runs=1000, **kwargs):
    """
    Generic profiler function to measure execution time of a given function.

    Parameters:
    - func: The function to be profiled.
    - *args: Positional arguments to be passed to the function.
    - warmup_runs: Number of warm-up runs (default: 10).
    - timed_runs: Number of timed iterations (default: 1000).
    - **kwargs: Keyword arguments to be passed to the function.

    Returns:
    - tuple: (out, avg_time) — the return value of the last call to `func`
      and the average execution time per run in milliseconds.
    """
    # Warm-up phase (also triggers any kernel compilation before timing)
    for _ in range(warmup_runs):
        out = func(*args, **kwargs)
    # Synchronize before starting the timer to ensure accurate measurements
    torch.cuda.synchronize()
    # Create CUDA events for measuring execution time
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    # Record execution times for the given number of runs
    start_event.record()
    for _ in range(timed_runs):
        # Execute the function
        out = func(*args, **kwargs)
    # Wait for the events to be completed
    # torch.cuda.synchronize()
    end_event.record()
    torch.cuda.synchronize()
    # Total GPU time between the two events, in milliseconds
    elapsed_time = start_event.elapsed_time(end_event)
    # Calculate average time per run
    avg_time = elapsed_time / timed_runs
    return out, avg_time
from HybridTensor.triton.select_attn_v1 import select_attn
from HybridTensor.utils.utils import generate_BH_index
from HybridTensor.triton.references.attention_proj_sparse import qkv_proj_sparse, out_proj_sparse
def _sim_cache_update(k, v, qkv, seq_len):
k[:, -1, ...] = qkv[:, :, 1]
v[:, -1, ...] = qkv[:, :, 2]
def mha_inference_simulation(B, in_features, seq_len, head_density, active_density):
    '''
    Simulates the execution time of a standard MHA layer and a selective MHA
    layer with sparse projection and select_attn, printing timings and speedups.

    Parameters:
    - B: batch size
    - in_features: number of features
    - seq_len: sequence length
    - head_density: the percentage of heads that are active per batch
    - active_density: the percentage of active heads per layer (aggregate active heads in all batches)

    Requires a CUDA device (cuda_profiler uses CUDA events).
    '''
    # Test parameters
    H = in_features // 128  # Number of heads (head dimension fixed at 128)
    G = 1                   # Group size
    M = 1                   # Sequence length for queries (single-token decode step)
    Mk = seq_len            # Sequence length for keys/values
    Kq = 128                # Embedding size for queries
    Kkv = 128               # Embedding size for keys/values
    dtype = torch.float16
    device = 'cuda:0'       # the original conditional cpu fallback was dead code: it was immediately overwritten

    x = torch.rand(B, in_features, dtype=dtype).to(device)
    proj_dim = 3 * in_features
    # Dense projection layers for the standard (non-selective) path
    qkv_project = torch.nn.Linear(in_features, proj_dim, dtype=dtype).to(device)
    out_project = torch.nn.Linear(in_features, in_features, dtype=dtype).to(device)
    # Per-head weight/bias tensors for the sparse qkv projection path
    weight = torch.randn(3, H, Kkv, in_features, device=device, dtype=dtype)
    bias = torch.randn(3, H, Kkv, device=device, dtype=dtype)
    n_active_heads = math.ceil(H * active_density)
    # Random subset of heads marked active; every batch uses the same head set
    head_idx = torch.randperm(H, device=device, dtype=torch.int32)[:n_active_heads]
    batch_idx = torch.stack([
        torch.arange(B, dtype=torch.int32, device=device)
        for _ in range(n_active_heads)
    ])
    print(f"Batch size: {B}, Total heads: {H}, Features: {in_features}, Seq len: {seq_len}")
    print(f"Total active heads: {n_active_heads}")
    print(f"Head density in SelectAttn: {head_density}")
    print("====================================")
    # Inference simulation: time the dense and sparse qkv projections
    qkv, qkv_project_time = cuda_profiler(qkv_project, x)
    print(f"qkv projection time: {qkv_project_time:.3f} ms")
    qkv_sel, qkv_sel_proj_time = cuda_profiler(qkv_proj_sparse, x, weight, head_idx, batch_idx, bias)
    print(f"sparse qkv projection time: {qkv_sel_proj_time:.3f} ms")
    # Generate random tensors for q, k, v
    q = torch.randn(B, M, G, H, Kq, dtype=dtype, device=device)
    k = torch.randn(B, Mk, G, H, Kkv, dtype=dtype, device=device)
    v = torch.randn(B, Mk, G, H, Kkv, dtype=dtype, device=device)
    # need to update kv cache with the new k, v
    # NOTE(review): qkv from the dense Linear is 2-D (B, 3*in_features), but
    # _sim_cache_update indexes it as qkv[:, :, i] — confirm the intended reshape.
    _sim_cache_update(k, v, qkv, seq_len)
    _, kv_cache_update_time = cuda_profiler(_sim_cache_update, k, v, qkv, seq_len)
    print(f"KV cache update time: {kv_cache_update_time:.3f} ms")
    scale = 1 / (Kq ** 0.5)
    # Full-density attention as the "standard" baseline
    batch_head_index_1 = generate_BH_index(B, H, math.ceil(H * 1))
    triton_sel_output, attn_time = cuda_profiler(select_attn, q, k, v, scale, batch_head_index_1)
    print(f"Attention time: {attn_time:.3f} ms")
    # Reduced-density attention for the selective path
    batch_head_index_2 = generate_BH_index(B, H, math.ceil(H * head_density))
    triton_sel_output_2, select_attn_time = cuda_profiler(select_attn, q, k, v, scale, batch_head_index_2)
    print(f"SelectAttn time: {select_attn_time:.3f} ms")
    triton_sel_output_2, view_time = cuda_profiler(triton_sel_output_2.view, B, in_features)
    # Out projection
    out, out_project_time = cuda_profiler(out_project, triton_sel_output_2)
    print(f"out projection time: {out_project_time:.3f} ms")
    # Aggregate per-stage times into end-to-end estimates
    standard_time = qkv_project_time + attn_time + out_project_time
    select_time = qkv_project_time + select_attn_time + out_project_time
    select_time_sparse_project = qkv_sel_proj_time + select_attn_time + out_project_time
    print("====================================")
    print(f"Standard time: {standard_time:.3f} ms")
    print(f"Select time: {select_time:.3f} ms")
    print(f"Select time with sparse project: {select_time_sparse_project:.3f} ms")
    print("====================================")
    print(f"Selective Speedup: {standard_time / select_time:.3f}")
    print(f"Selective Speedup with sparse project: {standard_time / select_time_sparse_project:.3f}")
    # free cuda memory
    del qkv, qkv_sel, q, k, v, triton_sel_output, triton_sel_output_2, out
    torch.cuda.empty_cache()