Susav committed on
Commit
b3a3b15
·
verified ·
1 Parent(s): 2453355

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. HybridTensor/__init__.py +0 -0
  2. HybridTensor/__pycache__/__init__.cpython-310.pyc +0 -0
  3. HybridTensor/__pycache__/__init__.cpython-39.pyc +0 -0
  4. HybridTensor/benchmarks/generation/__pycache__/gen_util.cpython-39.pyc +0 -0
  5. HybridTensor/benchmarks/generation/__pycache__/llama_sparse_generation.cpython-39.pyc +0 -0
  6. HybridTensor/benchmarks/generation/__pycache__/model_sparse_generation.cpython-39.pyc +0 -0
  7. HybridTensor/benchmarks/generation/__pycache__/opt_gen_tp.cpython-39.pyc +0 -0
  8. HybridTensor/benchmarks/generation/__pycache__/opt_generation.cpython-310.pyc +0 -0
  9. HybridTensor/benchmarks/generation/__pycache__/opt_generation.cpython-39.pyc +0 -0
  10. HybridTensor/benchmarks/generation/__pycache__/opt_sparse_gen_tp.cpython-39.pyc +0 -0
  11. HybridTensor/benchmarks/generation/__pycache__/opt_sparse_generation.cpython-310.pyc +0 -0
  12. HybridTensor/benchmarks/generation/__pycache__/opt_sparse_generation.cpython-39.pyc +0 -0
  13. HybridTensor/benchmarks/generation/gen_util.py +38 -0
  14. HybridTensor/benchmarks/generation/model_sparse_generation.py +71 -0
  15. HybridTensor/benchmarks/generation/opt_gen_tp.py +133 -0
  16. HybridTensor/benchmarks/generation/opt_generation.py +289 -0
  17. HybridTensor/benchmarks/generation/opt_sparse_gen_tp.py +112 -0
  18. HybridTensor/benchmarks/generation/opt_sparse_generation.py +182 -0
  19. HybridTensor/benchmarks/model_eval.py +313 -0
  20. HybridTensor/benchmarks/model_perplexity.py +165 -0
  21. HybridTensor/benchmarks/opt_attn_sparse_topk_perplexity.py +264 -0
  22. HybridTensor/benchmarks/select_block_decode.py +218 -0
  23. HybridTensor/models/__pycache__/create_sparse_model.cpython-310.pyc +0 -0
  24. HybridTensor/models/__pycache__/create_sparse_model.cpython-39.pyc +0 -0
  25. HybridTensor/models/__pycache__/helper.cpython-310.pyc +0 -0
  26. HybridTensor/models/__pycache__/helper.cpython-39.pyc +0 -0
  27. HybridTensor/models/__pycache__/llama.cpython-39.pyc +0 -0
  28. HybridTensor/models/__pycache__/opt.cpython-310.pyc +0 -0
  29. HybridTensor/models/__pycache__/opt.cpython-39.pyc +0 -0
  30. HybridTensor/models/create_sparse_model.py +854 -0
  31. HybridTensor/models/helper.py +125 -0
  32. HybridTensor/models/llama.py +74 -0
  33. HybridTensor/models/opt.py +229 -0
  34. HybridTensor/modules/SelectiveBlock.py +960 -0
  35. HybridTensor/modules/SelectiveMHA.py +1579 -0
  36. HybridTensor/modules/SelectiveMLP.py +580 -0
  37. HybridTensor/modules/SelectiveRouters.py +136 -0
  38. HybridTensor/modules/__init__.py +0 -0
  39. HybridTensor/modules/__pycache__/MLP.cpython-39.pyc +0 -0
  40. HybridTensor/modules/__pycache__/ParallelMLP.cpython-39.pyc +0 -0
  41. HybridTensor/modules/__pycache__/SelectiveBlock.cpython-39.pyc +0 -0
  42. HybridTensor/modules/__pycache__/SelectiveBlock_v1.cpython-310.pyc +0 -0
  43. HybridTensor/modules/__pycache__/SelectiveBlock_v1.cpython-39.pyc +0 -0
  44. HybridTensor/modules/__pycache__/SelectiveMHA.cpython-310.pyc +0 -0
  45. HybridTensor/modules/__pycache__/SelectiveMHA.cpython-39.pyc +0 -0
  46. HybridTensor/modules/__pycache__/SelectiveMLP.cpython-310.pyc +0 -0
  47. HybridTensor/modules/__pycache__/SelectiveMLP.cpython-39.pyc +0 -0
  48. HybridTensor/modules/__pycache__/SelectiveRouters.cpython-310.pyc +0 -0
  49. HybridTensor/modules/__pycache__/SelectiveRouters.cpython-39.pyc +0 -0
  50. HybridTensor/modules/__pycache__/__init__.cpython-310.pyc +0 -0
HybridTensor/__init__.py ADDED
File without changes
HybridTensor/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (190 Bytes). View file
 
HybridTensor/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (171 Bytes). View file
 
HybridTensor/benchmarks/generation/__pycache__/gen_util.cpython-39.pyc ADDED
Binary file (869 Bytes). View file
 
HybridTensor/benchmarks/generation/__pycache__/llama_sparse_generation.cpython-39.pyc ADDED
Binary file (2.34 kB). View file
 
HybridTensor/benchmarks/generation/__pycache__/model_sparse_generation.cpython-39.pyc ADDED
Binary file (2.68 kB). View file
 
HybridTensor/benchmarks/generation/__pycache__/opt_gen_tp.cpython-39.pyc ADDED
Binary file (3.75 kB). View file
 
HybridTensor/benchmarks/generation/__pycache__/opt_generation.cpython-310.pyc ADDED
Binary file (5.22 kB). View file
 
HybridTensor/benchmarks/generation/__pycache__/opt_generation.cpython-39.pyc ADDED
Binary file (6.12 kB). View file
 
HybridTensor/benchmarks/generation/__pycache__/opt_sparse_gen_tp.cpython-39.pyc ADDED
Binary file (3.44 kB). View file
 
HybridTensor/benchmarks/generation/__pycache__/opt_sparse_generation.cpython-310.pyc ADDED
Binary file (3.36 kB). View file
 
HybridTensor/benchmarks/generation/__pycache__/opt_sparse_generation.cpython-39.pyc ADDED
Binary file (4.56 kB). View file
 
HybridTensor/benchmarks/generation/gen_util.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import torch
3
+ from datasets import load_dataset
4
+ from transformers import AutoTokenizer
5
+
6
def tokenize_dataset(dataset, tokenizer):
    """Tokenize every example's "text" field and flatten into one token stream.

    No special tokens are inserted, so the result is a single contiguous
    corpus suitable for slicing fixed-length windows out of.

    Args:
        dataset: iterable of examples, each a mapping with a "text" key.
        tokenizer: HF-style callable returning a dict with an "input_ids" list.

    Returns:
        Flat list of token ids, in dataset order.
    """
    return [
        token_id
        for example in dataset
        for token_id in tokenizer(example["text"], add_special_tokens=False)["input_ids"]
    ]
13
+
14
def get_random_batch(tokens, batch_size, seq_length):
    """Sample a batch of random contiguous token windows from a flat corpus.

    Args:
        tokens: flat sequence of token ids (e.g. output of tokenize_dataset).
        batch_size: number of windows to sample (windows may overlap).
        seq_length: length of each window.

    Returns:
        torch.LongTensor of shape (batch_size, seq_length).

    Raises:
        ValueError: if the corpus is shorter than seq_length.
    """
    total = len(tokens)
    if total < seq_length:
        # Previously this fell through to random.randint(0, <negative>) and
        # raised an opaque "empty range" ValueError from the random module.
        raise ValueError(
            f"Corpus has {total} tokens but seq_length={seq_length}; "
            "need at least seq_length tokens to sample a window."
        )
    batch = []
    for _ in range(batch_size):
        # randint is inclusive on both ends, so start + seq_length <= total.
        start = random.randint(0, total - seq_length)
        batch.append(tokens[start : start + seq_length])
    return torch.tensor(batch)
21
+
22
+ '''
23
+ # Load dataset and tokenizer
24
+ dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
25
+ model_name = "facebook/opt-6.7b"
26
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
27
+
28
+ tokens = tokenize_dataset(dataset, tokenizer)
29
+
30
+ # Define parameters
31
+ batch_size = 8
32
+ seq_length = 2000
33
+
34
+ random_batch = get_random_batch(tokens, batch_size, seq_length)
35
+ print("Batch shape:", random_batch.shape) # Expected: (8, 128)
36
+
37
+ '''
38
+
HybridTensor/benchmarks/generation/model_sparse_generation.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import argparse
3
+
4
+ from HybridTensor.utils.utils import _get_device
5
+ from HybridTensor.utils.activations import MODELS
6
+ from HybridTensor.models.opt import build_sparse_opt
7
+ from HybridTensor.models.llama import build_sparse_llama
8
+ from HybridTensor.routers.mlp.mlp_router_optim import load_router_dict_from_csv
9
+ from transformers import AutoTokenizer
10
+
11
+
12
def update_router_config(model, num_layers, mlp_topk_lookup, attn_topk):
    """Apply router settings to the first `num_layers` transformer layers.

    Every layer gets the same attention top-k; when `mlp_topk_lookup` is
    provided it supplies a per-layer MLP top-k. Layer 0 is forced back to
    dense attention (topk = 1.0) at the end.
    """
    layers = model.transformer.layers
    for layer_idx in range(num_layers):
        layer = layers[layer_idx]
        if mlp_topk_lookup is not None:
            layer.mlp_topk = mlp_topk_lookup[layer_idx]
        layer.mha_router.topk = attn_topk
    # dense attention in layer 0
    layers[0].mha_router.topk = 1.0
21
+
22
def _str2bool(value):
    """Parse textual boolean spellings for argparse flags.

    `type=bool` is an argparse pitfall: bool("False") is True, so any
    non-empty string turns the flag on. This helper parses the text properly.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("yes", "true", "t", "y", "1"):
        return True
    if lowered in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def arg_parser(argv=None):
    """Parse inference-benchmarking CLI arguments.

    Args:
        argv: optional list of argument strings; defaults to sys.argv[1:]
            (passing a list makes the parser unit-testable).

    Returns:
        argparse.Namespace with the parsed options.
    """
    parser = argparse.ArgumentParser(description='Inference benchmarking')
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--model_index', type=int, default=5)
    # FIX: was `type=bool`, which treats any non-empty string (even "False") as True.
    parser.add_argument('--print_results', type=_str2bool, default=True)
    parser.add_argument('--iterations', type=int, default=1)
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--attn_topk', type=float, default=0.5, help='Attention topk for sparse model')
    parser.add_argument('--mlp_ckpt_dir', type=str, default='/home/grads/s/<name>/nvme/HybridTensor/checkpoint/opt-6.7b-routers/mlp')
    parser.add_argument('--attn_ckpt_dir', type=str, default='/home/grads/s/<name>/nvme/HybridTensor/checkpoint/opt-6.7b-routers/mha_linear')
    parser.add_argument('--batch_stats_dir', type=str, default='configs/mlp_router/opt-6.7b')
    parser.add_argument('--delta', type=int, default=256, help='Delta value for MLP topk calculation')
    parser.add_argument('--use_cuda_graph', type=_str2bool, default=False, help='Use CUDA graph for inference')

    return parser.parse_args(argv)
37
+
38
if __name__ == "__main__":
    args = arg_parser()
    # MODELS is 1-indexed from the CLI's point of view.
    model_name = MODELS[args.model_index-1]
    print(f"Model name: {model_name}")
    dtype = torch.float16
    device= _get_device(args.gpu)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if "llama" in model_name:
        # LLaMA build takes only the attention-router checkpoints; no MLP
        # top-k lookup is used (None below).
        model = build_sparse_llama(args, model_name,
                                args.attn_ckpt_dir,
                                device = device, dtype=dtype)
        update_router_config(model, model.config.n_layer, None, args.attn_topk)   # this sets the router config for all layers using a single config
    else:
        # OPT build: per-layer MLP top-k values come from batch-statistics CSVs
        # keyed by batch size.
        mlp_topk_lookup = load_router_dict_from_csv(args.batch_stats_dir, args.batch_size)
        model = build_sparse_opt(args, model_name,
                                args.mlp_ckpt_dir,
                                args.attn_ckpt_dir,
                                device = device, dtype=dtype)
        update_router_config(model, model.config.n_layer, mlp_topk_lookup, args.attn_topk)   # this sets the router config for all layers using a single config

    model.eval()
    print(model)

    # test input
    input_text = "Once upon a time in a land far, far away, there lived a"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

    # Generate output (greedy/default sampling; smoke test of the sparse model)
    with torch.no_grad():
        output = model.generate(input_ids, max_length=50)
    print(tokenizer.decode(output[0], skip_special_tokens=True))
HybridTensor/benchmarks/generation/opt_gen_tp.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers.models.opt import OPTConfig
2
+ from transformers import AutoTokenizer
3
+ from flash_attn.models.opt import opt_config_to_gpt2_config
4
+
5
+ import os
6
+ import torch
7
+ import argparse
8
+ from apex.transformer import parallel_state
9
+
10
+ from HybridTensor.utils.utils import arg_parser, _get_device
11
+ from HybridTensor.utils.activations import OPT_MODELS
12
+ from HybridTensor.models.opt import SparseConfig, build_sparse_opt, build_dense_opt
13
+
14
+
15
def initialize_distributed_environment():
    """Initialize torch.distributed with the NCCL backend for tensor parallelism.

    Uses the env:// init method, so RANK / WORLD_SIZE / MASTER_ADDR /
    MASTER_PORT are expected to be provided by the launcher (e.g. torchrun).

    Returns:
        (device, world_size): the CUDA device string for this rank and the
        total number of participating processes.
    """
    # Set environment variables for NCCL
    # (presumably disabled to keep NCCL compatible with CUDA-graph capture — TODO confirm)
    os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "0"
    os.environ["NCCL_GRAPH_MIXING_SUPPORT"] = "0"

    # Initialize the distributed process group
    torch.distributed.init_process_group(backend="nccl", init_method="env://")

    # Set the device based on the rank of the current process
    device = f"cuda:{torch.distributed.get_rank()}"
    world_size = torch.distributed.get_world_size()

    # Set the current CUDA device to avoid operations being executed on the wrong GPU
    torch.cuda.set_device(device)

    # You can return device, world_size, and any other relevant information
    return device, world_size
32
+
33
+ def _turn_bias_off(model, num_layers):
34
+ for i in range(num_layers):
35
+ model.transformer.layers[i].mlp.fc1.bias = None
36
+ model.transformer.layers[i].mlp.fc2.bias = None
37
+
38
def _str2bool(value):
    """Parse textual boolean spellings for argparse flags.

    `type=bool` is an argparse pitfall: bool("False") is True, so any
    non-empty string turns the flag on. This helper parses the text properly.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("yes", "true", "t", "y", "1"):
        return True
    if lowered in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


# NOTE(review): this redefinition intentionally shadows the `arg_parser`
# imported from HybridTensor.utils.utils above — confirm that is desired.
def arg_parser(argv=None):
    """Parse inference-benchmarking CLI arguments for the tensor-parallel run.

    Args:
        argv: optional list of argument strings; defaults to sys.argv[1:]
            (passing a list makes the parser unit-testable).

    Returns:
        argparse.Namespace with the parsed options.
    """
    parser = argparse.ArgumentParser(description='Inference benchmarking')
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--model_index', type=int, default=5)
    parser.add_argument('--seq_len', type=int, default=25)
    parser.add_argument('--index_size', type=int, default=8192)
    parser.add_argument('--head_density', type=float, default=0.25)
    # FIX: these were `type=bool`, which treats any non-empty string (even "False") as True.
    parser.add_argument('--print_results', type=_str2bool, default=True)
    parser.add_argument('--iterations', type=int, default=2)
    parser.add_argument('--check_results', type=_str2bool, default=False)
    parser.add_argument('--results_dir', type=str, default='results')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--bias', type=_str2bool, default=False)
    parser.add_argument('--mlp_ckpt_dir', type=str, default='/home/grads/s/<name>/nvme/HybridTensor/checkpoint/opt-6.7b-routers/mlp')
    parser.add_argument('--attn_ckpt_dir', type=str, default='/home/grads/s/<name>/nvme/HybridTensor/checkpoint/opt-6.7b-routers/mha_linear')

    return parser.parse_args(argv)
55
+
56
if __name__ == "__main__":
    # Tensor-parallel dense OPT generation benchmark. One process per GPU;
    # launch with torchrun so env:// initialization finds RANK/WORLD_SIZE.
    args = arg_parser()
    model_name = OPT_MODELS[args.model_index-1]

    device, world_size = initialize_distributed_environment()
    dtype = torch.float16

    parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size)
    rank = parallel_state.get_tensor_model_parallel_rank()
    process_group = parallel_state.get_tensor_model_parallel_group()

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # model = build_sparse_opt(model_name, args.mlp_ckpt_dir, args.attn_ckpt_dir, device = device, dtype=dtype, process_group = process_group, world_size = world_size, rank = rank)
    model = build_dense_opt(model_name, process_group = process_group, world_size = world_size, rank = rank, device = device, dtype=dtype)
    model.eval()
    # if rank == 0:
    #     print(model)

    # input_texts = ["Hello, my dog is cute and", "The future of AI is", "In a distant galaxy, a spaceship", "The cat is sleeping on the "]
    input_texts = ["In a distant galaxy, a spaceship"]
    tokenized_inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True).to(device)
    input_ids=tokenized_inputs["input_ids"]
    # input_ids = tokenizer("Hello, my dog is cute and", return_tensors="pt").input_ids.to(device=device)

    max_length = args.seq_len
    position_ids = None
    eos_token_id = tokenizer.eos_token_id
    num_layers = model.config.n_layer

    # turn bias off for mlp layers
    if not args.bias:
        _turn_bias_off(model, num_layers)

    # Warm-up generation (excluded from timing).
    _ = model.generate(
        input_ids=input_ids,
        max_length=max_length,
        eos_token_id=eos_token_id,
        return_dict_in_generate=True,
        output_scores=True,
        enable_timing=False,
    )

    # Time `iterations` full generations with CUDA events.
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

    start_event.record()

    for _ in range(args.iterations):
        out = model.generate(
            input_ids=input_ids,
            max_length=max_length,
            eos_token_id=eos_token_id,
            return_dict_in_generate=True,
            output_scores=True,
            enable_timing=False,
        )

    end_event.record()

    torch.cuda.synchronize()

    # print(tokenizer.batch_decode(out.sequences.tolist()))

    if rank == 0:
        elapsed_time = start_event.elapsed_time(end_event) / args.iterations
        # FIX: message typo ("genearation") corrected.
        print(f"Average time per generation: {elapsed_time} ms")

        # Compute throughput and latency per token
        num_tokens_generated = out.sequences.shape[1] - input_ids.shape[1]
        throughput = num_tokens_generated / (elapsed_time / 1000)  # tokens per second
        latency_per_token = elapsed_time / num_tokens_generated  # ms per token

        print(f"Number of tokens generated: {num_tokens_generated}")
        print(f"Throughput: {throughput} tokens/second")
        print(f"Latency per token: {latency_per_token} ms")
        print(tokenizer.batch_decode(out.sequences.tolist()))
133
+
HybridTensor/benchmarks/generation/opt_generation.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import time
3
+
4
+ import pytest
5
+ import torch
6
+ import argparse
7
+
8
+ from einops import rearrange
9
+
10
+ from HybridTensor.benchmarks.generation.gen_util import tokenize_dataset, get_random_batch
11
+ from HybridTensor.utils.activations import OPT_MODELS
12
+ from datasets import load_dataset
13
+
14
+ from flash_attn.models.gpt import GPTLMHeadModel
15
+ from flash_attn.models.opt import opt_config_to_gpt2_config, remap_state_dict_hf_opt
16
+ from flash_attn.utils.generation import update_graph_cache
17
+ from flash_attn.utils.pretrained import state_dict_from_pretrained
18
+ from transformers import AutoTokenizer, OPTConfig
19
+ from transformers.models.opt.modeling_opt import OPTForCausalLM
20
+
21
def test_opt_generation(model_name):
    """Check that our implementation of OPT generation matches the HF implementation:
    the scores in fp16 should be around the same as the HF scores in fp16, when compared to
    the HF scores in fp32.

    Compares four decoders on the same prompt: a token-by-token "slow" loop as
    ground truth, the flash-attn fast path (with and without CUDA graph), and
    the HuggingFace model in fp16 and fp32.
    NOTE(review): requires a CUDA device; `device` is hard-coded to "cuda".
    """
    print(f"\nMODEL: {model_name}")
    verbose = False
    dtype = torch.float16
    device = "cuda"
    # Tolerances for comparing fp16 scores against the slow-path reference.
    rtol, atol = 3e-3, 3e-1
    config = opt_config_to_gpt2_config(OPTConfig.from_pretrained(model_name))
    # Only prenorm supports residual_in_fp32
    config.residual_in_fp32 = getattr(config, "prenorm", True)
    config.use_flash_attn = True
    config.fused_bias_fc = True
    config.fused_mlp = True
    config.fused_dropout_add_ln = True

    model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype)
    model.eval()

    torch.manual_seed(0)
    # OPT tokenizer requires use_fast=False
    # https://huggingface.co/docs/transformers/model_doc/opt
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    eos_token_id = tokenizer.eos_token_id

    input_ids = tokenizer("Hello, my dog is cute and he", return_tensors="pt").input_ids.to(
        device=device
    )
    max_length = 25
    # input_ids = torch.randint(0, 100, (2, 10), dtype=torch.long, device='cuda')
    # max_length = input_ids.shape[1] + 40

    # Slow generation for reference: greedy decode one token at a time by
    # re-running the full forward pass on the growing sequence.
    sequences = []
    scores = []
    cur_input_ids = input_ids
    with torch.inference_mode():
        scores.append(model(cur_input_ids).logits[:, -1])
        sequences.append(scores[-1].argmax(dim=-1))
        for _ in range(input_ids.shape[1] + 1, max_length):
            cur_input_ids = torch.cat([cur_input_ids, rearrange(sequences[-1], "b -> b 1")], dim=-1)
            scores.append(model(cur_input_ids).logits[:, -1])
            sequences.append(scores[-1].argmax(dim=-1))
            # Stop early once every sequence in the batch has emitted EOS.
            if eos_token_id is not None and (sequences[-1] == eos_token_id).all():
                break
    sequences = torch.cat([input_ids, torch.stack(sequences, dim=1)], dim=1)
    scores = tuple(scores)

    print("Without CUDA graph")
    torch.cuda.synchronize()
    start = time.time()
    out = model.generate(
        input_ids=input_ids,
        max_length=max_length,
        eos_token_id=eos_token_id,
        return_dict_in_generate=True,
        output_scores=True,
        enable_timing=True,
    )
    torch.cuda.synchronize()
    print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms")
    if verbose:
        print(out.sequences)
    print(tokenizer.batch_decode(out.sequences.tolist()))
    if getattr(config, "use_flash_attn", False):
        # Capture graph outside the timing loop
        batch_size, seqlen_og = input_ids.shape
        model._decoding_cache = update_graph_cache(model, None, batch_size, seqlen_og, max_length)
        print("With CUDA graph")
        torch.cuda.synchronize()
        start = time.time()
        out_cg = model.generate(
            input_ids=input_ids,
            max_length=max_length,
            cg=True,
            return_dict_in_generate=True,
            output_scores=True,
            enable_timing=True,
        )
        torch.cuda.synchronize()
        print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms")
        if verbose:
            print(out_cg.sequences)
        print(tokenizer.batch_decode(out_cg.sequences.tolist()))

    # Free GPU memory before loading the HF reference models.
    del model

    model_hf = OPTForCausalLM.from_pretrained(model_name, torch_dtype=dtype).to(device=device)
    model_hf.eval()
    print("HF fp16")
    torch.cuda.synchronize()
    start = time.time()
    out_hf = model_hf.generate(
        input_ids=input_ids, max_length=max_length, return_dict_in_generate=True, output_scores=True
    )
    torch.cuda.synchronize()
    print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms")
    del model_hf

    model_ref = OPTForCausalLM.from_pretrained(model_name).to(device=device)
    model_ref.eval()
    print("HF fp32")
    torch.cuda.synchronize()
    start = time.time()
    out_ref = model_ref.generate(
        input_ids=input_ids, max_length=max_length, return_dict_in_generate=True, output_scores=True
    )
    torch.cuda.synchronize()
    print(f"Prompt processing + decoding time: {(time.time() - start) * 1000:.0f}ms")
    del model_ref
    print(tokenizer.batch_decode(out_ref.sequences.tolist()))

    if verbose:
        print(
            f"Scores max diff: {(torch.stack(out.scores, 1) - torch.stack(out_ref.scores, 1)).abs().max().item()}"
        )
        print(
            f"Scores mean diff: {(torch.stack(out.scores, 1) - torch.stack(out_ref.scores, 1)).abs().mean().item()}"
        )
        print(
            f"HF fp16 max diff: {(torch.stack(out_hf.scores, 1) - torch.stack(out_ref.scores, 1)).abs().max().item()}"
        )
        print(
            f"HF fp16 mean diff: {(torch.stack(out_hf.scores, 1) - torch.stack(out_ref.scores, 1)).abs().mean().item()}"
        )

    # Fast path must reproduce the slow-path tokens exactly and its scores
    # within tolerance; all implementations must agree on the token sequence.
    assert torch.all(out.sequences == sequences)
    assert torch.allclose(
        torch.stack(out.scores, dim=1), torch.stack(scores, dim=1), rtol=rtol, atol=atol
    )
    assert torch.all(out.sequences == out_ref.sequences)
    assert torch.all(out.sequences == out_hf.sequences)

    # Our fp16 scores must be within 3x of HF fp16's deviation from the fp32 reference.
    assert (torch.stack(out.scores, 1) - torch.stack(out_ref.scores, 1)).abs().max().item() < 3 * (
        torch.stack(out_hf.scores, 1) - torch.stack(out_ref.scores, 1)
    ).abs().max().item()
159
+
160
+
161
def _str2bool(value):
    """Parse textual boolean spellings for argparse flags.

    `type=bool` is an argparse pitfall: bool("False") is True, so any
    non-empty string turns the flag on. This helper parses the text properly.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("yes", "true", "t", "y", "1"):
        return True
    if lowered in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def arg_parser(argv=None):
    """Parse inference-benchmarking CLI arguments.

    Args:
        argv: optional list of argument strings; defaults to sys.argv[1:]
            (passing a list makes the parser unit-testable).

    Returns:
        argparse.Namespace with the parsed options.
    """
    parser = argparse.ArgumentParser(description='Inference benchmarking')
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--model_index', type=int, default=5)
    parser.add_argument('--seq_len', type=int, default=1024)
    parser.add_argument('--index_size', type=int, default=8192)
    parser.add_argument('--head_density', type=float, default=0.25)
    # FIX: these were `type=bool`, which treats any non-empty string (even "False") as True.
    parser.add_argument('--print_results', type=_str2bool, default=False)
    parser.add_argument('--iterations', type=int, default=1)
    parser.add_argument('--check_results', type=_str2bool, default=False)
    parser.add_argument('--results_dir', type=str, default='results')
    parser.add_argument('--gpu', type=int, default=0)

    return parser.parse_args(argv)
175
+
176
if __name__ == "__main__":
    # Dense OPT generation benchmark on random wikitext windows, timed with
    # and without CUDA-graph decoding.
    args = arg_parser()
    model_name = OPT_MODELS[args.model_index-1]
    # test_opt_generation(model_name)

    print(f"\nMODEL: {model_name}\n")
    verbose = False
    dtype = torch.float16
    device = "cuda"
    rtol, atol = 3e-3, 3e-1
    config = opt_config_to_gpt2_config(OPTConfig.from_pretrained(model_name))
    # Only prenorm supports residual_in_fp32
    config.residual_in_fp32 = getattr(config, "prenorm", True)
    config.use_flash_attn = True
    config.fused_bias_fc = True
    config.fused_mlp = True
    config.fused_dropout_add_ln = True

    model = GPTLMHeadModel.from_pretrained(model_name, config, device=device, dtype=dtype)
    model.eval()

    torch.manual_seed(0)
    # OPT tokenizer requires use_fast=False
    # https://huggingface.co/docs/transformers/model_doc/opt
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    eos_token_id = tokenizer.eos_token_id

    # input_ids = tokenizer("In a distant galaxy, a spaceship", return_tensors="pt").input_ids.to(
    #     device=device
    # )
    # Build a random batch of prompts from the wikitext test split.
    dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

    tokens = tokenize_dataset(dataset, tokenizer)
    input_ids = get_random_batch(tokens, args.batch_size, args.seq_len)
    input_ids = input_ids.to(device=device)
    max_length = args.seq_len + 20

    # input_ids = torch.randint(0, 100, (2, 10), dtype=torch.long, device='cuda')
    # max_length = input_ids.shape[1] + 40

    # warm up
    _ = model.generate(
        input_ids=input_ids,
        max_length=max_length,
        eos_token_id=eos_token_id,
        return_dict_in_generate=True,
        output_scores=True,
        enable_timing=False,
    )

    print("Without CUDA graph")
    torch.cuda.synchronize()
    start = time.time()
    out = model.generate(
        input_ids=input_ids,
        max_length=max_length,
        eos_token_id=eos_token_id,
        return_dict_in_generate=True,
        output_scores=True,
        enable_timing=False,
    )
    torch.cuda.synchronize()
    elapsed_time = (time.time() - start) * 1000
    print(f"Prompt processing + decoding time: {elapsed_time:.0f} ms")

    # Compute throughput and latency per token
    num_tokens_generated = out.sequences.shape[1] - input_ids.shape[1]
    throughput = (args.batch_size * num_tokens_generated) / (elapsed_time / 1000)
    latency_per_token = elapsed_time / num_tokens_generated  # ms per token

    # print(f"Number of tokens generated: {num_tokens_generated}")
    print(f"Throughput: {throughput:.1f} tokens/second")
    print(f"Latency per token: {latency_per_token:.1f} ms")


    if args.print_results:
        # print(out.sequences)
        print(tokenizer.batch_decode(out.sequences.tolist()))

    # ============================================================================= #

    print("\n")
    if getattr(config, "use_flash_attn", False):
        # Capture graph outside the timing loop
        batch_size, seqlen_og = input_ids.shape
        model._decoding_cache = update_graph_cache(model, None, batch_size, seqlen_og, max_length)
        print("With CUDA graph")
        torch.cuda.synchronize()
        start = time.time()
        out_cg = model.generate(
            input_ids=input_ids,
            max_length=max_length,
            cg=True,
            return_dict_in_generate=True,
            output_scores=True,
            enable_timing=False,
        )
        torch.cuda.synchronize()
        elapsed_time = (time.time() - start) * 1000
        print(f"Prompt processing + decoding time: {elapsed_time:.0f} ms")

        # Compute throughput and latency per token
        # FIX: was measuring `out` (the non-CUDA-graph run) here; the CG run
        # does not pass eos_token_id, so the two token counts can differ.
        num_tokens_generated = out_cg.sequences.shape[1] - input_ids.shape[1]
        latency_per_token = elapsed_time / num_tokens_generated  # ms per token
        throughput = (args.batch_size * num_tokens_generated) / (elapsed_time / 1000)

        # print(f"Number of tokens generated: {num_tokens_generated}")
        print(f"Throughput: {throughput:.1f} tokens/second")
        print(f"Latency per token: {latency_per_token:.1f} ms")

        if args.print_results:
            # print(out_cg.sequences)
            print(tokenizer.batch_decode(out_cg.sequences.tolist()))
HybridTensor/benchmarks/generation/opt_sparse_gen_tp.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers.models.opt import OPTConfig
2
+ from transformers import AutoTokenizer
3
+ from flash_attn.models.opt import opt_config_to_gpt2_config
4
+
5
+ import os
6
+ import torch
7
+ import argparse
8
+ from apex.transformer import parallel_state
9
+
10
+ from HybridTensor.utils.utils import arg_parser, _get_device
11
+ from HybridTensor.utils.activations import OPT_MODELS
12
+ from HybridTensor.models.opt import SparseConfig, build_sparse_opt
13
+
14
def update_router_config(model, num_layers, mlp_act_th, attn_topk, layer_config = None):
    """Apply one MLP activation threshold and attention top-k to the first
    `num_layers` transformer layers. (`layer_config` is accepted but unused.)"""
    for layer in model.transformer.layers[:num_layers]:
        layer.mlp_router.act_th = mlp_act_th
        layer.mha_router.topk = attn_topk
18
+
19
def initialize_distributed_environment():
    """Initialize torch.distributed with the NCCL backend for tensor parallelism.

    Uses the env:// init method, so RANK / WORLD_SIZE / MASTER_ADDR /
    MASTER_PORT are expected to be provided by the launcher (e.g. torchrun).

    Returns:
        (device, world_size): the CUDA device string for this rank and the
        total number of participating processes.
    """
    # Set environment variables for NCCL
    # (presumably disabled to keep NCCL compatible with CUDA-graph capture — TODO confirm)
    os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "0"
    os.environ["NCCL_GRAPH_MIXING_SUPPORT"] = "0"

    # Initialize the distributed process group
    torch.distributed.init_process_group(backend="nccl", init_method="env://")

    # Set the device based on the rank of the current process
    device = f"cuda:{torch.distributed.get_rank()}"
    world_size = torch.distributed.get_world_size()

    # Set the current CUDA device to avoid operations being executed on the wrong GPU
    torch.cuda.set_device(device)

    # You can return device, world_size, and any other relevant information
    return device, world_size
36
+
37
+
38
def _str2bool(value):
    """Parse textual boolean spellings for argparse flags.

    `type=bool` is an argparse pitfall: bool("False") is True, so any
    non-empty string turns the flag on. This helper parses the text properly.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("yes", "true", "t", "y", "1"):
        return True
    if lowered in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def arg_parser(argv=None):
    """Parse inference-benchmarking CLI arguments for the sparse TP run.

    Args:
        argv: optional list of argument strings; defaults to sys.argv[1:]
            (passing a list makes the parser unit-testable).

    Returns:
        argparse.Namespace with the parsed options.
    """
    parser = argparse.ArgumentParser(description='Inference benchmarking')
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--model_index', type=int, default=5)
    parser.add_argument('--seq_len', type=int, default=28)
    parser.add_argument('--index_size', type=int, default=8192)
    parser.add_argument('--head_density', type=float, default=0.25)
    # FIX: these were `type=bool`, which treats any non-empty string (even "False") as True.
    parser.add_argument('--print_results', type=_str2bool, default=True)
    parser.add_argument('--iterations', type=int, default=100)
    parser.add_argument('--check_results', type=_str2bool, default=False)
    parser.add_argument('--results_dir', type=str, default='results')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--mlp_ckpt_dir', type=str, default='<PATH_TO_MLP_ROUTER_CHECKPOINTS>')
    parser.add_argument('--attn_topk', type=float, default=0.5, help='Attention topk for sparse model')
    parser.add_argument('--attn_ckpt_dir', type=str, default='<PATH_TO_ATTENTION_CHECKPOINTS>')

    return parser.parse_args(argv)
55
+
56
if __name__ == "__main__":
    # Sparse OPT generation smoke test under tensor parallelism. One process
    # per GPU; launch with torchrun so env:// initialization finds RANK/WORLD_SIZE.
    args = arg_parser()
    model_name = OPT_MODELS[args.model_index-1]

    device, world_size = initialize_distributed_environment()
    dtype = torch.float16

    parallel_state.initialize_model_parallel(tensor_model_parallel_size_=world_size)
    rank = parallel_state.get_tensor_model_parallel_rank()
    process_group = parallel_state.get_tensor_model_parallel_group()

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = build_sparse_opt(model_name, args.mlp_ckpt_dir, args.attn_ckpt_dir, device = device, dtype=dtype, process_group = process_group, world_size = world_size, rank = rank)
    model.eval()
    print("Model loaded with sparse routers")

    # Uniform router settings for all layers (hard-coded for this smoke test;
    # note args.attn_topk is NOT used here).
    mlp_act_th = 0.5
    attn_topk = 0.5

    update_router_config(model, model.config.n_layer, mlp_act_th, attn_topk)
    print("Router config updated")

    # print router configs from all layers
    # for i in range(model.config.n_layer):
    #     print(f"Layer {i}: mlp_act_th = {model.transformer.layers[i].mlp_router.act_th}, attn_topk = {model.transformer.layers[i].mha_router.topk}")

    input_texts = ["Hello, my dog is cute and", "The future of AI is", "In a distant galaxy, a spaceship", "The cat is sleeping on the "]
    # input_texts = ["Hello, my dog is cute and", "Hello, my rat is cute and"]

    tokenized_inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True).to(device)
    input_ids=tokenized_inputs["input_ids"]

    # input_ids = tokenizer("Hello, my dog is cute and", return_tensors="pt").input_ids.to(device=device)
    max_length = args.seq_len
    position_ids = None
    eos_token_id = tokenizer.eos_token_id
    num_layers = model.config.n_layer

    # print all the model weights and check the accuracy
    # if rank == 0:
    #     print(model.state_dict())

    # out = model(input_ids)
    # print(out)

    out = model.generate(
        input_ids=input_ids,
        max_length=max_length,
        eos_token_id=eos_token_id,
        return_dict_in_generate=True,
        output_scores=True,
        enable_timing=True,
    )
    # Only rank 0 prints, to avoid duplicated output across TP ranks.
    if rank == 0:
        print(tokenizer.batch_decode(out.sequences.tolist()))
112
+
HybridTensor/benchmarks/generation/opt_sparse_generation.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import argparse
3
+
4
+ from HybridTensor.utils.utils import _get_device
5
+ from HybridTensor.utils.activations import OPT_MODELS
6
+ from HybridTensor.models.opt import SparseConfig, build_sparse_opt
7
+ from HybridTensor.benchmarks.generation.gen_util import tokenize_dataset, get_random_batch
8
+ from HybridTensor.utils.activations import build_mlp_topk_lookup
9
+ from HybridTensor.routers.mlp.mlp_router_optim import load_router_dict_from_csv
10
+
11
+ from datasets import load_dataset
12
+
13
+ from transformers.models.opt import OPTConfig
14
+ from transformers import AutoTokenizer
15
+ from flash_attn.models.opt import opt_config_to_gpt2_config
16
+ from flash_attn.utils.generation import update_graph_cache
17
+
18
+ def arg_parser():
19
+ parser = argparse.ArgumentParser(description='Inference benchmarking')
20
+ parser.add_argument('--batch_size', type=int, default=16)
21
+ parser.add_argument('--model_index', type=int, default=5)
22
+ parser.add_argument('--seq_len', type=int, default=1024)
23
+ parser.add_argument('--index_size', type=int, default=8192)
24
+ parser.add_argument('--head_density', type=float, default=0.5)
25
+ parser.add_argument('--print_results', type=bool, default=True)
26
+ parser.add_argument('--iterations', type=int, default=1)
27
+ parser.add_argument('--check_results', type=bool, default=False)
28
+ parser.add_argument('--results_dir', type=str, default='results')
29
+ parser.add_argument('--gpu', type=int, default=0)
30
+ parser.add_argument('--attn_topk', type=float, default=0.5, help='Attention topk for sparse model')
31
+ parser.add_argument('--mlp_ckpt_dir', type=str, default='<PATH_TO_MLP_ROUTER_CHECKPOINTS>')
32
+ parser.add_argument('--attn_ckpt_dir', type=str, default='<PATH_TO_ATTENTION_CHECKPOINTS>')
33
+ parser.add_argument('--batch_stats_dir', type=str, default='configs/mlp_router/opt-6.7b')
34
+ parser.add_argument('--delta', type=int, default=256, help='Delta value for MLP topk calculation')
35
+ parser.add_argument('--use_cuda_graph', type=bool, default=False, help='Use CUDA graph for inference')
36
+
37
+ return parser.parse_args()
38
+
39
+ def update_router_config(model, num_layers, mlp_topk_lookup, attn_topk):
40
+ for i in range(num_layers):
41
+ model.transformer.layers[i].mlp_topk = mlp_topk_lookup[i]
42
+ # model.transformer.layers[i].mlp_topk = 512
43
+ model.transformer.layers[i].mha_router.topk = attn_topk
44
+
45
+ # model.transformer.layers[i].skip_mlp_router = True
46
+ model.transformer.layers[0].mha_router.topk = 1.0 # dense attention in layer 0
47
+
48
+ if __name__ == "__main__":
49
+ args = arg_parser()
50
+ model_name = OPT_MODELS[args.model_index-1]
51
+ dtype = torch.float16
52
+ device= _get_device(args.gpu)
53
+
54
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
55
+
56
+ # args.mlp_ckpt_dir = None
57
+ # args.attn_ckpt_dir = None
58
+
59
+ model = build_sparse_opt(args, model_name, args.mlp_ckpt_dir, args.attn_ckpt_dir, device = device, dtype=dtype)
60
+ model.eval()
61
+ print(model)
62
+ print("Model loaded with sparse routers")
63
+
64
+ # mlp_topk_lookup = build_mlp_topk_lookup("results/mlp_results/batch_activations/opt-6.7b", args.batch_size, args.delta)
65
+ mlp_topk_lookup = load_router_dict_from_csv(args.batch_stats_dir, args.batch_size)
66
+ print("MLP topk values updated: ", mlp_topk_lookup)
67
+ update_router_config(model, model.config.n_layer, mlp_topk_lookup, args.attn_topk) # this sets the router config for all layers using a single config
68
+ # update_router_config(model, model.config.n_layer, 2048, args.attn_topk)
69
+ print("Router config updated \n")
70
+
71
+
72
+ max_length = args.seq_len + 20
73
+ batch_size = args.batch_size
74
+
75
+ # input_texts = ["Hello, my dog is cute and", "The future of AI is", "In a distant galaxy, a spaceship", "The cat is sleeping on the "]
76
+ # input_texts = ["In a distant galaxy, a spaceship"]
77
+ # tokenized_inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=False).to(device)
78
+ # input_ids=tokenized_inputs["input_ids"]
79
+
80
+ dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
81
+ tokens = tokenize_dataset(dataset, tokenizer)
82
+ input_ids = get_random_batch(tokens, args.batch_size, args.seq_len).to(device)
83
+
84
+ print("Input ids generated, starting inference")
85
+
86
+ # input_ids = tokenizer("Hello, my dog is cute and he", return_tensors="pt").input_ids.to(device)
87
+ position_ids = None
88
+ eos_token_id = tokenizer.eos_token_id
89
+
90
+ start_event = torch.cuda.Event(enable_timing=True)
91
+ end_event = torch.cuda.Event(enable_timing=True)
92
+
93
+ with torch.no_grad():
94
+ # warm up
95
+ _ = model.generate(
96
+ input_ids=input_ids,
97
+ max_length=max_length,
98
+ eos_token_id=eos_token_id,
99
+ return_dict_in_generate=True,
100
+ output_scores=True,
101
+ enable_timing=False,
102
+ cg=False,
103
+ )
104
+
105
+ print("Warm up done")
106
+
107
+ start_event.record()
108
+ for i in range(args.iterations):
109
+ out = model.generate(
110
+ input_ids=input_ids,
111
+ max_length=max_length,
112
+ eos_token_id=eos_token_id,
113
+ return_dict_in_generate=True,
114
+ output_scores=True,
115
+ enable_timing=False,
116
+ cg=False,
117
+ )
118
+
119
+ end_event.record()
120
+
121
+ torch.cuda.synchronize()
122
+ print("Without CUDA graph")
123
+ elapsed_time = start_event.elapsed_time(end_event) / args.iterations
124
+ print(f"Average time per genearation : {elapsed_time:.1f} ms")
125
+
126
+ # Compute throughput and latency per token
127
+ num_tokens_generated = out.sequences.shape[1] - input_ids.shape[1]
128
+ throughput = batch_size * num_tokens_generated / (elapsed_time / 1000) # tokens per second
129
+ latency_per_token = elapsed_time / num_tokens_generated # ms per token
130
+
131
+ print(f"Number of tokens generated: {num_tokens_generated}")
132
+ print(f"Throughput: {throughput:.1f} tokens/second")
133
+ print(f"Latency per token: {latency_per_token:.1f} ms")
134
+
135
+ # print(tokenizer.batch_decode(out.sequences.tolist()))
136
+ print("\n")
137
+
138
+ # print only the new tokens generated
139
+ print("New tokens generated:")
140
+ print(tokenizer.batch_decode(out.sequences[:, input_ids.shape[1]:].tolist()))
141
+
142
+ # ====================== With CUDA graph ======================
143
+ if args.use_cuda_graph:
144
+ batch_size, seqlen_og = input_ids.shape
145
+ model._decoding_cache = update_graph_cache(model, None, batch_size, seqlen_og, max_length)
146
+ print("With CUDA graph")
147
+ torch.cuda.synchronize()
148
+
149
+ start_event.record()
150
+
151
+ for i in range(args.iterations):
152
+ out = model.generate(
153
+ input_ids=input_ids,
154
+ max_length=max_length,
155
+ cg=True,
156
+ eos_token_id=eos_token_id,
157
+ return_dict_in_generate=True,
158
+ output_scores=True,
159
+ enable_timing=False,
160
+ )
161
+
162
+ end_event.record()
163
+
164
+ torch.cuda.synchronize()
165
+
166
+
167
+ elapsed_time = start_event.elapsed_time(end_event) / args.iterations
168
+ print(f"Average time per genearation : {elapsed_time:.1f} ms")
169
+
170
+ # Compute throughput and latency per token
171
+ num_tokens_generated = out.sequences.shape[1] - input_ids.shape[1]
172
+ throughput = batch_size * num_tokens_generated / (elapsed_time / 1000) # tokens per second
173
+ latency_per_token = elapsed_time / num_tokens_generated # ms per token
174
+
175
+ print(f"Number of tokens generated: {num_tokens_generated}")
176
+ print(f"Throughput: {throughput:.1f} tokens/second")
177
+ print(f"Latency per token: {latency_per_token:.1f} ms")
178
+
179
+ # print(tokenizer.batch_decode(out.sequences.tolist()))
180
+ print("New tokens generated:")
181
+ print(tokenizer.batch_decode(out.sequences[:, input_ids.shape[1]:].tolist()))
182
+
HybridTensor/benchmarks/model_eval.py ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import argparse
3
+ import os
4
+ import json
5
+ import logging
6
+ import numpy as np
7
+ import csv
8
+
9
+ # from hf_models.opt.modeling_opt_routers import (
10
+ # SparseOPTForCausalLM,
11
+ # create_hf_mha_router_state_dict,
12
+ # create_hf_mlp_router_state_dict
13
+ # )
14
+
15
+ from hf_models.opt.modeling_opt_routers_topk import (
16
+ SparseOPTForCausalLM,
17
+ create_hf_mha_router_state_dict,
18
+ create_hf_mlp_router_state_dict
19
+ )
20
+
21
+ from hf_models.llama.modeling_sparse_llama_routers import (
22
+ SparseLlamaForCausalLM,
23
+ create_hf_attn_router_state_dict
24
+ )
25
+
26
+ from hf_models.opt.modeling_sparse_opt_topk import SparseOPTForCausalLM as SparseOPTTopKAttn
27
+ from hf_models.llama.modeling_sparse_llama_mha_topk import SparseLlamaForCausalLM as SparseLlamaTopKAttn
28
+ from HybridTensor.benchmarks.opt_attn_sparse_topk_perplexity import _update_model_attn_thresholds
29
+ from HybridTensor.benchmarks.model_perplexity import compute_attn_layer_sparsity, compute_average_activation
30
+ from HybridTensor.utils.activations import ActivationThresholds, build_mlp_topk_lookup, _update_hf_mlp_topk, CONFIGS, MODELS
31
+ from HybridTensor.routers.mlp.mlp_router_optim import load_router_dict_from_csv
32
+ from HybridTensor.utils.utils import extract_model_name
33
+
34
+ from transformers import AutoTokenizer, AutoModelForCausalLM
35
+
36
+ from lm_eval.models.huggingface import HFLM
37
+ from lm_eval.tasks import TaskManager
38
+ import lm_eval
39
+
40
+ import pandas as pd
41
+ from tabulate import tabulate
42
+
43
+
44
+ import logging
45
+ logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)
46
+
47
+ import warnings
48
+ warnings.simplefilter(action='ignore', category=FutureWarning)
49
+
50
+
51
+ from huggingface_hub import login
52
+
53
+ def read_and_print_results(filepath='results.csv'):
54
+ """
55
+ Reads the CSV file containing evaluation results and prints them in a formatted table.
56
+ """
57
+ if not os.path.exists(filepath):
58
+ print(f"File '{filepath}' not found.")
59
+ return
60
+
61
+ df = pd.read_csv(filepath)
62
+ print(tabulate(df, headers='keys', tablefmt='psql', showindex=False))
63
+
64
+ def save_results_to_csv(results, attn_topk, filepath='eval_results.csv'):
65
+ """
66
+ Extracts benchmark accuracies from results and saves them along with the attn_topk config.
67
+
68
+ Parameters:
69
+ results: dict, evaluation results with structure results['results'][<benchmark>]['acc,none']
70
+ attn_topk: float, the attention top-k value used for this run
71
+ filepath: str, CSV file to write to (appends if it exists)
72
+ """
73
+ # Build a dictionary row with attn_topk and each benchmark's accuracy
74
+ row = {'attn_topk': attn_topk}
75
+ for benchmark, data in results['results'].items():
76
+ # Default to None if the key is missing
77
+ row[benchmark] = data.get('acc,none', None)
78
+
79
+ # Check if file exists to decide on writing header
80
+ file_exists = os.path.isfile(filepath)
81
+ with open(filepath, 'a', newline='') as csvfile:
82
+ writer = csv.DictWriter(csvfile, fieldnames=row.keys())
83
+ if not file_exists:
84
+ writer.writeheader()
85
+ writer.writerow(row)
86
+
87
+ def _update_model_attn_sparsity(model, attn_th):
88
+ num_layers = model.config.num_hidden_layers
89
+
90
+ # Use the 'decoder' attribute if it exists; otherwise use model.model.layers
91
+ layers = model.model.decoder.layers if hasattr(model.model, 'decoder') else model.model.layers
92
+ attn_sparsity_map = compute_attn_layer_sparsity(model_name=model_name, min_th=0.2, critical_th=0.3, attn_sparsity=attn_th)
93
+
94
+ for i in range(num_layers):
95
+ layers[i].self_attn.sp_threshold = attn_sparsity_map[i]
96
+
97
+ average_act = compute_average_activation(attn_sparsity_map)
98
+ print(f"Attention sparsity {attn_th}: {attn_sparsity_map}")
99
+ print(f"Average activation: {average_act:.2f}")
100
+
101
+ return model
102
+
103
+ def _evaluate_model(model, tokenizer, benchmarks: list, device: str, batch_size: int = 8):
104
+ logging.info("Evaluating on benchmarks: %s", benchmarks)
105
+ lm_obj = HFLM(
106
+ pretrained=model,
107
+ tokenizer=tokenizer,
108
+ device=device,
109
+ batch_size=batch_size
110
+ )
111
+ task_manager = TaskManager()
112
+ num_fewshot = 5
113
+ print(f"Number of fewshot examples: {num_fewshot}")
114
+ results = lm_eval.simple_evaluate(
115
+ model=lm_obj,
116
+ tasks=benchmarks,
117
+ num_fewshot=num_fewshot, # change this
118
+ task_manager=task_manager
119
+ )
120
+ logging.info("Evaluation complete.")
121
+ for benchmark, benchmark_results in results['results'].items():
122
+ logging.info("Results for %s: %s", benchmark.upper(), benchmark_results)
123
+ return results
124
+
125
+ def _load_model(model_name, num_layers, device, args):
126
+ if args.mode == 'sparse':
127
+ logging.info("Loading sparse model...")
128
+ sp_thresholds = ActivationThresholds(num_layers=num_layers, attn_th= args.attn_topk, mlp_th=args.mlp_topk)
129
+
130
+ if args.model_index <=8:
131
+ # OPT models
132
+ model = SparseOPTForCausalLM.from_pretrained(
133
+ model_name,
134
+ device_map=device,
135
+ torch_dtype=torch.float16,
136
+ sp_thresholds=sp_thresholds.activation_threshold,
137
+ mlp_thresholds=sp_thresholds.mlp_threshold,
138
+ attn_implementation="flash_attention_2"
139
+ )
140
+ logging.info("Loading router states...")
141
+ mlp_router_state = create_hf_mlp_router_state_dict(args.mlp_ckpt_dir)
142
+ mha_router_state = create_hf_mha_router_state_dict(args.attn_ckpt_dir)
143
+ model_state = model.state_dict()
144
+ model_state.update(mlp_router_state)
145
+ model_state.update(mha_router_state)
146
+ model.load_state_dict(model_state)
147
+ logging.info("Sparse model loaded with routers!")
148
+
149
+ # load topk values for mlp and attn here
150
+ # mlp_topk_lookup = build_mlp_topk_lookup(args.batch_stats_dir, args.batch_size, args.delta)
151
+ # mlp_topk_lookup = build_mlp_topk_lookup(args.batch_stats_dir, 1, args.delta)
152
+ mlp_topk_lookup = load_router_dict_from_csv(args.batch_stats_dir, 1)
153
+
154
+ _update_hf_mlp_topk(model, mlp_topk_lookup)
155
+ # print("MLP topk values updated.")
156
+ # print("MLP topk values: ", mlp_topk_lookup)
157
+ logging.info("Using MLP topk values: %s", mlp_topk_lookup)
158
+
159
+ # print("Using delta value: ", args.delta)
160
+
161
+ # the first layer should use dense attention
162
+ model.model.decoder.layers[0].self_attn.sp_threshold = 1.0
163
+ else:
164
+ # Llama models
165
+
166
+ if not args.static_thresholds:
167
+ attn_sparsity_map = compute_attn_layer_sparsity(model_name=model_name, min_th=0.2, critical_th=0.3, attn_sparsity=args.attn_topk)
168
+ sp_thresholds.load_thresholds(attn_sparsity_map)
169
+ average_act = compute_average_activation(attn_sparsity_map)
170
+ print(f"Layer imporatance weights attention activations {sp_thresholds.activation_threshold}")
171
+ print(f"Average activation: {average_act:.2f}")
172
+
173
+ model = SparseLlamaForCausalLM.from_pretrained(model_name,
174
+ device_map = device,
175
+ torch_dtype=torch.float16,
176
+ sp_thresholds = sp_thresholds.activation_threshold,
177
+ attn_implementation="flash_attention_2")
178
+ logging.info("Loading router states...")
179
+ model_state = model.state_dict()
180
+ attn_router_states = create_hf_attn_router_state_dict(args.attn_ckpt_dir)
181
+ model_state.update(attn_router_states)
182
+ model.load_state_dict(model_state)
183
+ logging.info("Sparse model loaded with routers!")
184
+
185
+ # the first layer should use dense attetnion
186
+ _update_model_attn_thresholds(model, args.attn_topk)
187
+
188
+ # load topk values for mha here
189
+ # TODO: create a function to update the topk values for mha
190
+
191
+ elif args.mode == 'sparse_attn':
192
+ logging.info("Loading model with sparse attention")
193
+ sp_thresholds = ActivationThresholds(num_layers=num_layers, attn_th=args.attn_topk)
194
+
195
+ if not args.static_thresholds:
196
+ attn_sparsity_map = compute_attn_layer_sparsity(model_name=model_name, min_th=0.2, critical_th=0.3, attn_sparsity=args.attn_topk)
197
+ sp_thresholds.load_thresholds(attn_sparsity_map)
198
+ average_act = compute_average_activation(attn_sparsity_map)
199
+ print(f"Layer imporatance weights attention activations {sp_thresholds.activation_threshold}")
200
+ print(f"Average activation: {average_act:.2f}")
201
+
202
+ if args.model_index <= 8:
203
+ # opt models
204
+ model = SparseOPTTopKAttn.from_pretrained(model_name, device_map = device, torch_dtype=torch.float16, sp_thresholds = sp_thresholds.activation_threshold, attn_implementation="flash_attention_2")
205
+ else:
206
+ # llama models
207
+ model = SparseLlamaTopKAttn.from_pretrained(model_name, device_map = device, torch_dtype=torch.float16, sp_thresholds = sp_thresholds.activation_threshold, attn_implementation="flash_attention_2")
208
+ else:
209
+ logging.info("Loading dense model...")
210
+ model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device, torch_dtype=torch.float16)
211
+ return model
212
+
213
+ def arg_parser():
214
+ parser = argparse.ArgumentParser(description='Inference benchmarking')
215
+ parser.add_argument('--batch_size', type=int, default=8)
216
+ parser.add_argument('--model_index', type=int, default=5)
217
+ parser.add_argument('--print_results', type=bool, default=True)
218
+ parser.add_argument('--results_dir', type=str, default='results/eval')
219
+ parser.add_argument('--device', type=int, default=100)
220
+ parser.add_argument('--mode', type=str, default='sparse', choices=['sparse', 'dense', 'sparse_attn'])
221
+ parser.add_argument('--attn_topk', type=float, default=0.5, help='Attention topk for sparse model')
222
+ parser.add_argument('--mlp_topk', type=int, default=2048, help='MLP topk for sparse model')
223
+ parser.add_argument('--delta', type=int, default=128, help='Delta value for MLP topk calculation')
224
+ parser.add_argument('--mlp_ckpt_dir', type=str, default='<PATH_TO_MLP_ROUTER_CHECKPOINTS>')
225
+ parser.add_argument('--attn_ckpt_dir', type=str, default='<PATH_TO_ATTENTION_CHECKPOINTS>')
226
+ parser.add_argument('--batch_stats_dir', type=str, default='configs/mlp_router')
227
+ parser.add_argument('--data_collection', type=bool, default=False, help='Collect data for different activation thresholds')
228
+ parser.add_argument('--benchmark', type=str, default='all', help='Options: all, or a single benchmark name')
229
+ parser.add_argument('--note', type=str, default='', help='Note to add to the results filename')
230
+ parser.add_argument('--static_thresholds', type=bool, default=True, help='Use static thresholds for attention layers')
231
+ return parser.parse_args()
232
+
233
+ if __name__ == "__main__":
234
+ args = arg_parser()
235
+
236
+ login_token = None # insert your token here
237
+ assert login_token is not None, "Please provide a valid Hugging Face token."
238
+ login(token=login_token)
239
+
240
+ # Setup logging
241
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
242
+
243
+ model_name = MODELS[args.model_index - 1]
244
+ # print(f"Evaluating Model: {model_name}")
245
+ logging.info("Evaluating Model: %s", model_name)
246
+ logging.info("Mode: %s", args.mode)
247
+
248
+ num_layers = CONFIGS[model_name]['num_layer']
249
+ device = 'auto' if args.device == 100 else f'cuda:{args.device}'
250
+
251
+ # Load tokenizer and model
252
+ tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
253
+ model = _load_model(model_name, num_layers, device, args)
254
+ model.eval()
255
+
256
+ # Determine benchmarks to evaluate
257
+ if args.benchmark == 'all':
258
+ benchmarks = ["piqa", "winogrande", "copa", "rte", "openbookqa", "arc_easy", "arc_challenge", "mmlu", "hellaswag"]
259
+ else:
260
+ benchmarks = [args.benchmark]
261
+
262
+ model_name_clean = extract_model_name(model_name)
263
+
264
+ if args.data_collection:
265
+ # make sure the model is not dense
266
+ assert args.mode != 'dense', "Data collection is only available for sparse models"
267
+ logging.info("Data collection mode enabled.")
268
+ if args.mode == 'sparse':
269
+ filepath = f"{args.results_dir}/eval_results_{model_name_clean}_sparse_sweep_dpsd.csv"
270
+ else: # sparse_attn
271
+ filepath = f"{args.results_dir}/eval_results_{model_name_clean}_attn_sweep_dpsd.csv"
272
+
273
+ if args.note != '':
274
+ filepath = filepath.replace('.csv', f"_{args.note}.csv")
275
+ # attn_topk_values = [1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1] # MHA
276
+ attn_topk_values = [0.9, 0.8, 0.7, 0.6, 0.4, 0.3, 0.2, 0.1]
277
+ # attn_topk_values = [7/8, 6/8, 5/8, 4/8, 3/8, 2/8, 1/8] # GQA
278
+ for attn_topk in attn_topk_values:
279
+ logging.info("Evaluating with attention top-k value: %s", attn_topk)
280
+ if args.static_thresholds:
281
+ _update_model_attn_thresholds(model, attn_topk, mode=args.mode)
282
+ else:
283
+ _update_model_attn_sparsity(model, attn_topk)
284
+
285
+ results = _evaluate_model(
286
+ model=model,
287
+ tokenizer=tokenizer,
288
+ benchmarks=benchmarks,
289
+ device=device,
290
+ batch_size=args.batch_size
291
+ )
292
+ save_results_to_csv(results, attn_topk, filepath = filepath)
293
+ else:
294
+ logging.info("Evaluating with attention top-k value: %s", args.attn_topk)
295
+ if args.mode == 'dense':
296
+ filepath = f"{args.results_dir}/eval_results_{model_name_clean}_dense.csv"
297
+ elif args.mode == 'sparse_attn':
298
+ filepath = f"{args.results_dir}/eval_results_{model_name_clean}_sparse_attn_{args.attn_topk}_dpsd.csv"
299
+ else:
300
+ filepath = f"{args.results_dir}/eval_results_{model_name_clean}_test_attn_{args.attn_topk}_dpsd.csv"
301
+ if args.note != '':
302
+ filepath = filepath.replace('.csv', f"_{args.note}.csv")
303
+ results = _evaluate_model(
304
+ model=model,
305
+ tokenizer=tokenizer,
306
+ benchmarks=benchmarks,
307
+ device=device,
308
+ batch_size=args.batch_size
309
+ )
310
+ save_results_to_csv(results, args.attn_topk, filepath = filepath)
311
+
312
+ if args.print_results:
313
+ read_and_print_results(filepath=filepath)
HybridTensor/benchmarks/model_perplexity.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # python -m HybridTensor.benchmarks.model_perplexity --model_index 14 --batch_size 4 --max_length 512 --attn_th 1 --static_thresholds True
2
+
3
+ import sys
4
+ import math
5
+ import torch
6
+ from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
7
+
8
+ from hf_models.opt.modeling_sparse_opt_topk import SparseOPTForCausalLM as SparseOPTTopkAttn
9
+ from hf_models.llama.modeling_sparse_llama_mha_topk import SparseLlamaForCausalLM as SparseLlamaTopKAttn
10
+ from HybridTensor.utils.activations import ActivationThresholds, identify_model_type, MODELS, CONFIGS
11
+ from HybridTensor.utils.utils import extract_model_name, compute_perplexity
12
+ import argparse
13
+ from datasets import load_dataset
14
+ import json
15
+ from torch.utils.data import DataLoader
16
+ from tqdm import tqdm
17
+ import pandas as pd
18
+
19
+
20
+ from HybridTensor.benchmarks.opt_attn_sparse_topk_perplexity import (_update_model_attn_thresholds,
21
+ build_data_loader,
22
+ compute_sparse_perplexity,
23
+ compute_perplexity_data_collection,
24
+ display_model_menu,
25
+ _interactive_mode,
26
+ arg_parser,
27
+ )
28
+
29
+
30
+ results_dir = "results/activations"
31
+
32
+ def compute_attn_layer_sparsity(model_name, min_th, critical_th, attn_sparsity):
33
+ # Get model configuration
34
+ # model_name = MODELS[model_index - 1]
35
+ model_config = CONFIGS[model_name]
36
+ num_layers = model_config['num_layer']
37
+
38
+ # Load the importance scores from the file specified in the configuration
39
+ file_path = model_config['layer_imp']
40
+ with open(file_path, 'r') as f:
41
+ attn_layer_imp = json.load(f)
42
+ layer_importance = attn_layer_imp['importance_scores']
43
+
44
+ # Classify layers as critical or sparse
45
+ critical_layers = [i for i, imp in enumerate(layer_importance) if imp >= critical_th]
46
+ sparse_layers = [i for i, imp in enumerate(layer_importance) if imp < critical_th]
47
+
48
+ # Calculate total sparse importance and the attention value
49
+ sum_sparse_importance = sum(layer_importance[i] for i in sparse_layers)
50
+ attn_val = attn_sparsity * len(sparse_layers)
51
+
52
+ # Compute the sparsity map per layer
53
+ layer_sparsity_map = {}
54
+ for layer_idx in range(num_layers):
55
+ if layer_idx in critical_layers:
56
+ layer_sparsity_map[layer_idx] = 1.0 # Fully dense for critical layers
57
+ else:
58
+ if sum_sparse_importance > 0:
59
+ raw_fraction = (layer_importance[layer_idx] / sum_sparse_importance) * attn_val
60
+ else:
61
+ raw_fraction = attn_sparsity
62
+ # Clamp the fraction between min_th and 1.0
63
+ fraction = max(raw_fraction, min_th)
64
+ fraction = min(fraction, 1.0)
65
+ layer_sparsity_map[layer_idx] = fraction
66
+
67
+ return layer_sparsity_map
68
+
69
+ def compute_average_activation(layer_sparsity_map):
70
+ """
71
+ Computes the average activation for each layer based on the sparsity map.
72
+ """
73
+ total_activation = 0.0
74
+ for layer_idx, fraction in layer_sparsity_map.items():
75
+ total_activation += fraction
76
+
77
+ average_activation = total_activation / len(layer_sparsity_map)
78
+ return average_activation
79
+
80
+ def compute_sparse_perplexity(model_name='facebook/opt-125m',
81
+ dataset_name='wikitext',
82
+ dataset_config='wikitext-2-raw-v1',
83
+ batch_size=8,
84
+ max_length=512,
85
+ attn_th=0.0,
86
+ static_thresholds=True,
87
+ device_map="cuda:0"):
88
+ # Set device
89
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
90
+ print(f'Using device: {device}')
91
+
92
+ # load the activation thresholds
93
+ num_layers = CONFIGS[model_name]['num_layer']
94
+ sp_thresholds = ActivationThresholds(num_layers=num_layers, attn_th=attn_th)
95
+
96
+ print(f"Static attention activations: {sp_thresholds.activation_threshold}")
97
+ if not static_thresholds:
98
+ # act_threshold_filepath = CONFIGS[model_name]['sp_config']
99
+ attn_sparsity_map = compute_attn_layer_sparsity(model_name=model_name, min_th=0.2, critical_th=0.3, attn_sparsity=attn_th)
100
+ sp_thresholds.load_thresholds(attn_sparsity_map)
101
+ average_act = compute_average_activation(attn_sparsity_map)
102
+ print(f"Layer imporatance weights attention activations {sp_thresholds.activation_threshold}")
103
+ print(f"Average activation: {average_act:.2f}")
104
+
105
+ # Load tokenizer and model
106
+ # tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
107
+ model_type = identify_model_type(model_name)
108
+ if model_type == 'OPT':
109
+ print(f"Loading OPT model: {model_name}")
110
+ model = SparseOPTTopkAttn.from_pretrained(model_name, device_map = device_map, torch_dtype=torch.float16, sp_thresholds = sp_thresholds.activation_threshold, attn_implementation="flash_attention_2")
111
+ elif model_type == 'Llama':
112
+ print(f"Loading Llama model: {model_name}")
113
+ model = SparseLlamaTopKAttn.from_pretrained(model_name, device_map = device_map, torch_dtype=torch.float16, sp_thresholds = sp_thresholds.activation_threshold, attn_implementation="flash_attention_2")
114
+ model.eval()
115
+
116
+ # # Load dataset
117
+ dataloader = build_data_loader(model_name, dataset_name, dataset_config, batch_size, max_length)
118
+ perplexity = compute_perplexity(model, dataloader, device)
119
+ return perplexity
120
+
121
+
122
+ def arg_parser():
123
+ parser = argparse.ArgumentParser(description='Sparse Perplexity Evaluation')
124
+ parser.add_argument('--model_index', type=int, default=5, help='Index of the model to evaluate')
125
+ parser.add_argument('--batch_size', type=int, default=8, help='Batch size for evaluation')
126
+ parser.add_argument('--max_length', type=int, default=512, help='Maximum sequence length')
127
+ parser.add_argument('--attn_th', type=float, default=0.0, help='Activation threshold for attention layers')
128
+ parser.add_argument('--data_collection', type=bool, default=False, help='Collect data for different activation thresholds')
129
+ parser.add_argument('--device_map', type=str, default='auto', help='Device to use for evaluation')
130
+ parser.add_argument('--interactive', type=bool, default=False, help='Interactive mode for model selection')
131
+ parser.add_argument('--static_thresholds', type=bool, default=False, help='Use static thresholds for attention layers')
132
+
133
+ return parser.parse_args()
134
+
135
+ def main():
136
+ """
137
+ Main function to execute the perplexity computation with user-selected OPT model.
138
+ """
139
+ print("=== OPT Models Perplexity Evaluation ===\n")
140
+ args = arg_parser()
141
+
142
+ if args.interactive:
143
+ selected_model, batch_size, max_length, data_collection, device_map, attn_th = _interactive_mode()
144
+
145
+ else:
146
+ selected_model, batch_size, max_length, data_collection, device_map, attn_th = MODELS[args.model_index-1], args.batch_size, args.max_length, args.data_collection, args.device_map, args.attn_th
147
+ print(f"Selected model: {selected_model}, batch size: {batch_size}, max length: {max_length}, attn_th: {attn_th}, data_collection: {data_collection}, device: {device_map}")
148
+
149
+ if data_collection:
150
+ print("\nStarting data collection...\n")
151
+ compute_perplexity_data_collection(model_name=selected_model, batch_size=batch_size, max_length=max_length, device_map=device_map)
152
+ print("\nData collection complete.\n")
153
+
154
+ else:
155
+ print("\nStarting perplexity computation...\n")
156
+ perplexity = compute_sparse_perplexity(model_name=selected_model, batch_size=batch_size, max_length=max_length,
157
+ attn_th=attn_th,
158
+ device_map=device_map,
159
+ static_thresholds=args.static_thresholds)
160
+ print(f"\n=== Perplexity Results ===")
161
+ print(f"Model: {selected_model}")
162
+ print(f"Perplexity: {perplexity:.2f}\n")
163
+
164
+ if __name__ == "__main__":
165
+ main()
HybridTensor/benchmarks/opt_attn_sparse_topk_perplexity.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+
3
+ import math
4
+ import torch
5
+ from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
6
+ # from hf_models.opt.modeling_sparse_opt import SparseOPTForCausalLM
7
+ from hf_models.opt.modeling_sparse_opt_topk import SparseOPTForCausalLM
8
+ from HybridTensor.utils.activations import ActivationThresholds, MODELS, CONFIGS
9
+ from HybridTensor.utils.utils import extract_model_name, compute_perplexity
10
+
11
+ import argparse
12
+ from datasets import load_dataset
13
+
14
+ from torch.utils.data import DataLoader
15
+ from tqdm import tqdm
16
+ import pandas as pd
17
+
18
+ results_dir = "results/activations"
19
+
20
+ def _update_model_attn_thresholds(model, attn_th, mode='sparse'):
21
+ num_layers = model.config.num_hidden_layers
22
+
23
+ # Use the 'decoder' attribute if it exists; otherwise use model.model.layers
24
+ layers = model.model.decoder.layers if hasattr(model.model, 'decoder') else model.model.layers
25
+
26
+ for i in range(num_layers):
27
+ layers[i].self_attn.sp_threshold = attn_th
28
+
29
+ # For non-sparse attention, layer 0 should use a threshold of 1.0
30
+ # if mode != 'sparse_attn':
31
+ # layers[0].self_attn.sp_threshold = 1.0
32
+ layers[0].self_attn.sp_threshold = 1.0
33
+
34
+ return model
35
+
36
+
37
def build_data_loader(model_name, dataset_name, dataset_config, batch_size, max_length, split='test'):
    """Construct a padded-batch DataLoader over a tokenized text dataset.

    Args:
        model_name: Hugging Face model id (used only for its tokenizer).
        dataset_name: Dataset name on the Hub.
        dataset_config: Dataset configuration / subset name.
        batch_size: Number of sequences per batch.
        max_length: Tokenization truncation length; examples shorter than
            this (in characters) are filtered out beforehand.
        split: Dataset split to load ('train', 'validation' or 'test').

    Returns:
        A torch DataLoader yielding dicts with 'input_ids' and
        'attention_mask' tensors, right-padded to the batch maximum.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    # Some OPT tokenizers ship without a pad token; reuse EOS for padding.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token

    raw_dataset = load_dataset(dataset_name, dataset_config, split=split)
    raw_dataset = raw_dataset.filter(lambda x: len(x["text"]) >= max_length)

    def _tokenize(batch):
        return tokenizer(batch['text'], return_special_tokens_mask=True, truncation=True, max_length=max_length)

    tokenized = raw_dataset.map(_tokenize, batched=True, remove_columns=['text'])

    def _collate(examples):
        ids = [torch.tensor(ex['input_ids']) for ex in examples]
        masks = [torch.tensor(ex['attention_mask']) for ex in examples]
        # Right-pad every sequence in the batch to the longest one.
        ids = torch.nn.utils.rnn.pad_sequence(ids, batch_first=True, padding_value=tokenizer.pad_token_id)
        masks = torch.nn.utils.rnn.pad_sequence(masks, batch_first=True, padding_value=0)
        return {'input_ids': ids, 'attention_mask': masks}

    return DataLoader(tokenized, batch_size=batch_size, shuffle=False, collate_fn=_collate)
80
+
81
def compute_sparse_perplexity(model_name='facebook/opt-125m', dataset_name='wikitext', dataset_config='wikitext-2-raw-v1', batch_size=8, max_length=512, attn_th=0.0, static_thresholds=True, device_map="cuda:0"):
    """Compute perplexity for a sparse OPT model at one attention threshold.

    Args:
        model_name: Hugging Face id of the sparse OPT checkpoint.
        dataset_name: Dataset name on the Hub.
        dataset_config: Dataset configuration / subset name.
        batch_size: Evaluation batch size.
        max_length: Maximum tokenized sequence length.
        attn_th: Uniform attention activation threshold.
        static_thresholds: When False, per-layer thresholds are loaded from
            the profile file referenced by the model's CONFIGS entry.
        device_map: Device placement passed to ``from_pretrained``.

    Returns:
        The computed perplexity (float).
    """
    # NOTE(review): `device` (cuda/cpu) is used for evaluation while
    # `device_map` controls weight placement — confirm they agree.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'Using device: {device}')

    # Per-layer activation thresholds; uniform unless a profile is loaded.
    layer_count = CONFIGS[model_name]['num_layer']
    sp_thresholds = ActivationThresholds(num_layers=layer_count, attn_th=attn_th)

    if not static_thresholds:
        act_threshold_filepath = CONFIGS[model_name]['sp_config']
        sp_thresholds.load_thresholds(act_threshold_filepath)
        print(f'Activation thresholds loaded from {act_threshold_filepath}')

    print(sp_thresholds.activation_threshold)

    model = SparseOPTForCausalLM.from_pretrained(model_name, device_map=device_map, torch_dtype=torch.float16, sp_thresholds=sp_thresholds.activation_threshold, attn_implementation="flash_attention_2")
    model.eval()

    loader = build_data_loader(model_name, dataset_name, dataset_config, batch_size, max_length)

    return compute_perplexity(model, loader, device)
109
+
110
def compute_perplexity_data_collection(model_name='facebook/opt-125m', dataset_name='wikitext', dataset_config='wikitext-2-raw-v1', batch_size=8, max_length=512, device_map="cuda:0"):
    """Sweep attention top-k thresholds and record perplexity for each.

    Loads the sparse model once, then for each threshold in the sweep
    updates every layer's attention threshold in place (layer 0 stays
    dense) and re-evaluates perplexity on the test split. Results are
    written to a CSV under ``results/activations``.

    Args:
        model_name: Hugging Face id of the sparse OPT checkpoint.
        dataset_name: Dataset name on the Hub.
        dataset_config: Dataset configuration / subset name.
        batch_size: Evaluation batch size.
        max_length: Maximum tokenized sequence length.
        device_map: Device placement passed to ``from_pretrained``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'Using device: {device}')

    # Build the evaluation DataLoader once; it is reused for every threshold.
    # (Bug fix: the dataset was previously loaded and filtered a second time
    # here and immediately discarded — build_data_loader already does both.)
    dataloader = build_data_loader(model_name, dataset_name, dataset_config, batch_size, max_length)

    attn_thresholds = [1, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]
    print(f"Computing perplexity for the following attention thresholds: {attn_thresholds}")

    # Load the model once with a placeholder threshold; the per-layer values
    # are overwritten inside the sweep loop below.
    num_layers = CONFIGS[model_name]['num_layer']
    sp_thresholds = ActivationThresholds(num_layers=num_layers, attn_th=0.1)
    model = SparseOPTForCausalLM.from_pretrained(model_name, device_map=device_map, torch_dtype=torch.float16, sp_thresholds=sp_thresholds.activation_threshold, attn_implementation="flash_attention_2")
    model.eval()

    results = []
    for attn_th in attn_thresholds:
        print(f'Computing perplexity for attn top k: {attn_th}')

        # Update every layer's threshold in place, then re-evaluate.
        model = _update_model_attn_thresholds(model, attn_th)
        perplexity = compute_perplexity(model, dataloader, device)
        print(f'Perplexity: {perplexity:.2f}\n')
        results.append({
            'model': model_name,
            'top_k': attn_th,
            'perplexity': perplexity
        })

    # Persist the sweep results as CSV in the results directory.
    results_df = pd.DataFrame(results)
    model_name_str = extract_model_name(model_name)
    results_df.to_csv(f'{results_dir}/sparse_perplexity_results_{model_name_str}_topk.csv', index=False)
156
+
157
+
158
def display_model_menu():
    """Print the numbered OPT model menu and return the user's selection.

    Loops until a valid index is entered; entering 'q', 'quit' or 'exit'
    terminates the program.

    Returns:
        The Hugging Face identifier of the selected model.
    """
    print("Available OPT Models:")
    for idx, model in enumerate(MODELS, 1):
        print(f"{idx}. {model}")

    while True:
        raw = input("\nEnter the number corresponding to the model you want to evaluate (e.g., 1): ")
        if raw.lower() in ['q', 'quit', 'exit']:
            print("Exiting the program.")
            sys.exit(0)
        try:
            choice = int(raw)
        except ValueError:
            print("Invalid input. Please enter a valid number.")
            continue
        if 1 <= choice <= len(MODELS):
            selected_model = MODELS[choice - 1]
            print(f"\nYou have selected: {selected_model}\n")
            return selected_model
        print(f"Please enter a number between 1 and {len(MODELS)}.")
184
+
185
+
186
def _interactive_mode():
    """Prompt the user for every evaluation setting.

    Returns:
        Tuple ``(selected_model, batch_size, max_length, data_collection,
        device_map, attn_th)``.
    """
    selected_model = display_model_menu()

    # Batch size (default 8 on empty or invalid input).
    try:
        raw_bs = input("Enter batch size (default=8): ").strip()
        batch_size = int(raw_bs) if raw_bs else 8
    except ValueError:
        print("Invalid input for batch size. Using default value of 8.")
        batch_size = 8

    max_length = 512

    # Whether to run the threshold sweep instead of a single evaluation.
    raw_dc = input("Do you want to collect data for different activation thresholds? (y/n): ").strip()
    data_collection = raw_dc.lower() == 'y'

    # Device placement (default cuda:0).
    device_map = input("Enter device (cuda:0/auto) [default=cuda:0]: ").strip()
    if not device_map:
        device_map = "cuda:0"

    # Attention threshold is only relevant for a single run.
    attn_th = 0.0
    if not data_collection:
        try:
            raw_th = input("Enter activation threshold for attention layers: ").strip()
            attn_th = float(raw_th) if raw_th else 0.0
        except ValueError:
            print("Invalid input for attention threshold. Using default value of 0.")
            attn_th = 0.0

    return selected_model, batch_size, max_length, data_collection, device_map, attn_th
223
+
224
+
225
def arg_parser(argv=None):
    """Parse command-line arguments for sparse perplexity evaluation.

    Args:
        argv: Optional list of argument strings; defaults to ``sys.argv``
            (backward-compatible — existing ``arg_parser()`` calls work).

    Returns:
        argparse.Namespace with the parsed options.
    """
    parser = argparse.ArgumentParser(description='Sparse Perplexity Evaluation')
    parser.add_argument('--model_index', type=int, default=5, help='Index of the model to evaluate')
    parser.add_argument('--batch_size', type=int, default=8, help='Batch size for evaluation')
    parser.add_argument('--max_length', type=int, default=512, help='Maximum sequence length')
    parser.add_argument('--attn_th', type=float, default=0.0, help='Activation threshold for attention layers')
    # Bug fix: `type=bool` makes any non-empty string truthy (even "False"),
    # so boolean switches must use store_true actions. Defaults stay False.
    parser.add_argument('--data_collection', action='store_true', help='Collect data for different activation thresholds')
    parser.add_argument('--device_map', type=str, default='cuda:0', help='Device to use for evaluation')
    parser.add_argument('--interactive', action='store_true', help='Interactive mode for model selection')

    return parser.parse_args(argv)
236
+
237
def main():
    """Run the OPT sparse-attention (top-k) perplexity evaluation CLI.

    Settings come either from an interactive prompt session or from the
    parsed command-line arguments; dispatches to either the threshold
    sweep (data collection) or a single perplexity computation.
    """
    print("=== OPT Models Perplexity Evaluation ===\n")
    args = arg_parser()

    if args.interactive:
        (selected_model, batch_size, max_length,
         data_collection, device_map, attn_th) = _interactive_mode()
    else:
        selected_model = MODELS[args.model_index - 1]
        batch_size = args.batch_size
        max_length = args.max_length
        data_collection = args.data_collection
        device_map = args.device_map
        attn_th = args.attn_th
        print(f"Selected model: {selected_model}, batch size: {batch_size}, max length: {max_length}, attn_th: {attn_th}, data_collection: {data_collection}, device: {device_map}")

    if data_collection:
        print("\nStarting data collection...\n")
        compute_perplexity_data_collection(model_name=selected_model, batch_size=batch_size, max_length=max_length, device_map=device_map)
        print("\nData collection complete.\n")
    else:
        print("\nStarting perplexity computation...\n")
        perplexity = compute_sparse_perplexity(model_name=selected_model, batch_size=batch_size, max_length=max_length, attn_th=attn_th, device_map=device_map)
        print("\n=== Perplexity Results ===")
        print(f"Model: {selected_model}")
        print(f"Perplexity: {perplexity:.2f}\n")
262
+
263
+ if __name__ == "__main__":
264
+ main()
HybridTensor/benchmarks/select_block_decode.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ from tests.test_select_block import create_block, Config, SparseConfig
4
+ import csv
5
+ import time
6
+ import torch
7
+ import torch.nn as nn
8
+ from flash_attn.utils.generation import InferenceParams
9
+ from HybridTensor.utils.utils import arg_parser, _get_device, sparse_index, generate_random_BH_index, get_gpu_name
10
+ from HybridTensor.utils.profiling import cuda_profiler
11
+ import math
12
+ from tqdm import tqdm
13
+
14
def run_simulation(args, batch_size, seq_len, index_size, attn_topk, device, dtype):
    """Benchmark one decode step of a sparse block vs. a regular block.

    Builds both transformer-block variants, runs a prefill plus one eager
    decode step to warm them up, then captures each block's decode step in
    a CUDA graph and times graph replays.

    Args:
        args: Parsed CLI namespace; uses ``args.in_features``,
            ``args.index_size`` and ``args.batch_size``.
        batch_size: Batch size for the simulated inference.
        seq_len: Prefill sequence length.
        index_size: Number of active MLP neurons (mlp top-k) for the sparse block.
        attn_topk: Attention top-k fraction for the sparse block.
        device: Target CUDA device.
        dtype: Compute dtype for the static buffers.

    Returns:
        Tuple ``(regular_graph_time_ms, sparse_graph_time_ms, speedup)``.
    """
    config = Config()
    sp_config = SparseConfig()
    sp_config.attn_topk = attn_topk

    config.hidden_size = args.in_features
    config.num_attention_heads = args.in_features // 128  # assumes head_dim == 128
    config.use_heuristic = False  # use pre-compiled heuristic or compile a new one during runtime

    # Sparse block under test.
    sparse_block = create_block(config, sp_config, layer_idx=0, process_group=None, device=device, dtype=dtype)
    sparse_block.eval()
    sparse_block.mlp_topk = index_size

    # Dense baseline block. NOTE(review): `regular_config = config` aliases
    # the same object, so these flags are also cleared on `config` itself.
    regular_config = config
    regular_config.att_sparse = False
    regular_config.mlp_sparse = False
    regular_block = create_block(regular_config, None, layer_idx=0, process_group=None, device=device, dtype=dtype)
    regular_block.eval()

    # Inference simulation with select block.
    max_seqlen = seq_len + 16  # headroom for decode steps beyond the prefill
    max_batch_size = batch_size
    in_features = args.in_features
    head_dim = 128

    inference_params = InferenceParams(max_seqlen=max_seqlen, max_batch_size=max_batch_size)
    process_group = None
    sequence_parallel = False

    # For testing and debugging: route half of the attention heads.
    heads = config.num_attention_heads
    selected_heads = heads // 2

    # Create a static index vector (length equals total columns in B).
    total_neurons = args.in_features * 4
    test_index_vec = torch.empty((total_neurons,), device='cuda', dtype=torch.int32)
    active_indices = sparse_index(args.index_size, total_neurons)[0]
    test_index_vec[:args.index_size] = active_indices
    if args.index_size < total_neurons:
        test_index_vec[args.index_size:] = 0  # Fill the rest with dummy values.

    test_bh_idx = generate_random_BH_index(args.batch_size, heads, selected_heads)
    test_index_size = args.index_size

    mixer_kwargs = (
        {"seqlen": seq_len}
        if process_group is not None and sequence_parallel
        else {}
    )
    if inference_params is not None:
        mixer_kwargs["inference_params"] = inference_params

    with torch.no_grad():
        # --- Prefill stage ---
        original_seq = torch.randn(batch_size, seq_len, in_features, device='cuda', dtype=torch.float16)

        output_sparse = sparse_block(original_seq, mixer_kwargs=mixer_kwargs)
        output_regular = regular_block(original_seq, mixer_kwargs=mixer_kwargs)

        # Advance the KV-cache offset so the next call is treated as decode.
        mixer_kwargs["inference_params"].seqlen_offset = seq_len

        # --- Decode stage (single token) ---
        input_x = torch.randn(batch_size, 1, in_features, device='cuda', dtype=torch.float16)

        out_decode_sparse = sparse_block(input_x, mixer_kwargs=mixer_kwargs)

        # Reset the offset so the regular block decodes at the same position.
        mixer_kwargs["inference_params"].seqlen_offset = seq_len

        out_decode_regular = regular_block(input_x, mixer_kwargs=mixer_kwargs)

        # --- CUDA Graph Capture for Decode Stage ---
        # Graph capture/replay requires static input and output buffers.
        input_x_static = input_x.clone()
        output_regular_static = torch.empty((batch_size, 1, in_features), device=device, dtype=dtype)

        # One eager warmup call before capturing the regular block's graph.
        _ = regular_block(input_x_static, mixer_kwargs=mixer_kwargs)
        torch.cuda.synchronize()
        graph_regular = torch.cuda.CUDAGraph()
        with torch.cuda.graph(graph_regular):
            res = regular_block(input_x_static, mixer_kwargs=mixer_kwargs)
            if isinstance(res, tuple):
                res = res[0]
            output_regular_static.copy_(res)

        # For the sparse block, run a dummy call to determine its output shape.
        # Also reset the inference parameter to ensure consistent behavior.
        mixer_kwargs["inference_params"].seqlen_offset = seq_len
        temp = sparse_block(input_x_static, mixer_kwargs=mixer_kwargs)
        if isinstance(temp, tuple):
            temp = temp[0]
        # Allocate a static buffer matching the dummy run's shape.
        output_sparse_static = torch.empty_like(temp)
        torch.cuda.synchronize()

        mixer_kwargs["inference_params"].seqlen_offset = seq_len
        graph_sparse = torch.cuda.CUDAGraph()
        with torch.cuda.graph(graph_sparse):
            res = sparse_block(input_x_static, mixer_kwargs=mixer_kwargs)
            if isinstance(res, tuple):
                res = res[0]
            output_sparse_static.copy_(res)

        # Warm up CUDA Graph replays before timing.
        for _ in range(5):
            graph_regular.replay()
            graph_sparse.replay()
        torch.cuda.synchronize()

        # --- Measure CUDA Graph replay latency (ms per replay) ---
        num_replays = 10

        start = time.time()
        for _ in range(num_replays):
            graph_regular.replay()
        torch.cuda.synchronize()
        regular_graph_time = (time.time() - start) * 1000 / num_replays

        start = time.time()
        for _ in range(num_replays):
            graph_sparse.replay()
        torch.cuda.synchronize()
        sparse_graph_time = (time.time() - start) * 1000 / num_replays
        speedup = regular_graph_time / sparse_graph_time

    return regular_graph_time, sparse_graph_time, speedup
162
+
163
if __name__ == "__main__":
    # Sweep batch size, sequence length, MLP sparsity level and attention
    # top-k, benchmarking sparse vs. regular decode blocks, and stream each
    # result row to a CSV file.
    args = arg_parser()
    device = _get_device(0)
    dtype = torch.float16
    gpu_name = get_gpu_name()

    # Parameter grids.
    batch_sizes = [1, 8, 16, 32]
    seq_lengths = [1024, 2048]
    # MLP activation fractions; converted to absolute neuron counts below.
    index_size_p = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
    total_neurons = args.in_features * 4

    # Calculate initial index_size values.
    index_sizes = [int(total_neurons * i) for i in index_size_p]

    # Round up to the nearest multiple of 128 if necessary (kernel alignment).
    index_sizes = [math.ceil(size / 128) * 128 if size % 128 != 0 else size for size in index_sizes]

    attn_topks = [0.3, 0.4, 0.5]

    # Calculate total number of simulations.
    total_runs = len(batch_sizes) * len(seq_lengths) * len(index_sizes) * len(attn_topks)
    output_file = f"results/simulations/{gpu_name}_select_block_{args.in_features}_inference_sim.csv"

    with open(output_file, mode='w', newline='') as csv_file:
        fieldnames = ["in_features", "batch_size", "seq_len", "index_size", "neuron_activation", "attn_topk",
                      "regular_graph_time_ms", "sparse_graph_time_ms", "speedup"]
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()

        # Iterate over all combinations with a tqdm progress bar.
        for batch_size in tqdm(batch_sizes, desc="Batch Sizes"):
            for seq_len in seq_lengths:
                for index_size in index_sizes:
                    for attn_topk in attn_topks:
                        reg_time, spa_time, speedup = run_simulation(args, batch_size, seq_len, index_size, attn_topk, device, dtype)
                        writer.writerow({
                            "in_features": args.in_features,
                            "batch_size": batch_size,
                            "seq_len": seq_len,
                            "index_size": index_size,
                            "neuron_activation": index_size / total_neurons,
                            "attn_topk": attn_topk,
                            "regular_graph_time_ms": reg_time,
                            "sparse_graph_time_ms": spa_time,
                            "speedup": speedup
                        })
                        # Flush after each row so partial results survive crashes.
                        csv_file.flush()
    print(f"Simulation complete. Results saved to {output_file}")
HybridTensor/models/__pycache__/create_sparse_model.cpython-310.pyc ADDED
Binary file (15.3 kB). View file
 
HybridTensor/models/__pycache__/create_sparse_model.cpython-39.pyc ADDED
Binary file (18.4 kB). View file
 
HybridTensor/models/__pycache__/helper.cpython-310.pyc ADDED
Binary file (4.52 kB). View file
 
HybridTensor/models/__pycache__/helper.cpython-39.pyc ADDED
Binary file (4.65 kB). View file
 
HybridTensor/models/__pycache__/llama.cpython-39.pyc ADDED
Binary file (2.51 kB). View file
 
HybridTensor/models/__pycache__/opt.cpython-310.pyc ADDED
Binary file (4.9 kB). View file
 
HybridTensor/models/__pycache__/opt.cpython-39.pyc ADDED
Binary file (5.19 kB). View file
 
HybridTensor/models/create_sparse_model.py ADDED
@@ -0,0 +1,854 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ from functools import partial
5
+
6
+ from einops import rearrange
7
+
8
+ from transformers import GPT2Config
9
+ from collections import namedtuple
10
+ from HybridTensor.modules.SelectiveMHA import SMHA, SelectMHA, ParallelSelectMHA, MHARouter, ParallelMHARouter
11
+ from HybridTensor.modules.SelectiveMLP import SelectiveMLP, ParallelSelectiveMLP, MLPRouter, ParallelMLPRouter
12
+ from HybridTensor.modules.SelectiveBlock import SelectBlock
13
+ # from HybridTensor.modules.SelectiveBlock_v1 import SelectBlock
14
+ import torch.nn.functional as F
15
+ from flash_attn.utils.distributed import (
16
+ all_gather,
17
+ all_gather_raw,
18
+ get_dim_for_local_rank,
19
+ sync_shared_params,
20
+ )
21
+
22
+ from collections.abc import Sequence
23
+ from flash_attn.modules.mha import MHA, ParallelMHA
24
+ from flash_attn.modules.mlp import FusedMLP, ParallelFusedMLP, GatedMlp, ParallelGatedMlp, Mlp, ParallelMLP
25
+ from flash_attn.ops.activations import sqrelu_fwd
26
+ from flash_attn.modules.block import Block
27
+
28
+ try:
29
+ from flash_attn.ops.triton.layer_norm import layer_norm_fn, RMSNorm
30
+ except ImportError:
31
+ layer_norm_fn, RMSNorm = None, None
32
+
33
+ from flash_attn.modules.embedding import GPT2Embeddings, ParallelGPT2Embeddings
34
+ from flash_attn.utils.distributed import sync_shared_params, all_gather_raw
35
+ from flash_attn.utils.pretrained import state_dict_from_pretrained
36
+ from flash_attn.utils.generation import GenerationMixin
37
+ from flash_attn.models.opt import remap_state_dict_hf_opt
38
+
39
+ try:
40
+ from flash_attn.ops.fused_dense import ColumnParallelLinear
41
+ except ImportError:
42
+ ColumnParallelLinear = None
43
+
44
+ try:
45
+ from flash_attn.ops.triton.mlp import FusedDenseSqreluDense
46
+ except ImportError:
47
+ FusedDenseSqreluDense = None
48
+
49
+ try:
50
+ from flash_attn.ops.triton.layer_norm import layer_norm_fn, RMSNorm
51
+ except ImportError:
52
+ layer_norm_fn, RMSNorm = None, None
53
+
54
+ from HybridTensor.models.helper import remap_state_dict_gpt2, shard_state_dict_tp
55
+
56
def create_mixer_cls(config, layer_idx=None, process_group=None, device=None, dtype=None):
    """Build a partially-applied attention (mixer) class from the config.

    Selects between serial and tensor-parallel, sparse and dense MHA
    variants, and binds all attention hyperparameters into a
    ``functools.partial`` that the block constructor invokes later.

    Args:
        config: Model config (GPT2-style attribute names).
        layer_idx: Layer index; required when ``scale_attn_by_inverse_layer_idx``.
        process_group: Distributed process group (None for single device).
        device: Factory device forwarded to the mixer.
        dtype: Factory dtype forwarded to the mixer.

    Returns:
        A ``functools.partial`` wrapping the chosen MHA class.
    """
    factory_kwargs = {"device": device, "dtype": dtype}
    head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
    # muP-style scaling: exponent 1 (i.e. 1/d) instead of 1/sqrt(d) when configured.
    attn_scale_power = 0.5 if not getattr(config, "mup_scale_qk_dot_by_d", False) else 1.0
    softmax_scale = 1.0 if not config.scale_attn_weights else (head_dim ** (-attn_scale_power))
    softmax_scale *= getattr(config, "mup_attn_multiplier", 1.0)
    if config.scale_attn_by_inverse_layer_idx:
        assert layer_idx is not None
        softmax_scale /= float(layer_idx + 1)
    dwconv = getattr(config, "attn_dwconv", False)
    if dwconv:
        assert process_group is None, "TensorParallel MHA does not support dwconv yet"
    qkv_proj_bias = getattr(config, "qkv_proj_bias", True)
    out_proj_bias = getattr(config, "out_proj_bias", True)
    rotary_emb_dim = int(getattr(config, "rotary_emb_fraction", 0.0) * head_dim)
    rotary_emb_base = getattr(config, "rotary_emb_base", 10000.0)
    rotary_emb_scale_base = getattr(config, "rotary_emb_scale_base", None)
    rotary_emb_interleaved = getattr(config, "rotary_emb_interleaved", False)
    use_alibi = getattr(config, "use_alibi", False)
    use_triton = getattr(config, "use_triton", True)  # toggle cuda or triton decode kernels
    window_size = getattr(config, "window_size", (-1, -1))
    use_flash_attn = getattr(config, "use_flash_attn", False)
    fused_bias_fc = getattr(config, "fused_bias_fc", False)
    if not fused_bias_fc:
        assert process_group is None, "TensorParallel MHA requires fused_bias_fc"

    mlp_sparse = getattr(config, "mlp_sparse", False)
    att_sparse = getattr(config, "att_sparse", False)
    num_heads = getattr(config, "num_attention_heads", None)
    n_head_kv = getattr(config, "n_head_kv", num_heads)

    # Sparse attention is disabled for grouped/multi-query attention
    # (num_heads != num KV heads).
    if num_heads != n_head_kv:
        att_sparse = False

    if process_group is None:
        mha_cls = SMHA  # SelectMHA if att_sparse else MHA
    else:
        mha_cls = ParallelSelectMHA if att_sparse else ParallelMHA

    serial_kwargs = (
        {"fused_bias_fc": fused_bias_fc, "dwconv": dwconv} if process_group is None else {}
    )
    parallel_kwargs = (
        {
            "process_group": process_group,
            "sequence_parallel": getattr(config, "sequence_parallel", False),
        }
        if process_group is not None
        else {}
    )
    num_heads_kv = getattr(config, "n_head_kv", None)
    mixer_cls = partial(
        mha_cls,
        num_heads=config.num_attention_heads,
        num_heads_kv=num_heads_kv,
        qkv_proj_bias=qkv_proj_bias,
        out_proj_bias=out_proj_bias,
        dropout=config.attn_pdrop,
        softmax_scale=softmax_scale,
        causal=True,
        layer_idx=layer_idx,
        rotary_emb_dim=rotary_emb_dim,
        rotary_emb_base=rotary_emb_base,
        rotary_emb_scale_base=rotary_emb_scale_base,
        rotary_emb_interleaved=rotary_emb_interleaved,
        use_alibi=use_alibi,
        window_size=window_size,
        use_flash_attn=use_flash_attn,
        **serial_kwargs,
        **parallel_kwargs,
        **factory_kwargs,
    )
    return mixer_cls
130
+
131
def create_mlp_cls_old(config, layer_idx=None, process_group=None, device=None, dtype=None):
    """Legacy MLP class factory (fused-MLP path only).

    Chooses between sparse (Selective) and dense (Fused) MLP variants, for
    serial or tensor-parallel execution, and returns a ``functools.partial``
    with all MLP hyperparameters bound.

    Args:
        config: Model config; ``fused_mlp`` must be enabled.
        layer_idx: Layer index; needed when ``mlp_checkpoint_lvl`` is a list.
        process_group: Distributed process group (None for single device).
        device: Factory device forwarded to the MLP.
        dtype: Factory dtype forwarded to the MLP.

    Returns:
        A ``functools.partial`` wrapping the chosen MLP class.

    Raises:
        AssertionError: If ``config.fused_mlp`` is not set or the activation
            function is unsupported.
        ImportError: If the fused_dense extension is unavailable.
    """
    factory_kwargs = {"device": device, "dtype": dtype}
    inner_dim = config.n_inner if config.n_inner is not None else 4 * config.hidden_size
    fused_mlp = getattr(config, "fused_mlp", False)
    if fused_mlp:
        assert config.activation_function in [
            "gelu_new",
            "gelu_fast",
            "gelu_approx",
            "gelu_pytorch_tanh",
            "relu",
            "sqrelu",
        ]
    # Only the fused-MLP path is implemented in this legacy factory.
    assert fused_mlp == True, "Not supported not fused mlp for now"

    mlp_sparse = getattr(config, "mlp_sparse", False)
    use_heuristic = getattr(config, "use_heuristic", True)

    mlp_checkpoint_lvl = getattr(config, "mlp_checkpoint_lvl", 0)
    # mlp_checkpoint_lvl could be a list, which contains the checkpoint_lvl for each layer
    if isinstance(mlp_checkpoint_lvl, Sequence):
        assert layer_idx is not None
        mlp_checkpoint_lvl = mlp_checkpoint_lvl[layer_idx]

    if fused_mlp:
        if FusedMLP is None:
            raise ImportError("fused_dense is not installed")

        # All GELU variants collapse to the fused approximate GELU kernel;
        # everything else (relu/sqrelu) runs as relu.
        if config.activation_function in ["gelu_new", "gelu_fast", "gelu_approx", "gelu_pytorch_tanh"]:
            activation = "gelu_approx"
        else:
            activation = "relu"  # config.activation_function

        if process_group is None:
            mlp_cls = SelectiveMLP if mlp_sparse else FusedMLP
        else:
            mlp_cls = ParallelSelectiveMLP if mlp_sparse else ParallelFusedMLP

        parallel_kwargs = (
            {
                "process_group": process_group,
                "sequence_parallel": getattr(config, "sequence_parallel", True),
            }
            if process_group is not None
            else {}
        )

        sparsity_kwargs = (
            {
                "use_heuristic": use_heuristic,
            }
            if mlp_sparse
            else {}
        )

        mlp_cls = partial(
            mlp_cls,
            hidden_features=inner_dim,
            activation=activation,
            checkpoint_lvl=mlp_checkpoint_lvl,
            # layer_idx=layer_idx,
            **parallel_kwargs,
            **factory_kwargs,
            **sparsity_kwargs,
        )

    else:
        raise RuntimeError("MLP type not supported")
    return mlp_cls
205
+
206
+ def create_mlp_cls(config, layer_idx=None, process_group=None, device=None, dtype=None):
207
+ """
208
+ Create an MLP class that supports both sparse MLPs (via fused mlp) and GatedMLPs.
209
+ If the activation function is one of "glu", "swiglu", or "geglu", then GatedMlp is used
210
+ (and mlp_sparse is ignored). Otherwise, fused_mlp is used to decide between sparse and
211
+ dense implementations.
212
+ """
213
+ from functools import partial
214
+ factory_kwargs = {"device": device, "dtype": dtype}
215
+ mlp_fc1_bias = getattr(config, "mlp_fc1_bias", True)
216
+ mlp_fc2_bias = getattr(config, "mlp_fc2_bias", True)
217
+
218
+
219
+ # Check for gated activations
220
+ if config.activation_function in ["glu", "swiglu", "geglu"]:
221
+ # For gated activations we do not support sparsity yet.
222
+ activation = (
223
+ F.sigmoid if config.activation_function == "glu"
224
+ else (F.silu if config.activation_function == "swiglu" else F.gelu)
225
+ )
226
+ mlp_cls = GatedMlp if process_group is None else ParallelGatedMlp
227
+ parallel_kwargs = (
228
+ {"process_group": process_group, "sequence_parallel": getattr(config, "sequence_parallel", True)}
229
+ if process_group is not None else {}
230
+ )
231
+ mlp_multiple_of = getattr(config, "mlp_multiple_of", 128)
232
+ mlp_cls = partial(
233
+ mlp_cls,
234
+ hidden_features=config.n_inner,
235
+ activation=activation,
236
+ bias1=mlp_fc1_bias,
237
+ bias2=mlp_fc2_bias,
238
+ multiple_of=mlp_multiple_of,
239
+ **parallel_kwargs,
240
+ **factory_kwargs,
241
+ )
242
+ return mlp_cls
243
+
244
+ # For non-gated activations:
245
+ fused_mlp = getattr(config, "fused_mlp", False)
246
+ fused_dense_sqrelu_dense = getattr(config, "fused_dense_sqrelu_dense", False)
247
+ if fused_dense_sqrelu_dense:
248
+ assert config.activation_function == "sqrelu", (
249
+ "fused_dense_sqrelu_dense only supports approximate activation_function sqrelu"
250
+ )
251
+ assert not (fused_dense_sqrelu_dense and fused_mlp)
252
+
253
+ if fused_mlp:
254
+ # Ensure valid activation function.
255
+ assert config.activation_function in [
256
+ "gelu_new", "gelu_fast", "gelu_approx", "gelu_pytorch_tanh", "relu", "sqrelu"
257
+ ]
258
+ # Support checkpoint level (possibly a list)
259
+ mlp_checkpoint_lvl = getattr(config, "mlp_checkpoint_lvl", 0)
260
+ if isinstance(mlp_checkpoint_lvl, (list, tuple)):
261
+ assert layer_idx is not None
262
+ mlp_checkpoint_lvl = mlp_checkpoint_lvl[layer_idx]
263
+ # Choose activation string.
264
+ if config.activation_function in ["gelu_new", "gelu_fast", "gelu_approx", "gelu_pytorch_tanh"]:
265
+ activation = "gelu_approx"
266
+ else:
267
+ activation = "relu"
268
+ # Determine inner dim.
269
+ inner_dim = config.n_inner if config.n_inner is not None else 4 * config.hidden_size
270
+ mlp_sparse = getattr(config, "mlp_sparse", False)
271
+ use_heuristic = getattr(config, "use_heuristic", True)
272
+ if process_group is None:
273
+ mlp_cls = SelectiveMLP if mlp_sparse else FusedMLP
274
+ else:
275
+ mlp_cls = ParallelSelectiveMLP if mlp_sparse else ParallelFusedMLP
276
+ parallel_kwargs = (
277
+ {"process_group": process_group, "sequence_parallel": getattr(config, "sequence_parallel", True)}
278
+ if process_group is not None else {}
279
+ )
280
+ sparsity_kwargs = {"use_heuristic": use_heuristic} if mlp_sparse else {}
281
+ mlp_cls = partial(
282
+ mlp_cls,
283
+ hidden_features=inner_dim,
284
+ activation=activation,
285
+ checkpoint_lvl=mlp_checkpoint_lvl,
286
+ bias1=mlp_fc1_bias,
287
+ bias2=mlp_fc2_bias,
288
+ **parallel_kwargs,
289
+ **factory_kwargs,
290
+ **sparsity_kwargs,
291
+ )
292
+ return mlp_cls
293
+
294
+ elif fused_dense_sqrelu_dense:
295
+ if process_group is not None:
296
+ assert fused_mlp, "Tensor Parallel is not implemented for FusedDenseSqreluDense"
297
+ assert FusedDenseSqreluDense is not None
298
+ mlp_checkpoint_lvl = getattr(config, "mlp_checkpoint_lvl", 0)
299
+ if isinstance(mlp_checkpoint_lvl, (list, tuple)):
300
+ assert layer_idx is not None
301
+ mlp_checkpoint_lvl = mlp_checkpoint_lvl[layer_idx]
302
+ mlp_cls = partial(
303
+ FusedDenseSqreluDense,
304
+ hidden_features=config.n_inner,
305
+ checkpoint_lvl=mlp_checkpoint_lvl,
306
+ **factory_kwargs,
307
+ )
308
+ return mlp_cls
309
+
310
+ else:
311
+ # Non-fused, non-sparse branch.
312
+ assert config.activation_function in [
313
+ "gelu", "gelu_new", "gelu_fast", "gelu_approx", "gelu_pytorch_tanh", "relu", "sqrelu"
314
+ ]
315
+ if config.activation_function == "relu":
316
+ activation = partial(F.relu, inplace=True)
317
+ elif config.activation_function == "sqrelu":
318
+ activation = sqrelu_fwd
319
+ else:
320
+ approximate = "tanh" if config.activation_function in [
321
+ "gelu_new", "gelu_fast", "gelu_approx", "gelu_pytorch_tanh"
322
+ ] else "none"
323
+ activation = partial(F.gelu, approximate=approximate)
324
+ mlp_sparse = getattr(config, "mlp_sparse", False)
325
+ mlp_cls = Mlp if process_group is None else ParallelMLP
326
+ parallel_kwargs = (
327
+ {"process_group": process_group, "sequence_parallel": getattr(config, "sequence_parallel", True)}
328
+ if process_group is not None else {}
329
+ )
330
+ mlp_cls = partial(
331
+ mlp_cls,
332
+ hidden_features=config.n_inner,
333
+ activation=activation,
334
+ bias1=mlp_fc1_bias,
335
+ bias2=mlp_fc2_bias,
336
+ **parallel_kwargs,
337
+ **factory_kwargs,
338
+ )
339
+ return mlp_cls
340
+
341
+ def create_mlp_router_cls(config, sp_config = None, layer_idx=None, process_group=None, device=None, dtype=None):
342
+ factory_kwargs = {"device": device, "dtype": dtype}
343
+ num_neurons = config.n_inner if config.n_inner is not None else 4 * config.hidden_size
344
+
345
+ # this can be made different per layer by adding mlp_low_rank_dim_{layer_idx} in the sp_config
346
+ low_rank_dim = getattr(sp_config, "mlp_low_rank_dim", 1024)
347
+
348
+ # per layer activation threshold
349
+ act_th = getattr(config, "mlp_act_th", 0.5)
350
+
351
+ if process_group is None:
352
+ mlp_router_cls = MLPRouter
353
+ else:
354
+ mlp_router_cls = ParallelMLPRouter
355
+
356
+ parallel_kwargs = (
357
+ {
358
+ "process_group": process_group,
359
+ "sequence_parallel": getattr(config, "sequence_parallel", True),
360
+ }
361
+ if process_group is not None
362
+ else {}
363
+ )
364
+
365
+ mlp_router_cls = partial(mlp_router_cls,
366
+ low_rank_dim = low_rank_dim,
367
+ out_dim = num_neurons,
368
+ act_th = act_th,
369
+ **parallel_kwargs,
370
+ **factory_kwargs)
371
+
372
+ return mlp_router_cls
373
+
374
+ def create_mha_router_cls(config, sp_config = None, layer_idx=None, process_group=None, device=None, dtype=None):
375
+ factory_kwargs = {"device": device, "dtype": dtype}
376
+ num_heads = config.num_attention_heads
377
+ n_head_kv = getattr(config, "n_head_kv", num_heads)
378
+ if num_heads != n_head_kv:
379
+ out_dim = n_head_kv
380
+ else:
381
+ out_dim = num_heads
382
+
383
+ low_rank_dim = getattr(sp_config, "attn_low_rank_dim", 128) # optional, default to 128
384
+
385
+ # per layer activation topk, to make this different per layer, add a different attn_topk_{layer_idx} in the sp_config
386
+ attn_topk = getattr(sp_config, "attn_topk", 0.5)
387
+ if process_group is None:
388
+ mha_router_cls = MHARouter
389
+ else:
390
+ mha_router_cls = ParallelMHARouter
391
+
392
+ parallel_kwargs = (
393
+ {
394
+ "process_group": process_group,
395
+ "sequence_parallel": getattr(config, "sequence_parallel", True),
396
+ }
397
+ if process_group is not None
398
+ else {}
399
+ )
400
+
401
+
402
+ mha_router_cls = partial(mha_router_cls,
403
+ low_rank_dim = low_rank_dim,
404
+ out_dim = out_dim,
405
+ top_k = attn_topk,
406
+ **parallel_kwargs,
407
+ **factory_kwargs)
408
+
409
+ return mha_router_cls
410
+
411
+ def create_block(config, sp_config, layer_idx=None, process_group=None, device=None, dtype=None):
412
+ factory_kwargs = {"device": device, "dtype": dtype}
413
+ sequence_parallel = getattr(config, "sequence_parallel", True)
414
+ mixer_cls = create_mixer_cls(config, layer_idx, process_group=process_group, **factory_kwargs)
415
+ mlp_cls = create_mlp_cls(config, layer_idx, process_group=process_group, **factory_kwargs)
416
+
417
+ use_rms_norm = getattr(config, "rms_norm", False)
418
+ norm_cls = partial(
419
+ nn.LayerNorm if not use_rms_norm else RMSNorm,
420
+ eps=config.layer_norm_epsilon,
421
+ **factory_kwargs,
422
+ )
423
+
424
+ # TD [2022-07-30]: Force residual in fp32, seems to make fp16 training more stable
425
+ residual_in_fp32 = getattr(config, "residual_in_fp32", False)
426
+ resid_dropout1 = config.resid_pdrop if layer_idx is None or layer_idx > 0 else config.embd_pdrop
427
+ prenorm = getattr(config, "prenorm", True)
428
+ parallel_block = getattr(config, "parallel_block", False)
429
+ mlp_sparse = getattr(config, "mlp_sparse", False)
430
+ att_sparse = getattr(config, "att_sparse", False)
431
+ block_sparse = mlp_sparse or att_sparse
432
+
433
+ if not parallel_block:
434
+ if block_sparse:
435
+ mha_router_cls = create_mha_router_cls(config, sp_config, layer_idx, process_group=process_group, **factory_kwargs) if att_sparse else None
436
+ mlp_router_cls = create_mlp_router_cls(config, sp_config, layer_idx, process_group=process_group, **factory_kwargs) if mlp_sparse else None
437
+
438
+ block = SelectBlock(
439
+ config.hidden_size,
440
+ mixer_cls,
441
+ mlp_cls,
442
+ mlp_router = mlp_router_cls,
443
+ mha_router = mha_router_cls,
444
+ norm_cls=norm_cls,
445
+ prenorm=prenorm,
446
+ resid_dropout1=resid_dropout1,
447
+ resid_dropout2=config.resid_pdrop,
448
+ fused_dropout_add_ln=getattr(config, "fused_dropout_add_ln", False),
449
+ residual_in_fp32=residual_in_fp32,
450
+ sequence_parallel=sequence_parallel and process_group is not None,
451
+ mark_shared_params=process_group is not None,
452
+ )
453
+ else:
454
+ block = Block(
455
+ config.hidden_size,
456
+ mixer_cls,
457
+ mlp_cls,
458
+ norm_cls=norm_cls,
459
+ prenorm=prenorm,
460
+ resid_dropout1=resid_dropout1,
461
+ resid_dropout2=config.resid_pdrop,
462
+ fused_dropout_add_ln=getattr(config, "fused_dropout_add_ln", False),
463
+ residual_in_fp32=residual_in_fp32,
464
+ sequence_parallel=sequence_parallel and process_group is not None,
465
+ mark_shared_params=process_group is not None,
466
+ )
467
+
468
+ else:
469
+ # not implemented
470
+ raise RuntimeError("ParallelBlock not implemented")
471
+ block.layer_idx = layer_idx
472
+ return block
473
+
474
+
475
+ class GPTPreTrainedModel(nn.Module):
476
+ """An abstract class to handle weights initialization and
477
+ a simple interface for dowloading and loading pretrained models.
478
+ """
479
+
480
+ def __init__(self, config, *inputs, **kwargs):
481
+ super().__init__()
482
+ if not isinstance(config, GPT2Config):
483
+ raise ValueError(
484
+ "Parameter config in `{}(config)` should be an instance of class `GPT2Config`. "
485
+ "To create a model from a Google pretrained model use "
486
+ "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
487
+ self.__class__.__name__, self.__class__.__name__
488
+ )
489
+ )
490
+ self.config = config
491
+
492
+ @classmethod
493
+ def from_pretrained(
494
+ cls,
495
+ model_name,
496
+ config,
497
+ sp_config,
498
+ *args,
499
+ strict=True,
500
+ device=None,
501
+ dtype=None,
502
+ world_size=1,
503
+ rank=0,
504
+ **kwargs,
505
+ ):
506
+ """
507
+ Instantiate a GPTPreTrainedModel from a pre-trained model file or a pytorch state dict.
508
+ Download and cache the pre-trained model file if needed.
509
+ """
510
+ # Instantiate model.
511
+ model = cls(config, sp_config, *args, device=device, dtype=dtype, **kwargs)
512
+ # Load state_dict in cpu because we already initialized the model in GPU, and we don't
513
+ # want extra stuff taking up more GPU memory
514
+ state_dict = state_dict_from_pretrained(model_name, device="cpu", dtype=dtype)
515
+ if model_name.startswith("gpt2"):
516
+ state_dict = remap_state_dict_gpt2(state_dict, config)
517
+ elif model_name.startswith("facebook/opt"):
518
+ state_dict = remap_state_dict_hf_opt(state_dict, config)
519
+ else:
520
+ raise NotImplementedError(f"Model {model_name} not supported")
521
+ if world_size > 1:
522
+ state_dict = shard_state_dict_tp(state_dict, config, world_size, rank)
523
+ load_return = model.load_state_dict(state_dict, strict=strict)
524
+ # logger.info(load_return)
525
+ return model
526
+
527
+
528
+ # https://github.com/huggingface/transformers/blob/c28d04e9e252a1a099944e325685f14d242ecdcd/src/transformers/models/gpt2/modeling_gpt2.py#L454
529
+ def _init_weights(
530
+ module, n_layer, initializer_range=0.02, rescale_prenorm_residual=True
531
+ ):
532
+ if isinstance(module, nn.Linear):
533
+ nn.init.normal_(module.weight, std=initializer_range)
534
+ if module.bias is not None:
535
+ nn.init.zeros_(module.bias)
536
+ elif isinstance(module, nn.Embedding):
537
+ nn.init.normal_(module.weight, std=initializer_range)
538
+
539
+ if rescale_prenorm_residual:
540
+ # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
541
+ # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
542
+ # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
543
+ # > -- GPT-2 :: https://openai.com/blog/better-language-models/
544
+ #
545
+ # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
546
+ for name, p in module.named_parameters():
547
+ if name in ["out_proj.weight", "fc2.weight"]:
548
+ # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
549
+ nn.init.normal_(
550
+ p, mean=0.0, std=initializer_range / math.sqrt(2 * n_layer)
551
+ )
552
+
553
+
554
+ class GPTModel(GPTPreTrainedModel):
555
+ def __init__(self, config: GPT2Config, sp_config=None, process_group=None, device=None, dtype=None):
556
+ super().__init__(config)
557
+ factory_kwargs = {"device": device, "dtype": dtype}
558
+ self.process_group = process_group
559
+ self.sequence_parallel = getattr(config, "sequence_parallel", True)
560
+ assert config.activation_function in [
561
+ "gelu",
562
+ "gelu_new",
563
+ "gelu_fast",
564
+ "gelu_approx",
565
+ "relu",
566
+ "sqrelu",
567
+ "glu",
568
+ "swiglu",
569
+ "geglu",
570
+ ]
571
+ pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
572
+ vocab_size = (
573
+ math.ceil(config.vocab_size / pad_vocab_size_multiple)
574
+ * pad_vocab_size_multiple
575
+ )
576
+ # TD [2022-07-30]: Force residual in fp32, seems to make fp16 training more stable
577
+ self.residual_in_fp32 = getattr(config, "residual_in_fp32", False)
578
+ # These 2 options are for OPT-350m
579
+ self.prenorm = getattr(config, "prenorm", True)
580
+ use_rms_norm = getattr(config, "rms_norm", False)
581
+ word_embed_proj_dim = getattr(config, "word_embed_proj_dim", None)
582
+
583
+ if process_group is None:
584
+ self.embeddings = GPT2Embeddings(
585
+ config.hidden_size,
586
+ vocab_size,
587
+ config.max_position_embeddings,
588
+ word_embed_proj_dim=word_embed_proj_dim,
589
+ **factory_kwargs,
590
+ )
591
+ else:
592
+ self.embeddings = ParallelGPT2Embeddings(
593
+ config.hidden_size,
594
+ vocab_size,
595
+ config.max_position_embeddings,
596
+ process_group=process_group,
597
+ sequence_parallel=self.sequence_parallel,
598
+ **factory_kwargs,
599
+ )
600
+
601
+
602
+ # We change the order of dropout, residual and layer norm:
603
+ # Instead of LN -> Attn / MLP -> Dropout -> Add, we do:
604
+ # Dropout -> Add -> LN -> Attn / MLP, returning both the residual branch (output of Add) and
605
+ # the main branch (output of MLP). The model definition is unchanged, but the mapping of the
606
+ # nn.Dropout probabilities are changed.
607
+ # This is for performance reason: we can fuse dropout + add + layer_norm.
608
+ self.layers = nn.ModuleList(
609
+ [
610
+ create_block(
611
+ config, sp_config, layer_idx=i, process_group=process_group, **factory_kwargs
612
+ )
613
+ for i in range(config.num_hidden_layers)
614
+ ]
615
+ )
616
+
617
+
618
+ self.fused_dropout_add_ln = getattr(config, "fused_dropout_add_ln", False)
619
+ if self.fused_dropout_add_ln:
620
+ if layer_norm_fn is None:
621
+ raise ImportError("Triton is not installed")
622
+ if self.prenorm:
623
+ self.drop_f = nn.Dropout(config.resid_pdrop)
624
+ norm_cls = nn.LayerNorm if not use_rms_norm else RMSNorm
625
+ # self.ln_f = nn.LayerNorm(
626
+ # config.hidden_size, eps=config.layer_norm_epsilon, **factory_kwargs
627
+ # )
628
+ self.ln_f = norm_cls(
629
+ config.hidden_size, eps=config.layer_norm_epsilon, **factory_kwargs
630
+ )
631
+
632
+
633
+ if process_group is not None:
634
+ for p in self.ln_f.parameters():
635
+ # Mark the norm parameters as "shared_params" so that we sync their values at init.
636
+ p._shared_params = True
637
+ # Mark the norm params as "sequence_parallel" so we run all-reduce on their grads.
638
+ if self.sequence_parallel:
639
+ p._sequence_parallel = True
640
+
641
+ self.apply(
642
+ partial(
643
+ _init_weights,
644
+ n_layer=config.num_hidden_layers,
645
+ initializer_range=config.initializer_range,
646
+ )
647
+ )
648
+ self.tie_weights()
649
+
650
+ self.sparse = False
651
+ if config.mlp_sparse or config.att_sparse:
652
+ self.sparse = True
653
+
654
+ def tie_weights(self):
655
+ if self.process_group is not None:
656
+ sync_shared_params(self, self.process_group)
657
+
658
+ def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
659
+ return {
660
+ i: layer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs)
661
+ for i, layer in enumerate(self.layers)
662
+ }
663
+
664
+ def forward(self, input_ids, position_ids=None, inference_params=None):
665
+ # If using Tensor Parallel with sequence parallel, we combine the batch and the seqlen
666
+ # dimensions so that we can split on it easily, in case of small batch size.
667
+ # Only the attention layers need to know the seqlen.
668
+ embedding_kwargs = (
669
+ {"combine_batch_seqlen_dim": True}
670
+ if self.process_group is not None and self.sequence_parallel
671
+ else {}
672
+ )
673
+ hidden_states = self.embeddings(
674
+ input_ids, position_ids=position_ids, **embedding_kwargs
675
+ )
676
+ residual = None
677
+ mixer_kwargs = (
678
+ {"seqlen": input_ids.shape[1]}
679
+ if self.process_group is not None and self.sequence_parallel
680
+ else {}
681
+ )
682
+ if inference_params is not None:
683
+ mixer_kwargs["inference_params"] = inference_params
684
+ else:
685
+ mixer_kwargs["inference_params"] = None
686
+
687
+ # else:
688
+ for layer in self.layers:
689
+ if self.prenorm:
690
+ hidden_states, residual = layer(
691
+ hidden_states,
692
+ residual,
693
+ mixer_kwargs=mixer_kwargs,
694
+ )
695
+ else:
696
+ hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
697
+
698
+ if self.prenorm:
699
+ if not self.fused_dropout_add_ln:
700
+ dropped = self.drop_f(hidden_states)
701
+ residual = (dropped + residual) if residual is not None else dropped
702
+ hidden_states = self.ln_f(residual.to(dtype=self.ln_f.weight.dtype))
703
+ else:
704
+ # Set prenorm=False here since we don't need the residual
705
+ if hidden_states.shape != residual.shape:
706
+ hidden_states = hidden_states.view(residual.shape)
707
+
708
+ hidden_states = layer_norm_fn(
709
+ hidden_states,
710
+ self.ln_f.weight,
711
+ self.ln_f.bias,
712
+ residual=residual,
713
+ x1=None,
714
+ eps=self.ln_f.eps,
715
+ dropout_p=self.drop_f.p if self.training else 0.0,
716
+ prenorm=False,
717
+ is_rms_norm=isinstance(self.ln_f, RMSNorm)
718
+ )
719
+ return hidden_states
720
+
721
+
722
+ class GPTLMHeadModel(GPTPreTrainedModel, GenerationMixin):
723
+ def __init__(self, config: GPT2Config, sp_config = None, process_group=None, device=None, dtype=None):
724
+ factory_kwargs = {"device": device, "dtype": dtype}
725
+ super().__init__(config)
726
+ self.process_group = process_group
727
+
728
+ self.transformer = GPTModel(
729
+ config, sp_config, process_group=process_group, **factory_kwargs
730
+ )
731
+ self.tie_word_embeddings = getattr(config, "tie_word_embeddings", True)
732
+ pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
733
+ vocab_size = (
734
+ math.ceil(config.vocab_size / pad_vocab_size_multiple)
735
+ * pad_vocab_size_multiple
736
+ )
737
+ # This option is for OPT-350m
738
+ word_embed_proj_dim = getattr(config, "word_embed_proj_dim", None)
739
+ embed_dim = (
740
+ config.n_embd if word_embed_proj_dim is None else word_embed_proj_dim
741
+ )
742
+ if word_embed_proj_dim is not None:
743
+ self.project_out = nn.Linear(
744
+ config.n_embd, embed_dim, bias=False, **factory_kwargs
745
+ )
746
+ else:
747
+ self.project_out = None
748
+ mup_width_scale = getattr(config, "mup_width_scale", 1.0)
749
+ mup_output_multiplier = getattr(config, "mup_output_multiplier", 1.0)
750
+ self.output_scale = mup_output_multiplier * mup_width_scale
751
+
752
+ if process_group is None:
753
+ self.lm_head = nn.Linear(
754
+ embed_dim, vocab_size, bias=False, **factory_kwargs
755
+ )
756
+ else:
757
+ if ColumnParallelLinear is None:
758
+ raise ImportError("fused_dense_lib is not installed")
759
+ self.lm_head = ColumnParallelLinear(
760
+ embed_dim,
761
+ vocab_size,
762
+ process_group,
763
+ bias=False,
764
+ sequence_parallel=getattr(config, "sequence_parallel", True),
765
+ **factory_kwargs,
766
+ )
767
+
768
+ self.norm_head = getattr(config, "norm_head", False)
769
+ # Initialize weights and apply final processing
770
+ self.apply(
771
+ partial(
772
+ _init_weights,
773
+ n_layer=config.num_hidden_layers,
774
+ initializer_range=config.initializer_range,
775
+ )
776
+ )
777
+ self.tie_weights()
778
+
779
+ def tie_weights(self):
780
+ if self.tie_word_embeddings:
781
+ self.lm_head.weight = self.transformer.embeddings.word_embeddings.weight # llama does not use tied weights
782
+ if self.process_group is not None:
783
+ sync_shared_params(self, self.process_group)
784
+
785
+ def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
786
+ return self.transformer.allocate_inference_cache(
787
+ batch_size, max_seqlen, dtype=dtype, **kwargs
788
+ )
789
+
790
+ def forward(self, input_ids, position_ids=None, inference_params=None, num_last_tokens=0):
791
+ """
792
+ input_ids: (batch, seqlen) int tensor
793
+ inference_params: for generation. Adapted from Megatron-LM (and Apex)
794
+ https://github.com/NVIDIA/apex/blob/3ff1a10f72ec07067c4e44759442329804ac5162/apex/transformer/testing/standalone_transformer_lm.py#L470
795
+ num_last_tokens: if > 0, only return the logits for the last n tokens
796
+ """
797
+ assert (
798
+ input_ids.ndim == 2
799
+ ), f"Expected `input_ids` to have shape [b, slen], but got shape {input_ids.shape}"
800
+ b, slen = input_ids.shape
801
+ hidden_states = self.transformer(
802
+ input_ids, position_ids=position_ids, inference_params=inference_params
803
+ )
804
+ if inference_params is not None:
805
+ assert hidden_states.ndim == 3, "sequence_parallel is not supported in generation mode"
806
+ if num_last_tokens > 0:
807
+ hidden_states = hidden_states[:, -num_last_tokens:]
808
+ if self.project_out is not None:
809
+ hidden_states = self.project_out(hidden_states)
810
+ if self.output_scale != 1.0:
811
+ hidden_states = hidden_states * self.output_scale
812
+ if not self.norm_head:
813
+ lm_logits = self.lm_head(hidden_states)
814
+ else:
815
+ lm_head_weight = F.normalize(self.lm_head.weight)
816
+ if isinstance(self.lm_head, ColumnParallelLinear) and self.lm_head.sequence_parallel:
817
+ hidden_states = all_gather(hidden_states, self.lm_head.process_group)
818
+ lm_logits = F.linear(hidden_states, lm_head_weight, bias=self.lm_head.bias)
819
+ # During inference, we want the full logit for sampling
820
+ if isinstance(self.lm_head, ColumnParallelLinear) and inference_params is not None:
821
+ lm_logits, _ = all_gather_raw(lm_logits, self.lm_head.process_group)
822
+ lm_logits = rearrange(lm_logits, "(n b) ... d -> b ... (n d)", b=b)
823
+ CausalLMOutput = namedtuple("CausalLMOutput", ["logits"])
824
+ return CausalLMOutput(logits=lm_logits)
825
+
826
+ def load_state_dict(self, state_dict, strict=True):
827
+ # Remapping from our checkpoints that used a different ordering of layers in the block
828
+ # Previous: Attn / MLP -> Dropout -> Add -> LN
829
+ # Current: Dropout -> Add -> LN -> Attn / MLP
830
+ if "transformer.ln_0.weight" in state_dict:
831
+ n_layers = len(self.transformer.layers)
832
+ ln_weight = state_dict.pop(
833
+ f"transformer.layers.{n_layers - 1}.norm2.weight"
834
+ )
835
+ ln_bias = state_dict.pop(f"transformer.layers.{n_layers - 1}.norm2.bias")
836
+ state_dict["transformer.ln_f.weight"] = ln_weight
837
+ state_dict["transformer.ln_f.bias"] = ln_bias
838
+ for l in reversed(range(n_layers)):
839
+ ln_weight = state_dict.pop(f"transformer.layers.{l}.norm1.weight")
840
+ ln_bias = state_dict.pop(f"transformer.layers.{l}.norm1.bias")
841
+ state_dict[f"transformer.layers.{l}.norm2.weight"] = ln_weight
842
+ state_dict[f"transformer.layers.{l}.norm2.bias"] = ln_bias
843
+ if l > 0:
844
+ ln_weight = state_dict.pop(
845
+ f"transformer.layers.{l - 1}.norm2.weight"
846
+ )
847
+ ln_bias = state_dict.pop(f"transformer.layers.{l - 1}.norm2.bias")
848
+ state_dict[f"transformer.layers.{l}.norm1.weight"] = ln_weight
849
+ state_dict[f"transformer.layers.{l}.norm1.bias"] = ln_bias
850
+ ln_weight = state_dict.pop("transformer.ln_0.weight")
851
+ ln_bias = state_dict.pop("transformer.ln_0.bias")
852
+ state_dict[f"transformer.layers.0.norm1.weight"] = ln_weight
853
+ state_dict[f"transformer.layers.0.norm1.bias"] = ln_bias
854
+ return super().load_state_dict(state_dict, strict=strict)
HybridTensor/models/helper.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import re
3
+ from collections import OrderedDict
4
+
5
+ from einops import rearrange
6
+
7
+
8
+ def remap_state_dict_gpt2(state_dict, config):
9
+ # Word embedding and position embedding
10
+ def key_mapping_pos_emb(key):
11
+ return re.sub(r"^wpe.", "transformer.embeddings.position_embeddings.", key)
12
+
13
+ state_dict = OrderedDict((key_mapping_pos_emb(k), v) for k, v in state_dict.items())
14
+ word_embeddings = state_dict.pop("wte.weight")
15
+ # It's possible that vocab_size is padded to be a multiple of 8, for example.
16
+ pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
17
+ vocab_size = (
18
+ math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
19
+ )
20
+ state_dict["transformer.embeddings.word_embeddings.weight"] = F.pad(
21
+ word_embeddings, (0, 0, 0, vocab_size - word_embeddings.shape[0])
22
+ )
23
+ state_dict["lm_head.weight"] = state_dict[
24
+ "transformer.embeddings.word_embeddings.weight"
25
+ ]
26
+
27
+ # LayerNorm
28
+ def key_mapping_ln(key):
29
+ key = re.sub(r"^ln_f.(weight|bias)", r"transformer.ln_f.\1", key)
30
+ key = re.sub(
31
+ r"^h.(\d+).ln_(1|2).(weight|bias)", r"transformer.layers.\1.norm\2.\3", key
32
+ )
33
+ return key
34
+
35
+ state_dict = OrderedDict((key_mapping_ln(k), v) for k, v in state_dict.items())
36
+
37
+ # MLP
38
+ for d in range(config.num_hidden_layers):
39
+ W1 = state_dict.pop(f"h.{d}.mlp.c_fc.weight")
40
+ state_dict[f"transformer.layers.{d}.mlp.fc1.weight"] = W1.t()
41
+ W2 = state_dict.pop(f"h.{d}.mlp.c_proj.weight")
42
+ state_dict[f"transformer.layers.{d}.mlp.fc2.weight"] = W2.t()
43
+
44
+ def key_mapping_mlp(key):
45
+ key = re.sub(
46
+ r"^h.(\d+).mlp.c_fc.bias", r"transformer.layers.\1.mlp.fc1.bias", key
47
+ )
48
+ key = re.sub(
49
+ r"^h.(\d+).mlp.c_proj.bias", r"transformer.layers.\1.mlp.fc2.bias", key
50
+ )
51
+ return key
52
+
53
+ state_dict = OrderedDict((key_mapping_mlp(k), v) for k, v in state_dict.items())
54
+
55
+ # Attention
56
+ for d in range(config.num_hidden_layers):
57
+ state_dict.pop(f"h.{d}.attn.bias") # We don't store this bias
58
+ Wqkv = state_dict.pop(f"h.{d}.attn.c_attn.weight")
59
+ state_dict[f"transformer.layers.{d}.mixer.Wqkv.weight"] = Wqkv.t()
60
+ Wout = state_dict.pop(f"h.{d}.attn.c_proj.weight")
61
+ state_dict[f"transformer.layers.{d}.mixer.out_proj.weight"] = Wout.t()
62
+
63
+ def key_mapping_attn(key):
64
+ key = re.sub(
65
+ r"^h.(\d+).attn.c_attn.bias", r"transformer.layers.\1.mixer.Wqkv.bias", key
66
+ )
67
+ key = re.sub(
68
+ r"^h.(\d+).attn.c_proj.bias",
69
+ r"transformer.layers.\1.mixer.out_proj.bias",
70
+ key,
71
+ )
72
+ return key
73
+
74
+ state_dict = OrderedDict((key_mapping_attn(k), v) for k, v in state_dict.items())
75
+
76
+ return state_dict
77
+
78
+
79
+ def shard_state_dict_tp(state_dict, config, world_size, rank):
80
+ """Convert the state_dict of a standard GPT model to the state_dict of a GPT model
81
+ with tensor parallel.
82
+ """
83
+ pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
84
+ vocab_size = (
85
+ math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
86
+ )
87
+ assert vocab_size % world_size == 0
88
+ assert config.hidden_size % world_size == 0
89
+ inner_dim = config.n_inner if config.n_inner is not None else 4 * config.hidden_size
90
+ assert inner_dim % world_size == 0
91
+
92
+ def shard_first_dim(state_dict, key):
93
+ x = state_dict[key]
94
+ dim = x.shape[0] // world_size
95
+ state_dict[key] = x[rank * dim : (rank + 1) * dim]
96
+
97
+ def shard_last_dim(state_dict, key):
98
+ x = state_dict[key]
99
+ dim = x.shape[-1] // world_size
100
+ state_dict[key] = x[..., rank * dim : (rank + 1) * dim]
101
+
102
+ def shard_qkv_headdim(state_dict, key):
103
+ x = rearrange(state_dict[key], "(three d) ... -> three d ...", three=3)
104
+ dim = x.shape[1] // world_size
105
+ state_dict[key] = rearrange(
106
+ x[:, rank * dim : (rank + 1) * dim], "three d ... -> (three d) ..."
107
+ )
108
+
109
+ shard_first_dim(state_dict, "transformer.embeddings.word_embeddings.weight")
110
+ if "lm_head.weight" in state_dict:
111
+ shard_first_dim(state_dict, "lm_head.weight")
112
+ if "transformer.embeddings.position_embeddings.weight" in state_dict:
113
+ shard_last_dim(state_dict, "transformer.embeddings.position_embeddings.weight")
114
+ for i in range(config.num_hidden_layers):
115
+ shard_qkv_headdim(state_dict, f"transformer.layers.{i}.mixer.Wqkv.weight")
116
+ shard_qkv_headdim(state_dict, f"transformer.layers.{i}.mixer.Wqkv.bias")
117
+ shard_last_dim(state_dict, f"transformer.layers.{i}.mixer.out_proj.weight")
118
+ if rank != 0:
119
+ state_dict.pop(f"transformer.layers.{i}.mixer.out_proj.bias")
120
+ shard_first_dim(state_dict, f"transformer.layers.{i}.mlp.fc1.weight")
121
+ shard_first_dim(state_dict, f"transformer.layers.{i}.mlp.fc1.bias")
122
+ shard_last_dim(state_dict, f"transformer.layers.{i}.mlp.fc2.weight")
123
+ if rank != 0:
124
+ state_dict.pop(f"transformer.layers.{i}.mlp.fc2.bias")
125
+ return state_dict
HybridTensor/models/llama.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import LlamaConfig, LlamaTokenizer
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from HybridTensor.models.create_sparse_model import GPTLMHeadModel
6
+ from HybridTensor.modules.SelectiveRouters import create_mlp_router_state_dict, create_attn_router_state_dict
7
+
8
+ # from flash_attn.models.gpt import GPTLMHeadModel
9
+ from transformers import AutoConfig, AutoTokenizer
10
+ from flash_attn.utils.pretrained import state_dict_from_pretrained
11
+
12
+ from flash_attn.models.llama import (
13
+ config_from_checkpoint,
14
+ inv_remap_state_dict_hf_llama,
15
+ llama_config_to_gpt2_config,
16
+ remap_state_dict_hf_llama,
17
+ remap_state_dict_meta_llama,
18
+ state_dicts_from_checkpoint,
19
+ )
20
+
21
+ class SparseConfig:
22
+ def __init__(self):
23
+ self.mlp_low_rank_dim = 1024
24
+ self.attn_low_rank_dim = 128
25
+ self.mlp_act_th = 0.5
26
+ self.attn_topk = 0.3
27
+
28
+ def build_dense_llama(model_name: str, device = None, dtype=torch.float16, process_group = None, world_size = None, rank = None, **kwargs):
29
+ config = llama_config_to_gpt2_config(AutoConfig.from_pretrained(model_name, trust_remote_code=True))
30
+ config.use_flash_attn = True
31
+ config.fused_bias_fc = True
32
+ config.fused_mlp = False # We don't have fused GatedMLP yet
33
+ config.fused_dropout_add_ln = True
34
+ config.residual_in_fp32 = True
35
+ config.prenorm = True
36
+
37
+ state_dict = state_dict_from_pretrained(model_name, device='cpu', dtype=dtype)
38
+ state_dict = remap_state_dict_hf_llama(state_dict, config)
39
+
40
+ model = GPTLMHeadModel(config, device=device, dtype=dtype)
41
+ model.load_state_dict(state_dict, strict=True)
42
+ model.eval()
43
+
44
+ return model
45
+
46
+ def build_sparse_llama(args, model_name: str, attn_ckpt_dir: str, device = None, dtype=torch.float16, process_group = None, world_size = None, rank = None, **kwargs):
47
+ config = llama_config_to_gpt2_config(AutoConfig.from_pretrained(model_name, trust_remote_code=True))
48
+ config.use_flash_attn = True
49
+ config.fused_bias_fc = True
50
+ config.fused_mlp = False # We don't have fused GatedMLP yet
51
+ config.fused_dropout_add_ln = True
52
+ config.residual_in_fp32 = True
53
+ config.prenorm = True
54
+
55
+ spconfig = SparseConfig()
56
+ spconfig.attn_topk = args.attn_topk
57
+ config.mlp_sparse = False
58
+ config.att_sparse = True
59
+
60
+ state_dict = state_dict_from_pretrained(model_name, device='cpu', dtype=dtype)
61
+ state_dict = remap_state_dict_hf_llama(state_dict, config)
62
+
63
+ model = GPTLMHeadModel(config, sp_config= spconfig, device=device, dtype=dtype)
64
+
65
+ if attn_ckpt_dir is not None:
66
+ attn_router_state_dict = create_attn_router_state_dict(attn_ckpt_dir)
67
+ merged_state_dict = {**state_dict, **attn_router_state_dict}
68
+
69
+ # TODO: Add code for tensor parallel state dict sharding
70
+
71
+ model.load_state_dict(merged_state_dict, strict=True)
72
+ model.eval()
73
+
74
+ return model
HybridTensor/models/opt.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from HybridTensor.utils.activations import OPT_MODELS
2
+ import torch
3
+ import math
4
+ from einops import rearrange
5
+
6
+ from flash_attn.utils.pretrained import state_dict_from_pretrained
7
+ from flash_attn.models.opt import remap_state_dict_hf_opt
8
+ from HybridTensor.modules.SelectiveRouters import create_mlp_router_state_dict, create_attn_router_state_dict
9
+ from HybridTensor.models.create_sparse_model import GPTLMHeadModel as GPTLMHeadModelSparse
10
+ from flash_attn.models.gpt import GPTLMHeadModel
11
+
12
+ from transformers.models.opt import OPTConfig
13
+ from flash_attn.models.opt import opt_config_to_gpt2_config
14
+
15
class SparseConfig:
    """Hyper-parameters for the sparse MLP / attention routers.

    Generalized (backward-compatibly) to accept keyword overrides at
    construction time instead of forcing callers to mutate attributes after
    instantiation; a no-argument call produces exactly the old defaults.

    Attributes:
        mlp_low_rank_dim: hidden width of the low-rank MLP router.
        attn_low_rank_dim: hidden width of the low-rank attention router.
        mlp_act_th: activation threshold used by the MLP routers.
        attn_topk: top-k fraction used by the attention routers.
    """

    def __init__(self, mlp_low_rank_dim=1024, attn_low_rank_dim=128, mlp_act_th=0.5, attn_topk=0.3):
        self.mlp_low_rank_dim = mlp_low_rank_dim
        self.attn_low_rank_dim = attn_low_rank_dim
        self.mlp_act_th = mlp_act_th
        self.attn_topk = attn_topk
21
+
22
def shard_state_dict_tp(state_dict, config, world_size, rank):
    """Convert the state_dict of a standard GPT model to the state_dict of a GPT model
    with tensor parallel.

    Builds a NEW dict (the input is not mutated). Column-parallel tensors
    (fc1, Wqkv, embeddings, lm_head) are sharded on the first dim; row-parallel
    tensors (fc2.weight, out_proj.weight, position embeddings) on the last dim.
    Row-parallel biases are kept only on rank 0 (they are added once after the
    all-reduce); layernorm parameters are replicated on every rank.
    """
    pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
    # Padded vocab size must divide evenly across ranks.
    vocab_size = (
        math.ceil(config.vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
    )
    assert vocab_size % world_size == 0
    assert config.hidden_size % world_size == 0
    # inner_dim is computed only for the divisibility check below.
    inner_dim = config.n_inner if config.n_inner is not None else 4 * config.hidden_size
    assert inner_dim % world_size == 0

    shared_state_dict = {}

    # Take this rank's contiguous slice along dim 0 (column-parallel layers).
    def shard_first_dim(new, old, key):
        x = old[key]
        dim = x.shape[0] // world_size
        new[key] = x[rank * dim : (rank + 1) * dim]

    # Take this rank's contiguous slice along the last dim (row-parallel layers).
    def shard_last_dim(new, old, key):
        x = old[key]
        dim = x.shape[-1] // world_size
        new[key] = x[..., rank * dim : (rank + 1) * dim]

    # Wqkv packs Q, K, V along dim 0; split per-head within each of the three
    # sub-tensors so every rank gets matching Q/K/V head slices.
    def shard_qkv_headdim(new, old, key):
        x = rearrange(old[key], "(three d) ... -> three d ...", three=3)
        dim = x.shape[1] // world_size
        new[key] = rearrange(
            x[:, rank * dim : (rank + 1) * dim], "three d ... -> (three d) ..."
        )

    shard_first_dim(shared_state_dict, state_dict, "transformer.embeddings.word_embeddings.weight")

    if "lm_head.weight" in state_dict:
        shard_first_dim(shared_state_dict, state_dict, "lm_head.weight")
    if "transformer.embeddings.position_embeddings.weight" in state_dict:
        shard_last_dim(shared_state_dict, state_dict, "transformer.embeddings.position_embeddings.weight")

    for i in range(config.num_hidden_layers):
        # attention
        shard_qkv_headdim(shared_state_dict, state_dict, f"transformer.layers.{i}.mixer.Wqkv.weight")
        shard_qkv_headdim(shared_state_dict, state_dict, f"transformer.layers.{i}.mixer.Wqkv.bias")
        shard_last_dim(shared_state_dict, state_dict, f"transformer.layers.{i}.mixer.out_proj.weight")

        # mlp
        shard_first_dim(shared_state_dict, state_dict, f"transformer.layers.{i}.mlp.fc1.weight")
        shard_first_dim(shared_state_dict, state_dict, f"transformer.layers.{i}.mlp.fc1.bias")
        shard_last_dim(shared_state_dict, state_dict, f"transformer.layers.{i}.mlp.fc2.weight")

        # Row-parallel output biases live only on rank 0 (added post-reduce);
        # other ranks simply omit the keys.
        if rank == 0:
            shared_state_dict[f"transformer.layers.{i}.mlp.fc2.bias"] = state_dict[f"transformer.layers.{i}.mlp.fc2.bias"]
            shared_state_dict[f"transformer.layers.{i}.mixer.out_proj.bias"] = state_dict[f"transformer.layers.{i}.mixer.out_proj.bias"]

        # LayerNorms are replicated on every rank.
        shared_state_dict[f"transformer.layers.{i}.norm1.weight"] = state_dict[f"transformer.layers.{i}.norm1.weight"]
        shared_state_dict[f"transformer.layers.{i}.norm1.bias"] = state_dict[f"transformer.layers.{i}.norm1.bias"]
        shared_state_dict[f"transformer.layers.{i}.norm2.weight"] = state_dict[f"transformer.layers.{i}.norm2.weight"]
        shared_state_dict[f"transformer.layers.{i}.norm2.bias"] = state_dict[f"transformer.layers.{i}.norm2.bias"]

        # routers

        # mlp router: fc1 replicated, fc2 sharded to match the sharded mlp.fc1
        # neurons it scores. NOTE(review): router bias terms (e.g.
        # mlp_router.fc2.bias) are not handled here — confirm the router
        # modules are bias-free or that missing keys are tolerated downstream.
        shared_state_dict[f"transformer.layers.{i}.mlp_router.fc1.weight"] = state_dict[f"transformer.layers.{i}.mlp_router.fc1.weight"]
        shard_first_dim(shared_state_dict, state_dict, f"transformer.layers.{i}.mlp_router.fc2.weight")

        # mha router
        shard_first_dim(shared_state_dict, state_dict, f"transformer.layers.{i}.mha_router.linear1.weight")
        shard_first_dim(shared_state_dict, state_dict, f"transformer.layers.{i}.mha_router.linear1.bias")

    shared_state_dict[f"transformer.ln_f.weight"] = state_dict["transformer.ln_f.weight"]
    shared_state_dict[f"transformer.ln_f.bias"] = state_dict["transformer.ln_f.bias"]

    # shared_state_dict[f"transformer.ln_f.weight"] = state_dict["transformer.final_layer_norm.weight"]
    # shared_state_dict[f"transformer.ln_f.bias"] = state_dict["transformer.final_layer_norm.bias"]

    return shared_state_dict
98
+
99
# NOTE: a legacy in-place variant of shard_state_dict_tp (it mutated the input
# state_dict and popped the out_proj/fc2 biases on rank != 0) used to be kept
# here inside a module-level triple-quoted string. Dead code removed — recover
# it from version control if it is ever needed again.
150
+
151
def build_sparse_opt(args, model_name, mlp_ckpt_dir, attn_ckpt_dir, device = None, dtype=torch.float16, process_group = None, world_size = None, rank = None):
    """Build an OPT model with sparse MLP and attention routers.

    Translates the HF OPT config to flash-attn's GPT-2-style config, enables
    fused/flash kernels (CUDA only), attaches a ``SparseConfig``, and loads the
    pretrained weights merged with the router checkpoints. Supports tensor
    parallelism via ``process_group`` / ``world_size`` / ``rank``.

    Args:
        args: namespace providing ``attn_topk``.
        model_name: HF hub id or local path of the OPT checkpoint.
        mlp_ckpt_dir / attn_ckpt_dir: router checkpoint directories; if either
            is None the routers keep their fresh initialization.
        device / dtype: placement and parameter dtype.
        process_group / world_size / rank: tensor-parallel context (optional).

    Returns:
        The sparse ``GPTLMHeadModel`` with weights loaded.
    """
    cfg = opt_config_to_gpt2_config(OPTConfig.from_pretrained(model_name))

    # Fused/flash kernels are CUDA-only; disable everything on CPU.
    on_cpu = device in ('cpu', torch.device('cpu'))
    cfg.fused_mlp = not on_cpu
    cfg.fused_dropout_add_ln = not on_cpu
    cfg.use_flash_attn = not on_cpu
    cfg.fused_bias_fc = not on_cpu
    if not on_cpu:
        cfg.sequence_parallel = False

    cfg.residual_in_fp32 = getattr(cfg, "prenorm", True)
    cfg.pad_vocab_size_multiple = 8
    cfg.mlp_sparse = True
    cfg.att_sparse = True

    cfg.use_heuristic = True
    if cfg.use_heuristic:
        print("Using pre-compiled heuristic")
    else:
        print("Compiling new heuristic during runtime")

    sparse_cfg = SparseConfig()
    sparse_cfg.mlp_act_th = 0.5  # threshold for the MLP routers of all layers
    sparse_cfg.attn_topk = args.attn_topk  # topk for the attention routers of all layers

    # build model
    print("Bulding Model with sparse routers")  # message kept verbatim
    model = GPTLMHeadModelSparse(config = cfg, sp_config = sparse_cfg, process_group = process_group, device = device, dtype=dtype)

    # load pretrained weights, remapped to flash-attn naming
    weights = remap_state_dict_hf_opt(
        state_dict_from_pretrained(model_name, device="cpu", dtype=dtype), cfg
    )

    # merge in the trained routers when both checkpoints are available
    if mlp_ckpt_dir is not None and attn_ckpt_dir is not None:
        merged = {
            **weights,
            **create_mlp_router_state_dict(mlp_ckpt_dir),
            **create_attn_router_state_dict(attn_ckpt_dir),
        }
        if process_group is not None:
            merged = shard_state_dict_tp(merged, cfg, world_size, rank)
        model.load_state_dict(merged, strict=True)
    else:
        # No routers: load non-strictly so router params keep their init.
        if process_group is not None:
            weights = shard_state_dict_tp(weights, cfg, world_size, rank)
        model.load_state_dict(weights, strict=False)

    return model
211
+
212
def build_dense_opt(model_name, device = None, dtype=torch.float16, process_group = None, world_size = None, rank = None):
    """Build a dense (non-sparse) OPT ``GPTLMHeadModel`` from a pretrained checkpoint.

    Bug fix: the previous version re-assigned ``dtype = torch.float16`` inside
    the body, silently ignoring any caller-supplied dtype. The parameter is now
    honored (the default is unchanged, so existing callers behave identically).

    Args:
        model_name: HF hub id or local path of the OPT checkpoint.
        device: target device for the model parameters.
        dtype: parameter dtype (default fp16, as before).
        process_group / world_size / rank: tensor-parallel context, forwarded
            to ``GPTLMHeadModel.from_pretrained``.

    Returns:
        The dense ``GPTLMHeadModel``.
    """
    config = opt_config_to_gpt2_config(OPTConfig.from_pretrained(model_name))
    config.use_flash_attn = True
    config.fused_bias_fc = True
    config.fused_mlp = True
    # config.fused_dropout_add_ln = True
    config.sequence_parallel = False
    # Only prenorm supports residual_in_fp32
    config.residual_in_fp32 = getattr(config, "prenorm", True)
    config.pad_vocab_size_multiple = 8

    # build model
    print("Building Dense Model")
    model = GPTLMHeadModel.from_pretrained(model_name, config, process_group = process_group, world_size = world_size, rank = rank, device=device, dtype=dtype)

    return model
HybridTensor/modules/SelectiveBlock.py ADDED
@@ -0,0 +1,960 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from functools import partial
2
+ from typing import Optional
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from torch import Tensor
8
+ from torchvision.ops import StochasticDepth
9
+
10
+ try:
11
+ from flash_attn.ops.triton.layer_norm import layer_norm_fn, RMSNorm
12
+ except ImportError:
13
+ layer_norm_fn, RMSNorm = None, None
14
+
15
+ class SelectBlock(nn.Module):
16
    def __init__(
        self,
        dim,
        mixer_cls=None,
        mlp_cls=None,
        mlp_router=None,
        mha_router=None,
        norm_cls=nn.LayerNorm,
        dropout_cls=nn.Dropout,
        prenorm=True,
        resid_dropout1=0.0,
        resid_dropout2=0.0,
        drop_path1=0.0,
        drop_path2=0.0,
        fused_dropout_add_ln=False,
        return_residual=False,
        residual_in_fp32=False,
        sequence_parallel=False,
        mark_shared_params=False,
    ):
        """
        For prenorm=True, this Block has a slightly different structure compared to a regular
        prenorm Transformer block.

        The standard block is: LN -> MHA -> Dropout -> Add -> LN -> MLP -> Dropout -> Add.
        Here we do: Dropout -> Add -> LN -> MHA -> Dropout -> Add -> LN -> MLP, etc.

        If you want to do concurrency with CUDA graphs, your shapes must remain fixed
        (batch_size, seq_len, etc.) across captures and replays. Also avoid any operations
        that cause dynamic shape changes or memory allocations.

        Args:
            dim: hidden size of the block.
            mixer_cls / mlp_cls: factories (dim -> module) for attention and MLP;
                both are required (asserted below).
            mlp_router / mha_router: optional factories (dim -> module) for the
                sparse neuron/head selection routers.
            norm_cls / dropout_cls: layer-norm and dropout constructors.
            fused_dropout_add_ln: use the Triton fused dropout+add+LN kernel.
            sequence_parallel / mark_shared_params: tag norm parameters for
                tensor-parallel handling; mark_shared_params doubles as the
                tensor-parallel switch (see use_tensor_parallel below).

        NOTE: constructing this block creates CUDA streams/events, so a CUDA
        device must be available at construction time.
        """
        super().__init__()
        self.prenorm = prenorm
        self.fused_dropout_add_ln = fused_dropout_add_ln
        self.return_residual = return_residual
        self.residual_in_fp32 = residual_in_fp32
        if self.residual_in_fp32:
            assert self.prenorm, "residual_in_fp32 is only compatible with prenorm=True"

        assert mixer_cls is not None and mlp_cls is not None, (
            "mixer_cls and mlp_cls cannot be None in SelectBlock"
        )

        # MHA & MLP submodules
        self.mixer = mixer_cls(dim)
        self.dropout1 = dropout_cls(resid_dropout1)
        self.drop_path1 = StochasticDepth(drop_path1, mode="row")
        self.norm1 = norm_cls(dim)
        self.mlp = mlp_cls(dim)
        # Number of fc1 output neurons — used to decide when sparsity is not
        # worth it (mlp_topk close to total_neurons).
        self.total_neurons = self.mlp.fc1.weight.shape[0]

        # Routers
        if mlp_router is not None:
            self.mlp_router = mlp_router(dim)
            # NOTE(review): this looks like a copy-paste slip — the mlp_router
            # branch sets skip_attn_router (not skip_mlp_router). It is
            # harmless today because both skip flags are unconditionally reset
            # to False further down, but worth confirming the intent.
            self.skip_attn_router = False
        else:
            self.mlp_router = None
            self.skip_attn_router = True

        if mha_router is not None:
            self.mha_router = mha_router(dim)
        else:
            self.mha_router = None

        if not isinstance(self.mlp, nn.Identity):
            self.dropout2 = dropout_cls(resid_dropout2)
            self.drop_path2 = StochasticDepth(drop_path2, mode="row")
            self.norm2 = norm_cls(dim)

        if self.fused_dropout_add_ln:
            assert layer_norm_fn is not None, "Triton layer_norm_fn not installed"
            assert isinstance(self.norm1, (nn.LayerNorm, RMSNorm)) and isinstance(self.dropout1, nn.Dropout)

        # Mark the norm parameters for sequence parallel / shared params if needed
        if sequence_parallel:
            for p in self.norm1.parameters():
                p._sequence_parallel = True
            if hasattr(self, "norm2"):
                for p in self.norm2.parameters():
                    p._sequence_parallel = True
        if mark_shared_params:
            for p in self.norm1.parameters():
                p._shared_params = True
            if hasattr(self, "norm2"):
                for p in self.norm2.parameters():
                    p._shared_params = True

        # Runtime sparsity knobs; mlp_topk is set per-forward (see forward()).
        # These resets override the branch assignments above.
        self.mlp_topk = None
        self.skip_mlp_router = False
        self.skip_attn_router = False

        # We'll use an extra stream for concurrency
        # (sparse_stream runs the MLP router while main_stream runs MHA).
        self.sparse_stream = torch.cuda.Stream(device="cuda", priority=0)
        self.main_stream = torch.cuda.Stream(device="cuda", priority=-5)
        # We'll record events to coordinate concurrency
        self.mha_event = torch.cuda.Event(enable_timing=False, blocking=False)
        self.mlp_event = torch.cuda.Event(enable_timing=False, blocking=False)

        # NOTE(review): mark_shared_params is reused as the tensor-parallel
        # switch — confirm callers always pass it exactly when running TP.
        self.use_tensor_parallel = mark_shared_params

        if self.use_tensor_parallel:
            # save the stream and events in the mixer and mlp classes
            self.mlp.router = self.mlp_router
            self.mixer.router = self.mha_router

        self.mlp_topk_layers = None  # this will be a dictionary of layer_idx -> topk value
        self.attn_topk_layers = None  # this will be a dictionary of layer_idx -> topk value
+ self.attn_topk_layers = None # this will be a dictionary of layer_idx -> topk value
123
+
124
+ def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
125
+ return self.mixer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs)
126
+
127
    def prefill_forward(self, hidden_states: Tensor, residual: Optional[Tensor] = None, mixer_kwargs=None, mixer_subset=None):
        """Dense prefill pass: full MHA, then dropout/add/LN, then full MLP.

        No routers are consulted during prefill — sparsity is applied only at
        decode time. Caller (forward()) has already applied the first
        dropout/add/norm, so this receives post-norm1 hidden states.

        Returns:
            (hidden_states, residual) for the prenorm residual stream.
        """
        hidden_states = self.mixer(hidden_states, **mixer_kwargs)

        if mixer_subset is not None:
            # Keep only the residual rows matching the mixer's output subset.
            residual = residual[:, mixer_subset]

        if not isinstance(self.mlp, nn.Identity):
            if not self.fused_dropout_add_ln:
                # Unfused path: dropout -> stochastic depth -> add -> norm2.
                dropped = self.drop_path2(self.dropout2(hidden_states))
                if dropped.shape != residual.shape:
                    dropped = dropped.view(residual.shape)
                residual = (dropped + residual) if residual is not None else dropped
                hidden_states = self.norm2(residual.to(dtype=self.norm2.weight.dtype))
                if self.residual_in_fp32:
                    residual = residual.to(torch.float32)
            else:
                # Fused Triton kernel path. rowscale implements stochastic
                # depth by scaling whole rows; None when inactive.
                if self.drop_path2.p == 0 or not self.training:
                    rowscale2 = None
                else:
                    rowscale2 = self.drop_path2(
                        torch.ones(
                            hidden_states.shape[:-1],
                            device=hidden_states.device,
                            dtype=hidden_states.dtype,
                        )
                    )
                if hidden_states.shape != residual.shape:
                    hidden_states = hidden_states.view(residual.shape)
                hidden_states, residual = layer_norm_fn(
                    hidden_states,
                    self.norm2.weight,
                    self.norm2.bias,
                    residual=residual,
                    eps=self.norm2.eps,
                    dropout_p=self.dropout2.p if self.training else 0.0,
                    rowscale=rowscale2,
                    prenorm=True,
                    residual_in_fp32=self.residual_in_fp32,
                    is_rms_norm=isinstance(self.norm2, RMSNorm),
                )
            hidden_states = self.mlp(hidden_states)
        return hidden_states, residual
169
+
170
    def decode_forward(self, hidden_states: Tensor, residual: Optional[Tensor] = None, mixer_subset=None, mixer_kwargs=None):
        """Single-GPU decode step with concurrent router/MHA execution.

        The MLP router (neuron selection) runs on ``sparse_stream`` while the
        attention router + MHA run on ``main_stream``; events serialize the
        hand-off before the sparse MLP consumes ``index_vec``.

        Args:
            hidden_states (Tensor): post-norm1 states, assumed (batch, 1, dim)
                during decode — squeezed to (batch, dim) for the routers.
            residual (Optional[Tensor], optional): prenorm residual stream.
            mixer_subset (_type_, optional): optional row subset. Defaults to None.
        """
        curr_stream = torch.cuda.current_stream()

        # We want to run MHA & mlp_router in parallel on different streams
        router_inputs = hidden_states.squeeze(1)  # shape (batch_size, dim)
        # Both side streams must observe all prior work on the current stream.
        self.main_stream.wait_stream(curr_stream)
        self.sparse_stream.wait_stream(curr_stream)
        main_stream = self.main_stream

        # if mlp_topk > th * total_neurons, skip mlp router

        # if self.mlp_topk > 0.8 * self.total_neurons:
        #     self.skip_mlp_router = True
        # else:
        #     self.skip_mlp_router = False

        # [Sparse stream] mlp_router — selects the active fc1 neurons.
        if not self.skip_mlp_router:
            with torch.cuda.stream(self.sparse_stream):
                index_vec = self.mlp_router._select_neurons_topk(router_inputs, topk = self.mlp_topk)
                self.sparse_stream.record_event(self.mlp_event)

        # [Main stream] MHA with head selection from the attention router.
        with torch.cuda.stream(main_stream):
            batch_head_idx = self.mha_router._select_heads(router_inputs)
            hidden_states = self.mixer(
                hidden_states,
                batch_head_idx=batch_head_idx,
                **mixer_kwargs
            )

            main_stream.record_event(self.mha_event)

        # Now we unify after both are done, then do the next steps
        with torch.cuda.stream(main_stream):
            # Wait on router & MHA
            # NOTE(review): curr_stream.wait_stream(main_stream) here makes the
            # current stream wait before the MLP below is issued on main_stream;
            # the trailing waits after the block repeat this — confirm intended.
            curr_stream.wait_stream(main_stream)
            main_stream.wait_event(self.mha_event)

            # normal residual / layernorm
            if mixer_subset is not None:
                residual = residual[:, mixer_subset]

            if not isinstance(self.mlp, nn.Identity):
                if not self.fused_dropout_add_ln:
                    dropped = self.drop_path2(self.dropout2(hidden_states))
                    residual = (dropped + residual) if residual is not None else dropped
                    hidden_states = self.norm2(
                        residual.to(dtype=self.norm2.weight.dtype)
                    )
                    if self.residual_in_fp32:
                        residual = residual.to(torch.float32)
                else:
                    # Fused dropout+add+LN path (Triton kernel).
                    if self.drop_path2.p == 0 or not self.training:
                        rowscale2 = None
                    else:
                        rowscale2 = self.drop_path2(
                            torch.ones(
                                hidden_states.shape[:-1],
                                device=hidden_states.device,
                                dtype=hidden_states.dtype,
                            )
                        )
                    if hidden_states.shape != residual.shape:
                        hidden_states = hidden_states.view(residual.shape)
                    hidden_states, residual = layer_norm_fn(
                        hidden_states,
                        self.norm2.weight,
                        self.norm2.bias,
                        residual=residual,
                        eps=self.norm2.eps,
                        dropout_p=self.dropout2.p if self.training else 0.0,
                        rowscale=rowscale2,
                        prenorm=True,
                        residual_in_fp32=self.residual_in_fp32,
                        is_rms_norm=isinstance(self.norm2, RMSNorm),
                    )

                # hidden_states = self.mlp(hidden_states, index_vec=test_index_vec, index_size=test_index_size)
                if self.skip_mlp_router:
                    # Dense MLP fallback — no neuron selection.
                    hidden_states = self.mlp(hidden_states, index_vec=None)
                else:
                    # Make sure the router's index_vec is ready before the
                    # sparse MLP reads it.
                    curr_stream.wait_stream(self.sparse_stream)
                    main_stream.wait_event(self.mlp_event)
                    hidden_states = self.mlp(hidden_states, index_vec=index_vec)
        # Re-join both side streams with the caller's stream.
        curr_stream.wait_stream(main_stream)
        curr_stream.wait_stream(self.sparse_stream)

        return hidden_states, residual
266
+
267
    def tp_decode_forward(self, hidden_states: Tensor, residual: Optional[Tensor] = None, mixer_subset=None, mixer_kwargs=None):
        """
        Tensor Parallel Decode Forward.

        Unlike decode_forward, the attention router runs synchronously on the
        current stream (its result feeds the TP all-reduce path inside the
        mixer); only the MLP router is offloaded to ``sparse_stream``.
        """

        curr_stream = torch.cuda.current_stream()
        self.sparse_stream.wait_stream(curr_stream)
        # self.main_stream.wait_stream(curr_stream)

        router_inputs = hidden_states.squeeze(1)  # shape (batch_size, dim)

        # Sparsity is not worth it once topk approaches the full neuron count.
        if self.mlp_topk > 0.8 * self.total_neurons:
            self.skip_mlp_router = True
        else:
            self.skip_mlp_router = False

        # attention router is synchronous
        batch_head_idx = self.mha_router._select_heads(router_inputs)

        # mlp router is asynchronous
        if not self.skip_mlp_router:
            with torch.cuda.stream(self.sparse_stream):
                index_vec = self.mlp_router._select_neurons_topk(router_inputs, topk = self.mlp_topk)
                self.sparse_stream.record_event(self.mlp_event)

        hidden_states = self.mixer(hidden_states, **mixer_kwargs, batch_head_idx=batch_head_idx)

        if mixer_subset is not None:
            residual = residual[:, mixer_subset]

        if not isinstance(self.mlp, nn.Identity):
            if not self.fused_dropout_add_ln:
                # Unfused dropout -> add -> norm2 path.
                dropped = self.drop_path2(self.dropout2(hidden_states))
                if dropped.shape != residual.shape:
                    dropped = dropped.view(residual.shape)
                residual = (dropped + residual) if residual is not None else dropped
                hidden_states = self.norm2(residual.to(dtype=self.norm2.weight.dtype))
                if self.residual_in_fp32:
                    residual = residual.to(torch.float32)
            else:
                # Fused Triton dropout+add+LN path.
                if self.drop_path2.p == 0 or not self.training:
                    rowscale2 = None
                else:
                    rowscale2 = self.drop_path2(
                        torch.ones(
                            hidden_states.shape[:-1],
                            device=hidden_states.device,
                            dtype=hidden_states.dtype,
                        )
                    )
                if hidden_states.shape != residual.shape:
                    hidden_states = hidden_states.view(residual.shape)
                hidden_states, residual = layer_norm_fn(
                    hidden_states,
                    self.norm2.weight,
                    self.norm2.bias,
                    residual=residual,
                    eps=self.norm2.eps,
                    dropout_p=self.dropout2.p if self.training else 0.0,
                    rowscale=rowscale2,
                    prenorm=True,
                    residual_in_fp32=self.residual_in_fp32,
                    is_rms_norm=isinstance(self.norm2, RMSNorm),
                )

            # curr_stream.wait_stream(self.sparse_stream)
            if self.skip_mlp_router:
                hidden_states = self.mlp(hidden_states, index_vec=None)
            else:
                # Only wait on the router's completion event (not the whole
                # sparse stream) before the sparse MLP reads index_vec.
                curr_stream.wait_event(self.mlp_event)
                hidden_states = self.mlp(hidden_states, index_vec=index_vec)

        return hidden_states, residual
341
+
342
    def attn_sparse_forward(self, hidden_states: Tensor, residual: Optional[Tensor] = None, mixer_subset=None, mixer_kwargs=None):
        """
        Decode Forward with Sparse Attention Router only (dense MLP).

        Fully synchronous — no side streams — so it works on both single-GPU
        and tensor-parallel setups. The attention router picks the active heads
        per batch row; the MLP runs dense.
        """

        # We want to run MHA & mlp_router in parallel on different streams
        router_inputs = hidden_states.squeeze(1)  # shape (batch_size, dim)

        batch_head_idx = self.mha_router._select_heads(router_inputs)

        # print(f"hidden_states shape: {hidden_states.shape}")
        # print(f"hidden states: {hidden_states}")
        hidden_states = self.mixer(hidden_states, batch_head_idx=batch_head_idx, **mixer_kwargs)

        # normal residual / layernorm
        if mixer_subset is not None:
            residual = residual[:, mixer_subset]

        if not isinstance(self.mlp, nn.Identity):
            if not self.fused_dropout_add_ln:
                # Unfused dropout -> add -> norm2 path.
                dropped = self.drop_path2(self.dropout2(hidden_states))
                residual = (dropped + residual) if residual is not None else dropped
                hidden_states = self.norm2(
                    residual.to(dtype=self.norm2.weight.dtype)
                )
                if self.residual_in_fp32:
                    residual = residual.to(torch.float32)
            else:
                # Fused Triton dropout+add+LN path.
                if self.drop_path2.p == 0 or not self.training:
                    rowscale2 = None
                else:
                    rowscale2 = self.drop_path2(
                        torch.ones(hidden_states.shape[:-1], device=hidden_states.device, dtype=hidden_states.dtype,)
                    )
                if hidden_states.shape != residual.shape:
                    hidden_states = hidden_states.view(residual.shape)
                hidden_states, residual = layer_norm_fn(hidden_states, self.norm2.weight, self.norm2.bias, residual=residual,
                                                        eps=self.norm2.eps, dropout_p=self.dropout2.p if self.training else 0.0,
                                                        rowscale=rowscale2, prenorm=True, residual_in_fp32=self.residual_in_fp32,
                                                        is_rms_norm=isinstance(self.norm2, RMSNorm),)

            # hidden_states = self.mlp(hidden_states, index_vec=test_index_vec, index_size=test_index_size)
            hidden_states = self.mlp(hidden_states)

        return hidden_states, residual
387
+
388
    def mlp_sparse_forward(self, hidden_states: Tensor, residual: Optional[Tensor] = None, mixer_subset=None, mixer_kwargs=None):
        """Single-GPU decode step with sparse MLP only (dense attention).

        Mirrors decode_forward's two-stream layout, but the MHA runs dense
        (``batch_head_idx=None``) while the MLP router runs concurrently on
        ``sparse_stream``.

        Args:
            hidden_states (Tensor): post-norm1 states, assumed (batch, 1, dim).
            residual (Optional[Tensor], optional): prenorm residual stream.
            mixer_subset (_type_, optional): optional row subset. Defaults to None.
        """
        curr_stream = torch.cuda.current_stream()

        # We want to run MHA & mlp_router in parallel on different streams
        router_inputs = hidden_states.squeeze(1)  # shape (batch_size, dim)
        self.main_stream.wait_stream(curr_stream)
        self.sparse_stream.wait_stream(curr_stream)
        main_stream = self.main_stream

        # if mlp_topk > th * total_neurons, skip mlp router

        # Sparsity is not worth it once topk approaches the full neuron count.
        if self.mlp_topk > 0.8 * self.total_neurons:
            self.skip_mlp_router = True
        else:
            self.skip_mlp_router = False

        # [Sparse stream] mlp_router — selects the active fc1 neurons.
        if not self.skip_mlp_router:
            with torch.cuda.stream(self.sparse_stream):
                index_vec = self.mlp_router._select_neurons_topk(router_inputs, topk = self.mlp_topk)
                self.sparse_stream.record_event(self.mlp_event)

        # [Main stream] MHA — dense (no head selection in this variant).
        with torch.cuda.stream(main_stream):
            # batch_head_idx = self.mha_router._select_heads(router_inputs)
            hidden_states = self.mixer(
                hidden_states,
                batch_head_idx=None,
                **mixer_kwargs
            )

            main_stream.record_event(self.mha_event)

        # Now we unify after both are done, then do the next steps
        with torch.cuda.stream(main_stream):
            # Wait on router & MHA
            curr_stream.wait_stream(main_stream)
            main_stream.wait_event(self.mha_event)

            # normal residual / layernorm
            if mixer_subset is not None:
                residual = residual[:, mixer_subset]

            if not isinstance(self.mlp, nn.Identity):
                if not self.fused_dropout_add_ln:
                    dropped = self.drop_path2(self.dropout2(hidden_states))
                    residual = (dropped + residual) if residual is not None else dropped
                    hidden_states = self.norm2(
                        residual.to(dtype=self.norm2.weight.dtype)
                    )
                    if self.residual_in_fp32:
                        residual = residual.to(torch.float32)
                else:
                    # Fused Triton dropout+add+LN path.
                    if self.drop_path2.p == 0 or not self.training:
                        rowscale2 = None
                    else:
                        rowscale2 = self.drop_path2(
                            torch.ones(
                                hidden_states.shape[:-1],
                                device=hidden_states.device,
                                dtype=hidden_states.dtype,
                            )
                        )
                    if hidden_states.shape != residual.shape:
                        hidden_states = hidden_states.view(residual.shape)
                    hidden_states, residual = layer_norm_fn(
                        hidden_states,
                        self.norm2.weight,
                        self.norm2.bias,
                        residual=residual,
                        eps=self.norm2.eps,
                        dropout_p=self.dropout2.p if self.training else 0.0,
                        rowscale=rowscale2,
                        prenorm=True,
                        residual_in_fp32=self.residual_in_fp32,
                        is_rms_norm=isinstance(self.norm2, RMSNorm),
                    )

                # hidden_states = self.mlp(hidden_states, index_vec=test_index_vec, index_size=test_index_size)
                if self.skip_mlp_router:
                    # Dense MLP fallback — no neuron selection.
                    hidden_states = self.mlp(hidden_states, index_vec=None)
                else:
                    # Ensure the router's index_vec is ready before use.
                    curr_stream.wait_stream(self.sparse_stream)
                    main_stream.wait_event(self.mlp_event)
                    hidden_states = self.mlp(hidden_states, index_vec=index_vec)
        # Re-join both side streams with the caller's stream.
        curr_stream.wait_stream(main_stream)
        curr_stream.wait_stream(self.sparse_stream)

        return hidden_states, residual
484
+
485
    def forward(
        self,
        hidden_states: Tensor,
        residual: Optional[Tensor] = None,
        mixer_subset=None,
        mixer_kwargs=None,
        mlp_topk=None,
        attn_topk=None,
    ):
        """
        Prenorm transformer block forward with concurrency logic in the decode branch.

        If you're capturing with a CUDA graph, the concurrency (two-stream usage) must be
        inside the captured region so that the replay reproduces the parallel streams.

        Args:
            hidden_states: block input (presumably (batch, seqlen, dim) — TODO confirm).
            residual: running residual stream from the previous block, or None for the
                first block.
            mixer_subset: optional subset passed through to the mixer via mixer_kwargs.
            mixer_kwargs: kwargs forwarded to the attention mixer; an
                "inference_params" key is guaranteed to exist after normalization below.
            mlp_topk: if given, overrides self.mlp_topk for this call (simulation knob).
            attn_topk: if given, overrides the attention router's topk (simulation knob).

        Returns:
            (hidden_states, residual) tuple, as produced by the selected
            prefill/decode sub-forward.

        Raises:
            NotImplementedError: for the post-norm (prenorm=False) architecture.
        """

        # simulation values: per-call overrides of the sparsity ratios
        if mlp_topk is not None:
            self.mlp_topk = mlp_topk

        if attn_topk is not None:
            self.mha_router.topk = attn_topk

        # Normalize mixer_kwargs so downstream code can always index "inference_params".
        if mixer_kwargs is None:
            mixer_kwargs = {"inference_params": None}
        else:
            # Ensure 'inference_params' key exists
            if "inference_params" not in mixer_kwargs:
                mixer_kwargs["inference_params"] = None

        if self.prenorm:
            # --- 1) Prenorm's dropout/add/layernorm
            if not self.fused_dropout_add_ln:
                # Unfused path: dropout -> residual add -> norm, with optional fp32 residual.
                dropped = self.drop_path1(self.dropout1(hidden_states))
                residual = (dropped + residual) if residual is not None else dropped
                hidden_states = self.norm1(residual.to(dtype=self.norm1.weight.dtype))
                if self.residual_in_fp32:
                    residual = residual.to(torch.float32)
            else:
                # fused dropout + add + layernorm (Triton layer_norm_fn)
                if self.drop_path1.p == 0 or not self.training:
                    rowscale1 = None
                else:
                    # StochasticDepth as a per-row scale fed into the fused kernel.
                    rowscale1 = self.drop_path1(
                        torch.ones(
                            hidden_states.shape[:-1],
                            device=hidden_states.device,
                            dtype=hidden_states.dtype,
                        )
                    )
                if residual is not None and hidden_states.shape != residual.shape:
                    hidden_states = hidden_states.view(residual.shape)
                hidden_states, residual = layer_norm_fn(
                    hidden_states,
                    self.norm1.weight,
                    self.norm1.bias,
                    residual=residual,
                    eps=self.norm1.eps,
                    dropout_p=self.dropout1.p if self.training else 0.0,
                    rowscale=rowscale1,
                    prenorm=True,
                    residual_in_fp32=self.residual_in_fp32,
                    is_rms_norm=isinstance(self.norm1, RMSNorm),
                )

            if mixer_subset is not None:
                mixer_kwargs["mixer_subset"] = mixer_subset

            # Check if we are in the prefill or decode stage
            # (no inference params, or a zero sequence offset, means prefill).
            prefill_stage = (
                mixer_kwargs["inference_params"] is None
                or mixer_kwargs["inference_params"].seqlen_offset == 0
            )

            if prefill_stage:
                # --- 2) Prefill stage (no concurrency): just do normal forward
                hidden_states, residual = self.prefill_forward(hidden_states, residual, mixer_kwargs, mixer_subset)

            else:
                # --- 3) Decode stage: dispatch on which routers exist and on tensor parallelism.
                if self.mlp_router is None:
                    # decode stage with only attention router, works with both single gpu and tensor parallel
                    hidden_states, residual = self.attn_sparse_forward(hidden_states, residual, mixer_subset, mixer_kwargs)
                else:
                    if not self.use_tensor_parallel:
                        if self.mha_router is None:
                            # decode stage with mlp routers (opt models and single gpu)
                            hidden_states, residual = self.mlp_sparse_forward(hidden_states, residual, mixer_subset, mixer_kwargs)
                        else:
                            # decode stage with mlp and attention routers (opt models and single gpu)
                            hidden_states, residual = self.decode_forward(hidden_states, residual, mixer_subset, mixer_kwargs)
                    else:
                        # uses both mlp and attention routers in tensor parallel
                        hidden_states, residual = self.tp_decode_forward(hidden_states, residual, mixer_subset, mixer_kwargs)

            return hidden_states, residual

        else:
            # post-norm architecture not implemented here
            raise NotImplementedError
584
+
585
+
586
+ # class SelectBlock(nn.Module):
587
+ # def __init__(
588
+ # self,
589
+ # dim,
590
+ # mixer_cls=None,
591
+ # mlp_cls=None,
592
+ # mlp_router=None,
593
+ # mha_router=None,
594
+ # norm_cls=nn.LayerNorm,
595
+ # dropout_cls=nn.Dropout,
596
+ # prenorm=True,
597
+ # resid_dropout1=0.0,
598
+ # resid_dropout2=0.0,
599
+ # drop_path1=0.0,
600
+ # drop_path2=0.0,
601
+ # fused_dropout_add_ln=False,
602
+ # return_residual=False,
603
+ # residual_in_fp32=False,
604
+ # sequence_parallel=False,
605
+ # mark_shared_params=False,
606
+ # ):
607
+ # """
608
+ # For prenorm=True, this Block has a slightly different structure compared to a regular
609
+ # prenorm Transformer block.
610
+
611
+ # The standard block is: LN -> MHA -> Dropout -> Add -> LN -> MLP -> Dropout -> Add.
612
+ # Here we do: Dropout -> Add -> LN -> MHA -> Dropout -> Add -> LN -> MLP, etc.
613
+
614
+ # If you want to do concurrency with CUDA graphs, your shapes must remain fixed
615
+ # (batch_size, seq_len, etc.) across captures and replays. Also avoid any operations
616
+ # that cause dynamic shape changes or memory allocations.
617
+ # """
618
+ # super().__init__()
619
+ # self.prenorm = prenorm
620
+ # self.fused_dropout_add_ln = fused_dropout_add_ln
621
+ # self.return_residual = return_residual
622
+ # self.residual_in_fp32 = residual_in_fp32
623
+ # if self.residual_in_fp32:
624
+ # assert self.prenorm, "residual_in_fp32 is only compatible with prenorm=True"
625
+
626
+ # assert mixer_cls is not None and mlp_cls is not None, (
627
+ # "mixer_cls and mlp_cls cannot be None in SelectBlock"
628
+ # )
629
+
630
+ # # MHA & MLP submodules
631
+ # self.mixer = mixer_cls(dim)
632
+ # self.dropout1 = dropout_cls(resid_dropout1)
633
+ # self.drop_path1 = StochasticDepth(drop_path1, mode="row")
634
+ # self.norm1 = norm_cls(dim)
635
+ # self.mlp = mlp_cls(dim)
636
+
637
+ # # Routers
638
+ # self.mlp_router = mlp_router(dim)
639
+ # self.mha_router = mha_router(dim)
640
+
641
+ # if not isinstance(self.mlp, nn.Identity):
642
+ # self.dropout2 = dropout_cls(resid_dropout2)
643
+ # self.drop_path2 = StochasticDepth(drop_path2, mode="row")
644
+ # self.norm2 = norm_cls(dim)
645
+
646
+ # if self.fused_dropout_add_ln:
647
+ # assert layer_norm_fn is not None, "Triton layer_norm_fn not installed"
648
+ # assert isinstance(self.norm1, (nn.LayerNorm, RMSNorm)) and isinstance(self.dropout1, nn.Dropout)
649
+
650
+ # # Mark the norm parameters for sequence parallel / shared params if needed
651
+ # if sequence_parallel:
652
+ # for p in self.norm1.parameters():
653
+ # p._sequence_parallel = True
654
+ # if hasattr(self, "norm2"):
655
+ # for p in self.norm2.parameters():
656
+ # p._sequence_parallel = True
657
+ # if mark_shared_params:
658
+ # for p in self.norm1.parameters():
659
+ # p._shared_params = True
660
+ # if hasattr(self, "norm2"):
661
+ # for p in self.norm2.parameters():
662
+ # p._shared_params = True
663
+
664
+ # self.mlp_topk = None
665
+ # self.skip_mlp_router = False
666
+ # self.skip_attn_router = False
667
+
668
+ # # We'll use an extra stream for concurrency
669
+ # self.sparse_stream = torch.cuda.Stream(device="cuda", priority=0)
670
+ # self.main_stream = torch.cuda.Stream(device="cuda", priority=-5)
671
+ # # We'll record events to coordinate concurrency
672
+ # self.mha_event = torch.cuda.Event(enable_timing=False, blocking=False)
673
+ # self.mlp_event = torch.cuda.Event(enable_timing=False, blocking=False)
674
+
675
+ # self.use_tensor_parallel = mark_shared_params
676
+
677
+ # if self.use_tensor_parallel:
678
+ # # TODO: save the routers in the mixer and mlp classes
679
+ # # save the stream and events in the mixer and mlp classes
680
+ # pass
681
+
682
+ # def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
683
+ # return self.mixer.allocate_inference_cache(batch_size, max_seqlen, dtype=dtype, **kwargs)
684
+
685
+ # def prefill_forward(self, hidden_states: Tensor, residual: Optional[Tensor] = None, mixer_kwargs=None, mixer_subset=None):
686
+ # hidden_states = self.mixer(hidden_states, **mixer_kwargs)
687
+
688
+ # if mixer_subset is not None:
689
+ # residual = residual[:, mixer_subset]
690
+
691
+ # if not isinstance(self.mlp, nn.Identity):
692
+ # if not self.fused_dropout_add_ln:
693
+ # dropped = self.drop_path2(self.dropout2(hidden_states))
694
+ # if dropped.shape != residual.shape:
695
+ # dropped = dropped.view(residual.shape)
696
+ # residual = (dropped + residual) if residual is not None else dropped
697
+ # hidden_states = self.norm2(residual.to(dtype=self.norm2.weight.dtype))
698
+ # if self.residual_in_fp32:
699
+ # residual = residual.to(torch.float32)
700
+ # else:
701
+ # if self.drop_path2.p == 0 or not self.training:
702
+ # rowscale2 = None
703
+ # else:
704
+ # rowscale2 = self.drop_path2(
705
+ # torch.ones(
706
+ # hidden_states.shape[:-1],
707
+ # device=hidden_states.device,
708
+ # dtype=hidden_states.dtype,
709
+ # )
710
+ # )
711
+ # if hidden_states.shape != residual.shape:
712
+ # hidden_states = hidden_states.view(residual.shape)
713
+ # hidden_states, residual = layer_norm_fn(
714
+ # hidden_states,
715
+ # self.norm2.weight,
716
+ # self.norm2.bias,
717
+ # residual=residual,
718
+ # eps=self.norm2.eps,
719
+ # dropout_p=self.dropout2.p if self.training else 0.0,
720
+ # rowscale=rowscale2,
721
+ # prenorm=True,
722
+ # residual_in_fp32=self.residual_in_fp32,
723
+ # is_rms_norm=isinstance(self.norm2, RMSNorm),
724
+ # )
725
+ # hidden_states = self.mlp(hidden_states)
726
+ # return hidden_states, residual
727
+
728
+ # def decode_forward(self, hidden_states: Tensor, residual: Optional[Tensor] = None, mixer_subset=None, mixer_kwargs=None):
729
+ # """ Single GPU Decode Forward
730
+
731
+ # Args:
732
+ # hidden_states (Tensor): _description_
733
+ # residual (Optional[Tensor], optional): _description_. Defaults to None.
734
+ # mixer_subset (_type_, optional): _description_. Defaults to None.
735
+ # """
736
+ # curr_stream = torch.cuda.current_stream()
737
+
738
+ # # We want to run MHA & mlp_router in parallel on different streams
739
+ # router_inputs = hidden_states.squeeze(1) # shape (batch_size, dim)
740
+ # self.main_stream.wait_stream(curr_stream)
741
+ # self.sparse_stream.wait_stream(curr_stream)
742
+
743
+ # # We'll do MHA on the "main_stream" and the router on "sparse_stream"
744
+ # main_stream = self.main_stream
745
+ # # In a captured region, each 'with torch.cuda.stream(...)' block
746
+ # # is replayed in concurrency. The shape must remain consistent.
747
+
748
+ # # [Sparse stream] mlp_router
749
+ # if not self.skip_mlp_router:
750
+ # with torch.cuda.stream(self.sparse_stream): # <-- CHANGED
751
+ # # index_size, index_vec = self.mlp_router._select_neurons_cuda_safe(router_inputs) # need to fix this; make CUDA Graph safe
752
+ # # vec = self.mlp_router(router_inputs)
753
+ # index_vec = self.mlp_router._select_neurons_topk(router_inputs, topk = self.mlp_topk)
754
+ # self.sparse_stream.record_event(self.mlp_event)
755
+
756
+ # # [Main stream] MHA
757
+ # with torch.cuda.stream(main_stream): # <-- CHANGED
758
+ # batch_head_idx = self.mha_router._select_heads(router_inputs)
759
+ # hidden_states = self.mixer(
760
+ # hidden_states,
761
+ # batch_head_idx=batch_head_idx,
762
+ # # batch_head_idx=None,
763
+ # **mixer_kwargs
764
+ # )
765
+ # main_stream.record_event(self.mha_event)
766
+
767
+ # # Now we unify after both are done, then do the next steps
768
+ # with torch.cuda.stream(main_stream): # <-- CHANGED
769
+ # # Wait on router & MHA
770
+ # curr_stream.wait_stream(main_stream)
771
+ # main_stream.wait_event(self.mha_event)
772
+
773
+ # # normal residual / layernorm
774
+ # if mixer_subset is not None:
775
+ # residual = residual[:, mixer_subset]
776
+
777
+ # if not isinstance(self.mlp, nn.Identity):
778
+ # if not self.fused_dropout_add_ln:
779
+ # dropped = self.drop_path2(self.dropout2(hidden_states))
780
+ # residual = (dropped + residual) if residual is not None else dropped
781
+ # hidden_states = self.norm2(
782
+ # residual.to(dtype=self.norm2.weight.dtype)
783
+ # )
784
+ # if self.residual_in_fp32:
785
+ # residual = residual.to(torch.float32)
786
+ # else:
787
+ # if self.drop_path2.p == 0 or not self.training:
788
+ # rowscale2 = None
789
+ # else:
790
+ # rowscale2 = self.drop_path2(
791
+ # torch.ones(
792
+ # hidden_states.shape[:-1],
793
+ # device=hidden_states.device,
794
+ # dtype=hidden_states.dtype,
795
+ # )
796
+ # )
797
+ # if hidden_states.shape != residual.shape:
798
+ # hidden_states = hidden_states.view(residual.shape)
799
+ # hidden_states, residual = layer_norm_fn(
800
+ # hidden_states,
801
+ # self.norm2.weight,
802
+ # self.norm2.bias,
803
+ # residual=residual,
804
+ # eps=self.norm2.eps,
805
+ # dropout_p=self.dropout2.p if self.training else 0.0,
806
+ # rowscale=rowscale2,
807
+ # prenorm=True,
808
+ # residual_in_fp32=self.residual_in_fp32,
809
+ # is_rms_norm=isinstance(self.norm2, RMSNorm),
810
+ # )
811
+
812
+ # # Finally do MLP with the router's index vector
813
+ # curr_stream.wait_stream(self.sparse_stream)
814
+ # main_stream.wait_event(self.mlp_event)
815
+
816
+ # # hidden_states = self.mlp(hidden_states, index_vec=test_index_vec, index_size=test_index_size)
817
+ # if self.skip_mlp_router:
818
+ # hidden_states = self.mlp(hidden_states, index_vec=None)
819
+ # else:
820
+ # hidden_states = self.mlp(hidden_states, index_vec=index_vec)
821
+ # curr_stream.wait_stream(main_stream)
822
+ # curr_stream.wait_stream(self.sparse_stream)
823
+
824
+ # return hidden_states, residual
825
+
826
+ # def tp_decode_forward(self, hidden_states: Tensor, residual: Optional[Tensor] = None, mixer_subset=None, mixer_kwargs=None):
827
+ # """
828
+ # Tensor Parallel Decode Forward
829
+
830
+ # Args:
831
+ # hidden_states (Tensor): _description_
832
+ # residual (Optional[Tensor], optional): _description_. Defaults to None.
833
+ # mixer_subset (_type_, optional): _description_. Defaults to None.
834
+ # """
835
+ # # TODO: need to add routing
836
+
837
+ # hidden_states = self.mixer(hidden_states, **mixer_kwargs)
838
+
839
+ # if mixer_subset is not None:
840
+ # residual = residual[:, mixer_subset]
841
+
842
+ # if not isinstance(self.mlp, nn.Identity):
843
+ # if not self.fused_dropout_add_ln:
844
+ # dropped = self.drop_path2(self.dropout2(hidden_states))
845
+ # if dropped.shape != residual.shape:
846
+ # dropped = dropped.view(residual.shape)
847
+ # residual = (dropped + residual) if residual is not None else dropped
848
+ # hidden_states = self.norm2(residual.to(dtype=self.norm2.weight.dtype))
849
+ # if self.residual_in_fp32:
850
+ # residual = residual.to(torch.float32)
851
+ # else:
852
+ # if self.drop_path2.p == 0 or not self.training:
853
+ # rowscale2 = None
854
+ # else:
855
+ # rowscale2 = self.drop_path2(
856
+ # torch.ones(
857
+ # hidden_states.shape[:-1],
858
+ # device=hidden_states.device,
859
+ # dtype=hidden_states.dtype,
860
+ # )
861
+ # )
862
+ # if hidden_states.shape != residual.shape:
863
+ # hidden_states = hidden_states.view(residual.shape)
864
+ # hidden_states, residual = layer_norm_fn(
865
+ # hidden_states,
866
+ # self.norm2.weight,
867
+ # self.norm2.bias,
868
+ # residual=residual,
869
+ # eps=self.norm2.eps,
870
+ # dropout_p=self.dropout2.p if self.training else 0.0,
871
+ # rowscale=rowscale2,
872
+ # prenorm=True,
873
+ # residual_in_fp32=self.residual_in_fp32,
874
+ # is_rms_norm=isinstance(self.norm2, RMSNorm),
875
+ # )
876
+ # hidden_states = self.mlp(hidden_states)
877
+ # return hidden_states, residual
878
+
879
+ # def forward(
880
+ # self,
881
+ # hidden_states: Tensor,
882
+ # residual: Optional[Tensor] = None,
883
+ # mixer_subset=None,
884
+ # mixer_kwargs=None,
885
+ # ):
886
+ # """
887
+ # This forward pass includes concurrency logic in the decode branch.
888
+ # If you're capturing with a CUDA graph, the concurrency (two-stream usage) must be
889
+ # inside the captured region so that the replay reproduces the parallel streams.
890
+ # """
891
+
892
+
893
+ # if mixer_kwargs is None:
894
+ # mixer_kwargs = {"inference_params": None}
895
+ # else:
896
+ # # Ensure 'inference_params' key exists
897
+ # if "inference_params" not in mixer_kwargs:
898
+ # mixer_kwargs["inference_params"] = None
899
+
900
+ # if self.prenorm:
901
+ # # --- 1) Prenorm’s dropout/add/layernorm
902
+ # if not self.fused_dropout_add_ln:
903
+ # dropped = self.drop_path1(self.dropout1(hidden_states))
904
+ # residual = (dropped + residual) if residual is not None else dropped
905
+ # hidden_states = self.norm1(residual.to(dtype=self.norm1.weight.dtype))
906
+ # if self.residual_in_fp32:
907
+ # residual = residual.to(torch.float32)
908
+ # else:
909
+ # # fused dropout + add + layernorm
910
+ # if self.drop_path1.p == 0 or not self.training:
911
+ # rowscale1 = None
912
+ # else:
913
+ # rowscale1 = self.drop_path1(
914
+ # torch.ones(
915
+ # hidden_states.shape[:-1],
916
+ # device=hidden_states.device,
917
+ # dtype=hidden_states.dtype,
918
+ # )
919
+ # )
920
+ # if residual is not None and hidden_states.shape != residual.shape:
921
+ # hidden_states = hidden_states.view(residual.shape)
922
+ # hidden_states, residual = layer_norm_fn(
923
+ # hidden_states,
924
+ # self.norm1.weight,
925
+ # self.norm1.bias,
926
+ # residual=residual,
927
+ # eps=self.norm1.eps,
928
+ # dropout_p=self.dropout1.p if self.training else 0.0,
929
+ # rowscale=rowscale1,
930
+ # prenorm=True,
931
+ # residual_in_fp32=self.residual_in_fp32,
932
+ # is_rms_norm=isinstance(self.norm1, RMSNorm),
933
+ # )
934
+
935
+ # if mixer_subset is not None:
936
+ # mixer_kwargs["mixer_subset"] = mixer_subset
937
+
938
+ # # Check if we are in the prefill or decode stage
939
+ # prefill_stage = (
940
+ # mixer_kwargs["inference_params"] is None
941
+ # or mixer_kwargs["inference_params"].seqlen_offset == 0
942
+ # )
943
+
944
+ # if prefill_stage:
945
+ # # --- 2) Prefill stage (no concurrency): just do normal forward
946
+ # hidden_states, residual = self.prefill_forward(hidden_states, residual, mixer_kwargs, mixer_subset)
947
+
948
+ # else:
949
+ # # # --- 3) Decode stage:
950
+ # if not self.use_tensor_parallel:
951
+ # hidden_states, residual = self.decode_forward(hidden_states, residual, mixer_subset, mixer_kwargs)
952
+ # else:
953
+ # # routing is slightly different in tensor parallel; we overlap the router with allreduce
954
+ # hidden_states, residual = self.tp_decode_forward(hidden_states, residual, mixer_subset)
955
+
956
+ # return hidden_states, residual
957
+
958
+ # else:
959
+ # # post-norm architecture not implemented here
960
+ # raise NotImplementedError
HybridTensor/modules/SelectiveMHA.py ADDED
@@ -0,0 +1,1579 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from functools import partial
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ from einops import rearrange, repeat
7
+
8
+ from flash_attn.utils.distributed import get_dim_for_local_rank
9
+ from flash_attn.utils.distributed import all_reduce
10
+
11
+ try:
12
+ from flash_attn import (
13
+ flash_attn_kvpacked_func,
14
+ flash_attn_qkvpacked_func,
15
+ flash_attn_varlen_kvpacked_func,
16
+ flash_attn_varlen_qkvpacked_func,
17
+ flash_attn_with_kvcache,
18
+ )
19
+ except ImportError:
20
+ flash_attn_varlen_qkvpacked_func, flash_attn_varlen_kvpacked_func = None, None
21
+ flash_attn_qkvpacked_func, flash_attn_kvpacked_func = None, None
22
+ flash_attn_with_kvcache = None
23
+
24
+ try:
25
+ from flash_attn.ops.fused_dense import ColumnParallelLinear, FusedDense, RowParallelLinear, fused_dense_func
26
+ except ImportError:
27
+ FusedDense, ColumnParallelLinear, RowParallelLinear = None, None, None
28
+
29
+ try:
30
+ from flash_attn.layers.rotary import RotaryEmbedding
31
+ except ImportError:
32
+ RotaryEmbedding = None
33
+
34
+ from flash_attn.modules.mha import SelfAttention, FlashSelfAttention, LinearResidual, FlashCrossAttention, CrossAttention
35
+ from flash_attn.modules.mha import get_alibi_slopes #, _update_kv_cache
36
+ from flash_attn.utils.generation import InferenceParams
37
+
38
+ # from HybridTensor.modules.references.mha_dejavu import ParallelTracker # use this in the full implementation
39
+ # from HybridTensor.modules.references.mha_dejavu import ParallelMHASparseAttMlp
40
+ # from HybridTensor.triton.references.attention_proj_sparse import qkv_proj_sparse
41
+ # from HybridTensor.triton.select_attn import select_attn
42
+ # from HybridTensor.triton.select_attn_64b_kernel import select_attn
43
+ from HybridTensor.triton.attn_interface import flash_attn_with_kvcache_triton
44
+ from HybridTensor.triton.select_attn_v1 import select_attn
45
+ from HybridTensor.utils.utils import arg_parser, generate_BH_index, generate_random_BH_index
46
+ from HybridTensor.utils.profiling import cuda_profiler
47
+
48
+
49
class MHARouter(torch.nn.Module):
    """Predicts per-head importance scores and picks the top-k attention heads.

    A single linear layer maps the token embedding to one score per head;
    `_select_heads` keeps the highest-scoring heads.
    """

    def __init__(self, embed_dim, low_rank_dim = None, out_dim = None, top_k = 0.5, device = None, dtype = None):
        super().__init__()
        factory_kwargs = {"device": device, "dtype": dtype}
        self.model_dim = embed_dim       # input embedding width
        self.num_heads = out_dim         # one score per attention head
        self.topk = top_k                # default fraction of heads to keep
        # NOTE(review): low_rank_dim is accepted but unused here — presumably for a
        # future low-rank router variant; confirm before removing.
        self.linear1 = torch.nn.Linear(embed_dim, out_dim, bias = True, **factory_kwargs)

    def forward(self, x):
        """Return raw head scores of shape (batch, num_heads)."""
        return self.linear1(x)

    def _select_heads(self, x, topk = None):
        """Return indices of the top-k heads per row; `topk` is a fraction of num_heads."""
        fraction = self.topk if topk is None else topk
        k = int(fraction * self.num_heads)
        scores = self.forward(x)
        return torch.topk(scores, k, dim=1).indices
72
+
73
class ParallelMHARouter(torch.nn.Module):
    """Tensor-parallel variant of MHARouter.

    The scoring layer is a ColumnParallelLinear, so each rank only produces
    scores for its local shard of heads; `_select_heads` therefore selects a
    top-k among `local_heads`, not among all `num_heads`.
    """

    def __init__(self, embed_dim, low_rank_dim, out_dim, top_k, process_group, sequence_parallel=False, device = None, dtype = None):
        super().__init__()
        factory_kwargs = {"device": device, "dtype": dtype}
        self.model_dim = embed_dim
        self.num_heads = out_dim       # global head count across all ranks
        self.topk = top_k              # default fraction of local heads to keep
        world_size = torch.distributed.get_world_size(process_group)
        # Heads owned by this rank (column-parallel split of the score layer).
        self.local_heads = out_dim // world_size

        self.linear1 = ColumnParallelLinear(
            embed_dim,
            out_dim,
            process_group,
            bias=True,
            sequence_parallel=sequence_parallel,
            **factory_kwargs,
        )

    def forward(self, x):
        """Return this rank's head scores of shape (batch, local_heads)."""
        return self.linear1(x)

    def _select_heads(self, x, topk = None):
        """Return indices of the top-k local heads per row; `topk` is a fraction."""
        fraction = self.topk if topk is None else topk
        k = int(fraction * self.local_heads)
        scores = self.forward(x)
        return torch.topk(scores, k, dim=1).indices
107
+
108
+ def _update_kv_cache(kv, inference_params, layer_idx):
109
+ """kv: (batch_size, seqlen, 2, nheads, head_dim) or (batch_size, 1, 2, nheads, head_dim)"""
110
+ # Pre-allocate memory for key-values for inference.
111
+ num_heads, head_dim = kv.shape[-2:]
112
+ if layer_idx not in inference_params.key_value_memory_dict:
113
+ kv_cache = torch.empty(
114
+ inference_params.max_batch_size,
115
+ inference_params.max_seqlen,
116
+ 2,
117
+ num_heads,
118
+ head_dim,
119
+ dtype=kv.dtype,
120
+ device=kv.device,
121
+ )
122
+ inference_params.key_value_memory_dict[layer_idx] = kv_cache
123
+ else:
124
+ kv_cache = inference_params.key_value_memory_dict[layer_idx]
125
+ # Adjust key and value for inference
126
+ batch_start = inference_params.batch_size_offset
127
+ batch_end = batch_start + kv.shape[0]
128
+ sequence_start = inference_params.seqlen_offset
129
+ sequence_end = sequence_start + kv.shape[1]
130
+ assert batch_end <= kv_cache.shape[0]
131
+ assert sequence_end <= kv_cache.shape[1]
132
+ assert kv_cache is not None
133
+ kv_cache[batch_start:batch_end, sequence_start:sequence_end, ...] = kv
134
+ return kv_cache[batch_start:batch_end, :sequence_end, ...]
135
+
136
+ class SMHA(nn.Module):
137
+ """Multi-head self-attention and cross-attention with Triton decode kernels + Selective Attention"""
138
+
139
    def __init__(
        self,
        embed_dim,
        num_heads,
        num_heads_kv=None,
        cross_attn=False,
        qkv_proj_bias=True,
        out_proj_bias=True,
        dropout=0.0,
        softmax_scale=None,
        causal=False,
        layer_idx=None,
        dwconv=False,
        rotary_emb_dim=0,
        rotary_emb_base=10000.0,
        rotary_emb_scale_base=None,
        rotary_emb_interleaved=False,
        use_alibi=False,
        window_size=(-1, -1),
        fused_bias_fc=False,
        use_flash_attn=False,
        return_residual=False,
        checkpointing=False,
        use_triton=True,
        device=None,
        dtype=None,
    ) -> None:
        """
        Build the selective multi-head attention module (projections, inner
        attention implementations, and optional rotary/ALiBi/dwconv extras).

        num_heads_kv: can be used to toggle MQA / GQA. If None, use num_heads.
        return_residual: whether to return the input x along with the output. This is for
            performance reason: for post-norm architecture, returning the input allows us
            to fuse the backward of nn.Linear with the residual connection.
        layer_idx: index used to key this layer's slot in the inference KV cache;
            required for generation (see _update_kv_cache).
        use_triton: flag stored on the module; presumably toggles the Triton
            decode kernels elsewhere in the class — TODO confirm.

        Raises:
            ImportError: if fused_bias_fc is requested but fused_dense is not installed.
            AssertionError: on invalid head/dim combinations or unsupported
                feature combinations (ALiBi/window without flash_attn, rotary
                with cross-attention).
        """
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.embed_dim = embed_dim
        self.cross_attn = cross_attn
        self.causal = causal
        self.layer_idx = layer_idx
        self.dwconv = dwconv
        self.rotary_emb_dim = rotary_emb_dim
        self.use_flash_attn = use_flash_attn
        self.return_residual = return_residual
        self.checkpointing = checkpointing
        self.use_triton = use_triton
        # ALiBi slopes are only supported by the flash-attn code path.
        if use_alibi:
            assert use_flash_attn, "ALiBi code path requires flash_attn"
            alibi_slopes = torch.tensor(get_alibi_slopes(num_heads), device=device)
        else:
            alibi_slopes = None
        if window_size != (-1, -1):
            assert use_flash_attn, "Local (sliding window) attention code path requires flash_attn"

        self.num_heads = num_heads
        self.num_heads_kv = num_heads_kv if num_heads_kv is not None else num_heads
        assert (
            self.num_heads % self.num_heads_kv == 0
        ), "num_heads must be divisible by num_heads_kv"
        assert self.embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
        self.head_dim = self.embed_dim // num_heads
        # Packed projection sizes: Q uses num_heads, K and V use num_heads_kv (MQA/GQA).
        qkv_dim = self.head_dim * (self.num_heads + 2 * self.num_heads_kv)
        kv_dim = 2 * self.head_dim * self.num_heads_kv

        if self.rotary_emb_dim > 0:
            assert not cross_attn, "MHA with rotary embedding does not support cross-attention yet"
            assert RotaryEmbedding is not None, "rotary_emb is not installed"
            self.rotary_emb = RotaryEmbedding(
                self.rotary_emb_dim,
                base=rotary_emb_base,
                scale_base=rotary_emb_scale_base,
                interleaved=rotary_emb_interleaved,
                device=device,
            )

        if fused_bias_fc and FusedDense is None:
            raise ImportError("fused_dense is not installed")
        # Pick linear / attention implementations based on the fused/flash flags.
        linear_cls = nn.Linear if not fused_bias_fc else FusedDense
        linear_resid_cls = (
            LinearResidual if not fused_bias_fc else partial(FusedDense, return_residual=True)
        )
        wqkv_cls = linear_cls if not self.return_residual else linear_resid_cls
        inner_attn_cls = (
            partial(FlashSelfAttention, alibi_slopes=alibi_slopes, window_size=window_size)
            if use_flash_attn
            else SelfAttention
        )
        inner_cross_attn_cls = (
            partial(FlashCrossAttention, alibi_slopes=alibi_slopes, window_size=window_size)
            if use_flash_attn
            else CrossAttention
        )
        # Self-attention packs Q, K, V into one projection; cross-attention
        # splits Q from the packed KV projection.
        if not self.cross_attn:
            self.Wqkv = wqkv_cls(embed_dim, qkv_dim, bias=qkv_proj_bias, **factory_kwargs)
        else:
            self.Wq = linear_cls(embed_dim, embed_dim, bias=qkv_proj_bias, **factory_kwargs)
            self.Wkv = wqkv_cls(embed_dim, kv_dim, bias=qkv_proj_bias, **factory_kwargs)
        # Optional depthwise conv over the (packed) projections.
        if self.dwconv:
            if self.num_heads_kv == self.num_heads:
                self.dwconv_qkv = nn.Conv1d(
                    qkv_dim, qkv_dim, kernel_size=3, padding=2, groups=qkv_dim
                )
            else:
                self.dwconv_q = nn.Conv1d(
                    embed_dim, embed_dim, kernel_size=3, padding=2, groups=embed_dim
                )
                self.dwconv_kv = nn.Conv1d(kv_dim, kv_dim, kernel_size=3, padding=2, groups=kv_dim)
        self.inner_attn = inner_attn_cls(
            causal=causal,
            softmax_scale=softmax_scale,
            attention_dropout=dropout,
        )
        self.inner_cross_attn = inner_cross_attn_cls(
            causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout
        )
        self.out_proj = linear_cls(embed_dim, embed_dim, bias=out_proj_bias, **factory_kwargs)
254
+
255
+ def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None):
256
+ dtype = self.out_proj.weight.dtype if dtype is None else dtype
257
+ device = self.out_proj.weight.device
258
+ return torch.empty(
259
+ batch_size,
260
+ max_seqlen,
261
+ 2,
262
+ self.num_heads_kv,
263
+ self.head_dim,
264
+ dtype=dtype,
265
+ device=device,
266
+ )
267
+
268
+ def _update_kv_cache(self, kv, inference_params):
269
+ """kv: (batch_size, seqlen, 2, nheads, head_dim) or (batch_size, 1, 2, nheads, head_dim)"""
270
+ assert not self.dwconv, "Generation does not support dwconv yet"
271
+ assert self.layer_idx is not None, "Generation requires layer_idx in the constructor"
272
+ return _update_kv_cache(kv, inference_params, self.layer_idx)
273
+
274
    def _apply_rotary_update_kvcache_attention(self, q, kv, inference_params):
        """
        Fast path that combines 3 steps: apply rotary to Q and K, update the kv cache,
        and apply attention — all inside a single flash_attn_with_kvcache call.
        q: (batch_size, seqlen_q, nheads, head_dim)
        kv: (batch_size, seqlen_k, 2, nheads_kv, head_dim)

        Only valid during decoding (seqlen_offset > 0) on the flash-attn path.
        """
        assert inference_params is not None and inference_params.seqlen_offset > 0
        assert self.use_flash_attn
        if self.rotary_emb_dim > 0:
            # xPos scaling is not handled by the fused kernel's rotary support.
            assert self.rotary_emb.scale is None, "This code path does not support xPos"
            # Make sure cos/sin tables cover the full sequence length before the kernel runs.
            self.rotary_emb._update_cos_sin_cache(
                inference_params.max_seqlen, device=q.device, dtype=q.dtype
            )
            rotary_cos, rotary_sin = self.rotary_emb._cos_cached, self.rotary_emb._sin_cached
        else:
            rotary_cos, rotary_sin = None, None
        batch = q.shape[0]
        # Pre-allocated cache slot for this layer, trimmed to the live batch.
        kv_cache = inference_params.key_value_memory_dict[self.layer_idx][:batch]
        # Per-sample lengths if available, otherwise a single shared offset.
        cache_seqlens = (
            inference_params.lengths_per_sample[:batch]
            if inference_params.lengths_per_sample is not None
            else inference_params.seqlen_offset
        )
        alibi_slopes = getattr(self.inner_cross_attn, "alibi_slopes", None)

        # The kernel rotates q/kv, appends kv to kv_cache in place, and attends.
        context = flash_attn_with_kvcache(
            q,
            kv_cache[:, :, 0],
            kv_cache[:, :, 1],
            kv[:, :, 0],
            kv[:, :, 1],
            rotary_cos=rotary_cos,
            rotary_sin=rotary_sin,
            cache_seqlens=cache_seqlens,
            softmax_scale=self.inner_cross_attn.softmax_scale,
            causal=self.inner_cross_attn.causal,
            rotary_interleaved=self.rotary_emb.interleaved if self.rotary_emb_dim > 0 else False,
            alibi_slopes=alibi_slopes,
        )
        return context
314
+
315
+
316
def _update_kvcache_attention_triton(self, q, kv, inference_params, batch_head_idx=None):
    """
    The rotary embeddings have to be applied before calling this function. The KV cache is update here.
    q: (batch_size, seqlen_q, nheads, head_dim)
    kv: (batch_size, seqlen_k, 2, nheads_kv, head_dim)
    batch_head_idx: optional per-batch head-selection indices forwarded to the
        triton kernel (Selective Head/Group Attention); None attends all heads.
    """
    if (
        inference_params.seqlen_offset == 0
        or flash_attn_with_kvcache is None
        or not self.use_flash_attn
    ):
        # Prefill (or flash-attn unavailable): update the cache in Python and
        # run regular attention over the full kv.
        # TODO: this only uses seqlen_offset and not lengths_per_sample.
        kv = self._update_kv_cache(kv, inference_params)
        return self.inner_cross_attn(q, kv)
    else:
        batch = q.shape[0]
        # Unlike the CUDA fast path, the cache is updated here in Python; the
        # triton kernel therefore receives k_new/v_new as None below.
        kv_cache = self._update_kv_cache(kv, inference_params)

        cache_seqlens = (
            inference_params.lengths_per_sample[:batch]
            if inference_params.lengths_per_sample is not None
            else inference_params.seqlen_offset
        )
        alibi_slopes = getattr(self.inner_cross_attn, "alibi_slopes", None)

        context = flash_attn_with_kvcache_triton(
            q,
            kv_cache[:, :, 0],
            kv_cache[:, :, 1],
            None,  # kv[:, :, 0] — already written into the cache above
            None,  # kv[:, :, 1]
            rotary_cos=None,  # rotary must be applied by the caller
            rotary_sin=None,
            cache_seqlens=cache_seqlens,
            softmax_scale=self.inner_cross_attn.softmax_scale,
            causal=self.inner_cross_attn.causal,
            rotary_interleaved=False,  # self.rotary_emb.interleaved if self.rotary_emb_dim > 0 else False,
            alibi_slopes=alibi_slopes,
            batch_head_idx=batch_head_idx,
        )
        return context
357
+
358
def _update_kvcache_attention(self, q, kv, inference_params):
    """Write kv to inference_params, then do attention"""
    if (
        inference_params.seqlen_offset == 0
        or flash_attn_with_kvcache is None
        or not self.use_flash_attn
    ):
        # Prefill, or the fused kernel is unavailable: append kv in Python and
        # attend over the full cached sequence.
        # TODO: this only uses seqlen_offset and not lengths_per_sample.
        kv = self._update_kv_cache(kv, inference_params)
        return self.inner_cross_attn(q, kv)
    else:
        # Decode fast path: flash_attn_with_kvcache appends kv at
        # cache_seqlens inside the kernel and attends in a single call.
        batch = q.shape[0]
        kv_cache = inference_params.key_value_memory_dict[self.layer_idx][:batch]
        cache_seqlens = (
            inference_params.lengths_per_sample[:batch]
            if inference_params.lengths_per_sample is not None
            else inference_params.seqlen_offset
        )
        alibi_slopes = getattr(self.inner_cross_attn, "alibi_slopes", None)
        return flash_attn_with_kvcache(
            q,
            kv_cache[:, :, 0],
            kv_cache[:, :, 1],
            kv[:, :, 0],
            kv[:, :, 1],
            cache_seqlens=cache_seqlens,
            softmax_scale=self.inner_cross_attn.softmax_scale,
            causal=self.inner_cross_attn.causal,
            alibi_slopes=alibi_slopes,
        )
388
+
389
def forward(
    self,
    x,
    x_kv=None,
    key_padding_mask=None,
    cu_seqlens=None,
    max_seqlen=None,
    mixer_subset=None,
    inference_params=None,
    batch_head_idx=None,
    use_triton=True,
    **kwargs,
):
    """
    Arguments:
        x: (batch, seqlen, hidden_dim) (where hidden_dim = num heads * head dim) if
            cu_seqlens is None and max_seqlen is None, else (total, hidden_dim) where total
            is the is the sum of the sequence lengths in the batch.
        x_kv: (batch, seqlen, hidden_dim), only applicable for cross-attention. If None, use x.
        cu_seqlens: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
            of the sequences in the batch, used to index into x. Only applicable when using
            FlashAttention.
        max_seqlen: int. Maximum sequence length in the batch.
        key_padding_mask: boolean mask, True means to keep, False means to mask out.
            (batch, seqlen). Only applicable when not using FlashAttention.
        mixer_subset: for cross-attention only. If not None, will take a subset of x
            before applying the query projection. Useful for e.g., ViT where we only care
            about the CLS token in the last layer.
        inference_params: for generation. Adapted from Megatron-LM (and Apex)
        batch_head_idx: (batch, num_heads). The index of the heads to be selected. Only applicable for Selective Head/Group Attention.
        use_triton: whether to use triton kernels for attention in decode. If False, use the original flash attention implementation.
        https://github.com/NVIDIA/apex/blob/3ff1a10f72ec07067c4e44759442329804ac5162/apex/transformer/testing/standalone_transformer_lm.py#L470
    """
    # Mutually-exclusive input formats: varlen (cu_seqlens), padded
    # (key_padding_mask), and generation (inference_params).
    if cu_seqlens is not None:
        assert max_seqlen is not None
        assert key_padding_mask is None
        assert self.use_flash_attn
        assert not self.dwconv
        assert self.rotary_emb_dim == 0
    if key_padding_mask is not None:
        assert cu_seqlens is None
        assert max_seqlen is None
        assert not self.use_flash_attn
    if inference_params is not None:
        assert key_padding_mask is None
        assert cu_seqlens is None and max_seqlen is None
        assert not self.dwconv
        # use_triton = self.use_triton if use_triton is None else use_triton
    # Route the mask/varlen arguments the inner attention implementation expects.
    kwargs = (
        {"cu_seqlens": cu_seqlens, "max_seqlen": max_seqlen, **kwargs}
        if self.use_flash_attn
        else {"key_padding_mask": key_padding_mask, **kwargs}
    )
    # seqlen_offset may be a per-sample tensor (lengths_per_sample) or a scalar.
    seqlen_offset = (
        0
        if inference_params is None
        else (
            inference_params.lengths_per_sample
            if inference_params.lengths_per_sample is not None
            else inference_params.seqlen_offset
        )
    )
    rotary_max_seqlen = inference_params.max_seqlen if inference_params is not None else None
    batch, seqlen = x.shape[:2]
    if not self.cross_attn and self.num_heads_kv == self.num_heads:
        # Standard self-attention (no MQA/GQA): single packed qkv projection.
        assert x_kv is None and mixer_subset is None
        if not self.return_residual:
            qkv = self.Wqkv(x)
        else:
            qkv, x = self.Wqkv(x)
        if self.dwconv:
            qkv = rearrange(
                self.dwconv_qkv(rearrange(qkv, "b s d -> b d s"))[..., :-2], "b d s -> b s d"
            ).contiguous()
        qkv = rearrange(qkv, "... (three h d) -> ... three h d", three=3, d=self.head_dim)
        if (
            inference_params is None
            or inference_params.seqlen_offset == 0
            or (self.rotary_emb_dim == 0 or self.rotary_emb_dim % 16 != 0)
            or not self.use_flash_attn
        ):
            # prefill stage (also taken when the fused rotary kernel cannot be
            # used: rotary dim 0 or not a multiple of 16, or flash disabled)
            if self.rotary_emb_dim > 0:
                qkv = self.rotary_emb(
                    qkv, seqlen_offset=seqlen_offset, max_seqlen=rotary_max_seqlen
                )
            if inference_params is None:
                if not self.checkpointing:
                    context = self.inner_attn(qkv, **kwargs)
                else:
                    context = torch.utils.checkpoint.checkpoint(self.inner_attn, qkv, **kwargs)
            else:
                if use_triton:
                    # print("Using the (prefill) triton flash attention implementation")
                    context = self._update_kvcache_attention_triton(
                        qkv[:, :, 0], qkv[:, :, 1:], inference_params, batch_head_idx
                    )
                else:
                    # print("Using the (prefill) original flash attention implementation")
                    context = self._update_kvcache_attention(
                        qkv[:, :, 0], qkv[:, :, 1:], inference_params
                    )
        else:
            # decode stage
            # print("Using triton kernels for attention")
            if use_triton:
                # Triton kernel expects rotary to be applied beforehand.
                if self.rotary_emb_dim > 0:
                    qkv = self.rotary_emb(
                        qkv, seqlen_offset=seqlen_offset, max_seqlen=rotary_max_seqlen
                    )
                context = self._update_kvcache_attention_triton(
                    qkv[:, :, 0], qkv[:, :, 1:], inference_params, batch_head_idx
                )
            else:
                # Fused CUDA path applies rotary inside the kernel.
                # print("Using the original flash attention implementation")
                context = self._apply_rotary_update_kvcache_attention(
                    qkv[:, :, 0], qkv[:, :, 1:], inference_params
                )

    else:  # cross-attention or MQA/GQA
        if self.cross_attn:
            if not self.return_residual:
                q = self.Wq(x if mixer_subset is None else x[:, mixer_subset])
                kv = self.Wkv(x_kv if x_kv is not None else x)
            else:
                if x_kv is not None:
                    kv, x_kv = self.Wkv(x_kv)
                else:
                    kv, x = self.Wkv(x)
                q = self.Wq(x if mixer_subset is None else x[:, mixer_subset])
        else:
            # MQA/GQA: q and kv share one projection, split by head count.
            assert self.num_heads_kv != self.num_heads
            if not self.return_residual:
                qkv = self.Wqkv(x)
            else:
                qkv, x = self.Wqkv(x)
            q = qkv[..., : self.num_heads * self.head_dim]
            kv = qkv[..., self.num_heads * self.head_dim :]
        q = rearrange(q, "... (h d) -> ... h d", d=self.head_dim)
        kv = rearrange(kv, "... (two hkv d) -> ... two hkv d", two=2, d=self.head_dim)
        if self.dwconv:
            q = rearrange(
                self.dwconv_q(rearrange(q, "b s d -> b d s"))[..., :-2], "b d s -> b s d"
            ).contiguous()
            kv = rearrange(
                self.dwconv_kv(rearrange(kv, "b s d -> b d s"))[..., :-2], "b d s -> b s d"
            ).contiguous()
        if (
            inference_params is None
            or inference_params.seqlen_offset == 0
            or (self.rotary_emb_dim == 0 or self.rotary_emb_dim % 16 != 0)
            or not self.use_flash_attn
        ):
            # prefill
            if self.rotary_emb_dim > 0:
                q, kv = self.rotary_emb(
                    q, kv, seqlen_offset=seqlen_offset, max_seqlen=rotary_max_seqlen
                )
            if inference_params is None:
                if not self.checkpointing:
                    context = self.inner_cross_attn(q, kv, **kwargs)
                else:
                    context = torch.utils.checkpoint.checkpoint(
                        self.inner_cross_attn, q, kv, **kwargs
                    )
            else:
                if use_triton:
                    context = self._update_kvcache_attention_triton(
                        q, kv, inference_params, batch_head_idx
                    )
                else:
                    context = self._update_kvcache_attention(q, kv, inference_params)
        else:
            # decode
            # print("Using triton kernels for attention")
            if use_triton:
                if self.rotary_emb_dim > 0:
                    q, kv = self.rotary_emb(
                        q, kv, seqlen_offset=seqlen_offset, max_seqlen=rotary_max_seqlen
                    )
                context = self._update_kvcache_attention_triton(
                    q, kv, inference_params, batch_head_idx
                )
            else:
                # print("Using the original gqa flash attention implementation")
                context = self._apply_rotary_update_kvcache_attention(q, kv, inference_params)
    # Merge heads back into the hidden dimension and project out.
    out = self.out_proj(rearrange(context, "... h d -> ... (h d)"))
    return out if not self.return_residual else (out, x)
579
+
580
class ParallelSMHA(nn.Module):
    """Multi-head self-attention and cross-attention, tensor-parallel across a
    process group: Q/K/V heads are sharded per rank via ColumnParallelLinear and
    the output is reduced via RowParallelLinear."""

    def __init__(
        self,
        embed_dim,
        num_heads,
        process_group,
        num_heads_kv=None,
        qkv_proj_bias=True,
        out_proj_bias=True,
        dropout=0.0,
        softmax_scale=None,
        causal=False,
        layer_idx=None,
        rotary_emb_dim=0,
        rotary_emb_base=10000.0,
        rotary_emb_scale_base=None,
        rotary_emb_interleaved=False,
        use_alibi=False,
        window_size=(-1, -1),
        use_flash_attn=False,
        checkpointing=False,
        sequence_parallel=True,
        device=None,
        dtype=None,
    ) -> None:
        # num_heads_kv toggles MQA/GQA; layer_idx keys the KV cache during generation.
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.embed_dim = embed_dim
        self.causal = causal
        self.layer_idx = layer_idx
        self.rotary_emb_dim = rotary_emb_dim
        self.use_flash_attn = use_flash_attn
        self.checkpointing = checkpointing
        self.process_group = process_group
        self.world_size = process_group.size()
        self.local_rank = torch.distributed.get_rank(process_group)

        self.num_heads = num_heads
        assert self.embed_dim % self.num_heads == 0, "embed_dim must be divisible by num_heads"

        self.num_heads_kv = num_heads_kv if num_heads_kv is not None else num_heads
        assert (
            self.num_heads % self.num_heads_kv == 0
        ), "num_heads must be divisible by num_heads_kv"

        # Head counts actually owned by this rank (handles uneven splits).
        self.num_heads_per_rank = get_dim_for_local_rank(
            self.num_heads, self.world_size, self.local_rank
        )
        self.num_heads_kv_per_rank = get_dim_for_local_rank(
            self.num_heads_kv, self.world_size, self.local_rank
        )
        self.head_dim = self.embed_dim // num_heads
        # Global (unsharded) width of the packed q + k + v projection.
        qkv_dim = self.head_dim * (self.num_heads + 2 * self.num_heads_kv)

        if use_alibi:
            assert use_flash_attn, "ALiBi code path requires flash_attn"
            # Each rank takes its contiguous slice of the global slope table.
            num_heads_local = math.ceil(self.num_heads / self.world_size)
            alibi_slopes = torch.tensor(
                get_alibi_slopes(num_heads)[
                    self.local_rank * num_heads_local : (self.local_rank + 1) * num_heads_local
                ],
                device=device,
            )
        else:
            alibi_slopes = None
        if window_size != (-1, -1):
            assert use_flash_attn, "Local (sliding window) attention code path requires flash_attn"

        if self.rotary_emb_dim > 0:
            assert RotaryEmbedding is not None, "rotary_emb is not installed"
            self.rotary_emb = RotaryEmbedding(
                self.rotary_emb_dim,
                base=rotary_emb_base,
                scale_base=rotary_emb_scale_base,
                interleaved=rotary_emb_interleaved,
                device=device,
            )

        if ColumnParallelLinear is None or RowParallelLinear is None:
            raise ImportError("fused_dense is not installed")
        self.Wqkv = ColumnParallelLinear(
            embed_dim,
            qkv_dim,
            process_group,
            bias=qkv_proj_bias,
            sequence_parallel=sequence_parallel,
            # Keep whole (q-group + k + v) units together on one rank.
            multiple_of=self.head_dim * (self.num_heads // self.num_heads_kv + 2),
            **factory_kwargs,
        )
        inner_attn_cls = (
            partial(FlashSelfAttention, alibi_slopes=alibi_slopes, window_size=window_size)
            if use_flash_attn
            else SelfAttention
        )
        inner_cross_attn_cls = (
            partial(FlashCrossAttention, alibi_slopes=alibi_slopes, window_size=window_size)
            if use_flash_attn
            else CrossAttention
        )
        self.inner_attn = inner_attn_cls(
            causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout
        )
        self.inner_cross_attn = inner_cross_attn_cls(
            causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout
        )
        self.out_proj = RowParallelLinear(
            embed_dim,
            embed_dim,
            process_group,
            bias=out_proj_bias,
            sequence_parallel=sequence_parallel,
            multiple_of=self.head_dim,
            **factory_kwargs,
        )

    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None):
        """Allocate this rank's KV-cache buffer:
        (batch, max_seqlen, 2, num_heads_kv_per_rank, head_dim)."""
        dtype = self.out_proj.weight.dtype if dtype is None else dtype
        device = self.out_proj.weight.device
        return torch.empty(
            batch_size,
            max_seqlen,
            2,
            self.num_heads_kv_per_rank,
            self.head_dim,
            dtype=dtype,
            device=device,
        )

    def _update_kv_cache(self, kv, inference_params):
        """kv: (batch_size, seqlen, 2, nheads, head_dim) or (batch_size, 1, 2, nheads, head_dim)"""
        assert self.layer_idx is not None, "Generation requires layer_idx in the constructor"
        # Delegates to the module-level _update_kv_cache helper.
        return _update_kv_cache(kv, inference_params, self.layer_idx)

    def _apply_rotary_update_kvcache_attention(self, q, kv, inference_params):
        """
        Fast path that combine 3 steps: apply rotary to Q and K, update kv cache, and apply attention.
        q: (batch_size, seqlen_q, nheads, head_dim)
        kv: (batch_size, seqlen_k, 2, nheads_kv, head_dim)
        """
        assert inference_params is not None and inference_params.seqlen_offset > 0
        assert self.use_flash_attn
        if self.rotary_emb_dim > 0:
            assert self.rotary_emb.scale is None, "This code path does not support xPos"
            # Extend the cos/sin tables to cover the full generation length.
            self.rotary_emb._update_cos_sin_cache(
                inference_params.max_seqlen, device=q.device, dtype=q.dtype
            )
            rotary_cos, rotary_sin = self.rotary_emb._cos_cached, self.rotary_emb._sin_cached
        else:
            rotary_cos, rotary_sin = None, None
        batch = q.shape[0]
        kv_cache = inference_params.key_value_memory_dict[self.layer_idx][:batch]
        # Per-sample lengths take precedence over the shared scalar offset.
        cache_seqlens = (
            inference_params.lengths_per_sample[:batch]
            if inference_params.lengths_per_sample is not None
            else inference_params.seqlen_offset
        )
        alibi_slopes = getattr(self.inner_cross_attn, "alibi_slopes", None)
        # Fused kernel: rotary + in-kernel cache append + attention.
        context = flash_attn_with_kvcache(
            q,
            kv_cache[:, :, 0],
            kv_cache[:, :, 1],
            kv[:, :, 0],
            kv[:, :, 1],
            rotary_cos=rotary_cos,
            rotary_sin=rotary_sin,
            cache_seqlens=cache_seqlens,
            softmax_scale=self.inner_cross_attn.softmax_scale,
            causal=self.inner_cross_attn.causal,
            rotary_interleaved=self.rotary_emb.interleaved if self.rotary_emb_dim > 0 else False,
            alibi_slopes=alibi_slopes,
        )
        return context

    def _update_kvcache_attention(self, q, kv, inference_params):
        """Write kv to inference_params, then do attention"""
        if inference_params.seqlen_offset == 0 or not self.use_flash_attn:
            # Prefill or non-flash fallback: append kv in Python, attend over all of it.
            # TODO: this only uses seqlen_offset and not lengths_per_sample.
            kv = self._update_kv_cache(kv, inference_params)
            return self.inner_cross_attn(q, kv)
        else:
            # Decode fast path: the kernel appends kv and attends in one call.
            batch = q.shape[0]
            kv_cache = inference_params.key_value_memory_dict[self.layer_idx][:batch]
            cache_seqlens = (
                inference_params.lengths_per_sample[:batch]
                if inference_params.lengths_per_sample is not None
                else inference_params.seqlen_offset
            )
            alibi_slopes = getattr(self.inner_cross_attn, "alibi_slopes", None)
            context = flash_attn_with_kvcache(
                q,
                kv_cache[:, :, 0],
                kv_cache[:, :, 1],
                kv[:, :, 0],
                kv[:, :, 1],
                cache_seqlens=cache_seqlens,
                softmax_scale=self.inner_cross_attn.softmax_scale,
                causal=self.inner_cross_attn.causal,
                alibi_slopes=alibi_slopes,
            )
            return context

    def forward(self, x, seqlen=None, inference_params=None, **kwargs):
        """
        Arguments:
            x: (batch, seqlen, hidden_dim) (where hidden_dim = num heads * head dim) if seqlen=None.
                If seqlen is not None, x is (batch * seqlen, hidden_dim). This is so that when we
                split x during sequence parallel, we split the batch * seqlen dimension
                (in case batch is small).
        """
        qkv = self.Wqkv(x)
        if seqlen is not None:
            # Restore the (batch, seqlen, ...) layout flattened for sequence parallel.
            qkv = rearrange(qkv, "(b s) ... -> b s ...", s=seqlen)
        # seqlen_offset may be a per-sample tensor (lengths_per_sample) or a scalar.
        seqlen_offset = (
            0
            if inference_params is None
            else (
                inference_params.lengths_per_sample
                if inference_params.lengths_per_sample is not None
                else inference_params.seqlen_offset
            )
        )
        rotary_max_seqlen = inference_params.max_seqlen if inference_params is not None else None
        if self.num_heads_kv == self.num_heads:
            # Standard MHA: packed (q, k, v) per head.
            qkv = rearrange(qkv, "b s (three h d) -> b s three h d", three=3, d=self.head_dim)
            if (
                inference_params is None
                or inference_params.seqlen_offset == 0
                or (self.rotary_emb_dim == 0 or self.rotary_emb_dim % 16 != 0)
                or not self.use_flash_attn
            ):
                # Prefill, or the fused rotary-kvcache kernel is not usable.
                if self.rotary_emb_dim > 0:
                    qkv = self.rotary_emb(
                        qkv, seqlen_offset=seqlen_offset, max_seqlen=rotary_max_seqlen
                    )
                if inference_params is None:
                    if not self.checkpointing:
                        context = self.inner_attn(qkv, **kwargs)
                    else:
                        context = torch.utils.checkpoint.checkpoint(self.inner_attn, qkv, **kwargs)
                else:
                    context = self._update_kvcache_attention(
                        qkv[:, :, 0], qkv[:, :, 1:], inference_params
                    )
            else:
                # Decode fast path with fused rotary + kvcache attention.
                context = self._apply_rotary_update_kvcache_attention(
                    qkv[:, :, 0], qkv[:, :, 1:], inference_params
                )
        else:  # GQA/MQA
            # Split this rank's projection into q heads and kv heads.
            q = rearrange(
                qkv[..., : self.num_heads_per_rank * self.head_dim],
                "... (h d) -> ... h d",
                d=self.head_dim,
            )
            kv = rearrange(
                qkv[..., self.num_heads_per_rank * self.head_dim :],
                "... (two hkv d) -> ... two hkv d",
                two=2,
                d=self.head_dim,
            )
            if (
                inference_params is None
                or inference_params.seqlen_offset == 0
                or (self.rotary_emb_dim == 0 or self.rotary_emb_dim % 16 != 0)
                or not self.use_flash_attn
            ):
                if self.rotary_emb_dim > 0:
                    q, kv = self.rotary_emb(
                        q, kv, seqlen_offset=seqlen_offset, max_seqlen=rotary_max_seqlen
                    )
                if inference_params is None:
                    if not self.checkpointing:
                        context = self.inner_cross_attn(q, kv, **kwargs)
                    else:
                        context = torch.utils.checkpoint.checkpoint(
                            self.inner_cross_attn, q, kv, **kwargs
                        )
                else:
                    context = self._update_kvcache_attention(q, kv, inference_params)
            else:
                context = self._apply_rotary_update_kvcache_attention(q, kv, inference_params)
        context = rearrange(context, "b s h d -> b s (h d)")
        if seqlen is not None:
            # Flatten back for the sequence-parallel row projection.
            context = rearrange(context, "b s d -> (b s) d")
        out = self.out_proj(context)
        return out
867
+
868
class SelectMHA(nn.Module):
    """Multi-head, Group-query self-attention using select attention.

    During decode, a ``batch_head_idx`` tensor can restrict attention to a
    subset of heads per batch element (Selective Head Attention) via the
    ``select_attn`` kernel; prefill uses regular (flash) self-attention.
    """

    def __init__(
        self,
        embed_dim,
        num_heads,
        num_heads_kv=None,
        cross_attn=False,
        qkv_proj_bias=True,
        out_proj_bias=True,
        dropout=0.0,
        softmax_scale=None,
        causal=False,
        layer_idx=None,
        dwconv=False,
        rotary_emb_dim=0,
        rotary_emb_base=10000.0,
        rotary_emb_scale_base=None,
        rotary_emb_interleaved=False,
        use_alibi=False,
        window_size=(-1, -1),
        fused_bias_fc=False,
        use_flash_attn=True,
        return_residual=False,
        checkpointing=False,
        device=None,
        dtype=None,
    ) -> None:
        """
        num_heads_kv: can be used to toggle MQA / GQA. If None, use num_heads.
        return_residual: whether to return the input x along with the output. This is for
            performance reason: for post-norm architecture, returning the input allows us
            to fuse the backward of nn.Linear with the residual connection.
        """
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.embed_dim = embed_dim
        self.cross_attn = cross_attn
        self.causal = causal
        self.layer_idx = layer_idx
        self.dwconv = dwconv
        self.rotary_emb_dim = rotary_emb_dim
        # Flash attention is force-enabled for this class regardless of the
        # use_flash_attn argument (the select-attention decode path requires it).
        self.use_flash_attn = True  # use_flash_attn
        self.return_residual = return_residual
        self.checkpointing = checkpointing
        if use_alibi:
            assert use_flash_attn, "ALiBi code path requires flash_attn"
            alibi_slopes = torch.tensor(get_alibi_slopes(num_heads), device=device)
        else:
            alibi_slopes = None
        if window_size != (-1, -1):
            assert use_flash_attn, "Local (sliding window) attention code path requires flash_attn"

        self.num_heads = num_heads
        self.num_heads_kv = num_heads_kv if num_heads_kv is not None else num_heads
        assert (
            self.num_heads % self.num_heads_kv == 0
        ), "num_heads must be divisible by num_heads_kv"
        assert self.embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
        self.head_dim = self.embed_dim // num_heads
        qkv_dim = self.head_dim * (self.num_heads + 2 * self.num_heads_kv)

        if self.rotary_emb_dim > 0:
            assert not cross_attn, "MHA with rotary embedding does not support cross-attention yet"
            assert RotaryEmbedding is not None, "rotary_emb is not installed"
            self.rotary_emb = RotaryEmbedding(
                self.rotary_emb_dim,
                base=rotary_emb_base,
                scale_base=rotary_emb_scale_base,
                interleaved=rotary_emb_interleaved,
                device=device,
            )

        if fused_bias_fc and FusedDense is None:
            raise ImportError("fused_dense is not installed")
        linear_cls = nn.Linear if not fused_bias_fc else FusedDense
        linear_resid_cls = (
            LinearResidual if not fused_bias_fc else partial(FusedDense, return_residual=True)
        )
        wqkv_cls = linear_cls if not self.return_residual else linear_resid_cls
        inner_attn_cls = (
            partial(FlashSelfAttention, alibi_slopes=alibi_slopes, window_size=window_size)
            if use_flash_attn
            else SelfAttention
        )
        # FIX: _update_kvcache_attention falls back to self.inner_cross_attn when
        # flash_attn_with_kvcache is unavailable, but this attribute was never
        # constructed, raising AttributeError on that path. Construct it exactly
        # as the sibling MHA/ParallelSMHA classes do.
        inner_cross_attn_cls = (
            partial(FlashCrossAttention, alibi_slopes=alibi_slopes, window_size=window_size)
            if use_flash_attn
            else CrossAttention
        )

        self.Wqkv = wqkv_cls(embed_dim, qkv_dim, bias=qkv_proj_bias, **factory_kwargs)

        self.inner_attn = inner_attn_cls(
            causal=causal,
            softmax_scale=softmax_scale,
            attention_dropout=dropout,
        )
        self.inner_cross_attn = inner_cross_attn_cls(
            causal=causal,
            softmax_scale=softmax_scale,
            attention_dropout=dropout,
        )
        # NOTE(review): may be None if the caller did not pass a scale; the
        # select_attn kernel receives it as-is — confirm it handles None.
        self.softmax_scale = softmax_scale
        self.out_proj = linear_cls(embed_dim, embed_dim, bias=out_proj_bias, **factory_kwargs)

    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None):
        """Allocate the KV-cache buffer: (batch, max_seqlen, 2, num_heads_kv, head_dim)."""
        dtype = self.out_proj.weight.dtype if dtype is None else dtype
        device = self.out_proj.weight.device
        return torch.empty(
            batch_size,
            max_seqlen,
            2,
            self.num_heads_kv,
            self.head_dim,
            dtype=dtype,
            device=device,
        )

    def _update_kv_cache(self, kv, inference_params):
        """Update kv cache in inference_params."""
        assert self.layer_idx is not None, "Generation requires layer_idx in the constructor"
        # Delegates to the module-level _update_kv_cache helper.
        return _update_kv_cache(kv, inference_params, self.layer_idx)

    def forward(
        self,
        x,
        x_kv=None,
        key_padding_mask=None,
        cu_seqlens=None,
        max_seqlen=None,
        mixer_subset=None,
        inference_params=None,
        batch_head_idx=None,
        **kwargs,
    ):
        """
        Arguments:
            x: (batch, seqlen, hidden_dim)
            batch_head_idx: Tensor of indices specifying which batch and head indices to select.
                Shape: (batch_size, top_k)
            inference_params: for generation.
        """
        # seqlen_offset may be a per-sample tensor (lengths_per_sample) or a scalar.
        seqlen_offset = (
            0
            if inference_params is None
            else (
                inference_params.lengths_per_sample
                if inference_params.lengths_per_sample is not None
                else inference_params.seqlen_offset
            )
        )
        rotary_max_seqlen = inference_params.max_seqlen if inference_params is not None else None
        batch, seqlen = x.shape[:2]

        if not self.cross_attn and self.num_heads_kv == self.num_heads:
            # Self-attention, no MQA/GQA
            assert x_kv is None and mixer_subset is None
            if not self.return_residual:
                qkv = self.Wqkv(x)
            else:
                qkv, x = self.Wqkv(x)
            qkv = rearrange(qkv, "... (three h d) -> ... three h d", three=3, d=self.head_dim)

            if self.rotary_emb_dim > 0:
                qkv = self.rotary_emb(
                    qkv, seqlen_offset=seqlen_offset, max_seqlen=rotary_max_seqlen
                )

            if inference_params is None or inference_params.seqlen_offset == 0:
                # Training / prefill stage.
                if inference_params is not None:
                    # Update kv cache during prefill
                    kv = self._update_kv_cache(qkv[:, :, 1:], inference_params)

                context = self.inner_attn(qkv, **kwargs)

            else:
                # Generation (decode) stage — both branches update the kv cache.
                if batch_head_idx is None:
                    # Dense attention over all heads.
                    context = self._update_kvcache_attention(
                        q=qkv[:, :, 0], kv=qkv[:, :, 1:], inference_params=inference_params
                    )
                else:
                    # Selective head attention over the chosen heads.
                    context = self._update_kvcache_select_attn(
                        q=qkv[:, :, 0],
                        kv=qkv[:, :, 1:],
                        inference_params=inference_params,
                        batch_head_idx=batch_head_idx,
                    )

        else:
            raise NotImplementedError("SelectMHA currently supports only self-attention without MQA/GQA.")

        out = self.out_proj(rearrange(context, "... h d -> ... (h d)"))
        return out if not self.return_residual else (out, x)

    def _update_kvcache_select_attn(self, q, kv, inference_params, batch_head_idx):
        """
        Apply select attention during generation stage.

        q: (batch_size, seqlen=1, n_heads, head_dim)
        kv: (batch_size, seqlen=1, 2, n_heads, head_dim)
        batch_head_idx: Tensor of indices specifying which batch and head indices to select.
            Shape: (batch_size, top_k)

        # currently only supports batches with same seqlen
        # different seqlen requires a simple update in the select_attn kernel to load the seqlen, future work
        """
        assert batch_head_idx is not None, "batch_head_idx must not be None"

        # Append the current token's kv to the cache (Python-side update; the
        # select_attn kernel reads the cache only).
        kv_cache = self._update_kv_cache(kv, inference_params)
        # inference_params.seqlen_offset += 1  # if seqlen_offset is int

        batch = q.shape[0]

        # make sure seqlen_offset accounts for the current token
        cache_seqlens = (
            inference_params.lengths_per_sample[:batch]
            if inference_params.lengths_per_sample is not None
            else inference_params.seqlen_offset + 1  # +1 for the current token
        )

        # select_attn expects a group axis: (batch_size, seqlen, G=1, n_heads, head_dim).
        q = q.unsqueeze(2)
        k_cache = kv_cache[:, :, 0].unsqueeze(2)
        v_cache = kv_cache[:, :, 1].unsqueeze(2)

        context = select_attn(
            q,
            k_cache,
            v_cache,
            self.softmax_scale,
            batch_head_idx,
            cache_seqlens)

        # context: (batch_size, seqlen_q=1, G=1, H, head_dim) -> drop the G axis.
        batch_size = batch_head_idx.shape[0]
        context = context.view(batch_size, 1, self.num_heads, self.head_dim)

        return context

    def _update_kvcache_attention(self, q, kv, inference_params):
        """Write kv to inference_params, then do attention"""
        if (
            inference_params.seqlen_offset == 0
            or flash_attn_with_kvcache is None
            or not self.use_flash_attn
        ):
            # Prefill or non-flash fallback: append kv in Python and run
            # regular cross-attention (q against the full cached kv).
            # TODO: this only uses seqlen_offset and not lengths_per_sample.
            kv = self._update_kv_cache(kv, inference_params)
            return self.inner_cross_attn(q, kv)
        else:
            # Decode fast path: the kernel appends kv and attends in one call.
            batch = q.shape[0]
            kv_cache = inference_params.key_value_memory_dict[self.layer_idx][:batch]
            cache_seqlens = (
                inference_params.lengths_per_sample[:batch]
                if inference_params.lengths_per_sample is not None
                else inference_params.seqlen_offset
            )
            # alibi_slopes = getattr(self.inner_cross_attn, "alibi_slopes", None)
            alibi_slopes = None
            return flash_attn_with_kvcache(
                q,
                kv_cache[:, :, 0],
                kv_cache[:, :, 1],
                kv[:, :, 0],
                kv[:, :, 1],
                cache_seqlens=cache_seqlens,
                softmax_scale=self.inner_attn.softmax_scale,
                causal=self.inner_attn.causal,
                alibi_slopes=alibi_slopes,
            )

    # dummy function for testing
    def _select_attn(self, q, kv, inference_params, batch_head_idx):
        """
        Apply select attention during generation stage (testing helper: reads
        the cache but does NOT update it).

        q: (batch_size, seqlen=1, n_heads, head_dim)
        kv: (batch_size, seqlen=1, 2, n_heads, head_dim)
        batch_head_idx: Tensor of indices specifying which batch and head indices to select.
            Shape: (N_selected, 2)

        # currently only supports batches with same seqlen
        # different seqlen requires a simple update in the select_attn kernel to load the seqlen, future work
        """
        assert batch_head_idx.shape[1] == 2, "batch_head_idx must have shape (N_selected, 2)"

        batch = q.shape[0]
        kv_cache = inference_params.key_value_memory_dict[self.layer_idx][:batch]

        cache_seqlens = (
            inference_params.lengths_per_sample[:batch]
            if inference_params.lengths_per_sample is not None
            else inference_params.seqlen_offset
        )

        # select_attn expects a group axis: (batch_size, seqlen, G=1, n_heads, head_dim).
        q = q.unsqueeze(2)
        k_cache = kv_cache[:, :, 0].unsqueeze(2)
        v_cache = kv_cache[:, :, 1].unsqueeze(2)

        context = select_attn(
            q,
            k_cache,
            v_cache,
            self.softmax_scale,
            batch_head_idx,
            cache_seqlens)

        # context: (batch_size, seqlen_q=1, G=1, H, head_dim)
        context = context.squeeze(2)  # Remove G dimension

        return context
1184
+
1185
+
1186
+ # SelectiveGQA: Future work
1187
+
1188
class ParallelSelectMHA(nn.Module):
    """Tensor-parallel multi-head attention with selective (per-head) decoding.

    Prefill uses dense flash attention over all heads; during decode only the
    (batch, head) pairs named by ``batch_head_idx`` are attended, via the
    custom ``select_attn`` kernel. Heads are sharded across ranks through
    Column/RowParallelLinear projections.
    """

    def __init__(
        self,
        embed_dim,
        num_heads,
        process_group,
        num_heads_kv=None,
        qkv_proj_bias=True,
        out_proj_bias=True,
        dropout=0.0,
        softmax_scale=None,
        causal=True,
        layer_idx=None,
        dwconv=False,
        rotary_emb_dim=0,
        rotary_emb_base=10000.0,
        rotary_emb_scale_base=None,
        rotary_emb_interleaved=False,
        use_alibi=False,
        window_size=(-1, -1),
        fused_bias_fc=True,
        use_flash_attn=True,
        return_residual=False,
        checkpointing=False,
        sequence_parallel=False,
        device=None,
        dtype=None,
    ) -> None:
        """
        num_heads_kv: can be used to toggle MQA / GQA. If None, use num_heads.
        return_residual: whether to return the input x along with the output. This is for
            performance reason: for post-norm architecture, returning the input allows us
            to fuse the backward of nn.Linear with the residual connection.
        """
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.embed_dim = embed_dim
        self.causal = causal
        self.layer_idx = layer_idx
        self.dwconv = dwconv
        self.rotary_emb_dim = rotary_emb_dim
        self.use_flash_attn = use_flash_attn
        self.return_residual = return_residual
        self.checkpointing = checkpointing
        self.process_group = process_group
        self.world_size = process_group.size()
        self.local_rank = torch.distributed.get_rank(process_group)

        self.num_heads = num_heads
        assert self.embed_dim % self.num_heads == 0, "embed_dim must be divisible by num_heads"

        self.num_heads_kv = num_heads_kv if num_heads_kv is not None else num_heads
        assert (
            self.num_heads % self.num_heads_kv == 0
        ), "num_heads must be divisible by num_heads_kv"

        # Per-rank head counts: heads are partitioned across the process group.
        self.num_heads_per_rank = get_dim_for_local_rank(
            self.num_heads, self.world_size, self.local_rank
        )
        self.num_heads_kv_per_rank = get_dim_for_local_rank(
            self.num_heads_kv, self.world_size, self.local_rank
        )
        self.head_dim = self.embed_dim // num_heads
        qkv_dim = self.head_dim * (self.num_heads + 2 * self.num_heads_kv)

        if use_alibi:
            assert use_flash_attn, "ALiBi code path requires flash_attn"
            # Each rank keeps only its slice of the global ALiBi slope table.
            num_heads_local = math.ceil(self.num_heads / self.world_size)
            alibi_slopes = torch.tensor(
                get_alibi_slopes(num_heads)[
                    self.local_rank * num_heads_local : (self.local_rank + 1) * num_heads_local
                ],
                device=device,
            )
        else:
            alibi_slopes = None
        if window_size != (-1, -1):
            assert use_flash_attn, "Local (sliding window) attention code path requires flash_attn"

        if self.rotary_emb_dim > 0:
            assert RotaryEmbedding is not None, "rotary_emb is not installed"
            self.rotary_emb = RotaryEmbedding(
                self.rotary_emb_dim,
                base=rotary_emb_base,
                scale_base=rotary_emb_scale_base,
                interleaved=rotary_emb_interleaved,
                device=device,
            )

        if ColumnParallelLinear is None or RowParallelLinear is None:
            raise ImportError("fused_dense is not installed")
        self.Wqkv = ColumnParallelLinear(
            embed_dim,
            qkv_dim,
            process_group,
            bias=qkv_proj_bias,
            sequence_parallel=sequence_parallel,
            multiple_of=self.head_dim * (self.num_heads // self.num_heads_kv + 2),
            **factory_kwargs,
        )

        inner_attn_cls = (
            partial(FlashSelfAttention, alibi_slopes=alibi_slopes, window_size=window_size)
            if use_flash_attn
            else SelfAttention
        )

        self.inner_attn = inner_attn_cls(
            causal=causal,
            softmax_scale=softmax_scale,
            attention_dropout=dropout,
        )
        self.softmax_scale = softmax_scale

        # replace this with no reduce
        self.out_proj = RowParallelLinear(
            embed_dim,
            embed_dim,
            process_group,
            bias=out_proj_bias,
            sequence_parallel=sequence_parallel,
            multiple_of=self.head_dim,
            **factory_kwargs,
        )

        # Routers are attached externally after construction (None until then).
        self.mha_router = None
        self.mlp_router = None
        # We'll use an extra stream for concurrency
        self.current_stream = None
        self.sparse_stream = torch.cuda.Stream(device="cuda", priority=0)
        self.main_stream = torch.cuda.Stream(device="cuda", priority=-5)
        self.mha_router_event = torch.cuda.Event(enable_timing=False, blocking=False)
        self.mlp_router_event = torch.cuda.Event(enable_timing=False, blocking=False)
        self.main_event = torch.cuda.Event(enable_timing=False, blocking=False)

        # self.local_head_idx = generate_random_BH_index(1, self.num_heads_per_rank,self.num_heads_per_rank)

    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None):
        """Allocate this rank's kv cache:
        (batch, max_seqlen, 2, num_heads_kv_per_rank, head_dim), on the
        out_proj weight's device/dtype by default."""
        dtype = self.out_proj.weight.dtype if dtype is None else dtype
        device = self.out_proj.weight.device
        return torch.empty(
            batch_size,
            max_seqlen,
            2,
            self.num_heads_kv_per_rank,
            self.head_dim,
            dtype=dtype,
            device=device,
        )

    def _update_kv_cache(self, kv, inference_params):
        """Update kv cache in inference_params."""
        assert self.layer_idx is not None, "Generation requires layer_idx in the constructor"
        return _update_kv_cache(kv, inference_params, self.layer_idx)

    def forward(
        self,
        x,
        seqlen=None,
        inference_params=None,
        batch_head_idx=None,
        **kwargs,
    ):
        """
        Arguments:
            x: (batch, seqlen, hidden_dim)
            batch_head_idx: Tensor of indices specifying which batch and head indices to select.
                Shape: (N_selected,)
            inference_params: for generation.
        """

        router_inputs = x.squeeze(1)
        # Fork the helper streams off the current stream so kernels queued on
        # them later observe all work already enqueued here.
        self.current_stream = torch.cuda.current_stream()
        self.main_stream.wait_stream(self.current_stream)
        self.sparse_stream.wait_stream(self.current_stream)

        is_decode = inference_params is not None and inference_params.seqlen_offset > 0

        # (disabled) router overlap experiment:
        # if self.mha_router and is_decode:
        #     with torch.cuda.stream(self.sparse_stream):
        #         batch_head_idx = self.mha_router._select_heads(router_inputs)
        #         self.sparse_stream.record_event(self.mha_router_event)

        qkv = self.Wqkv(x)
        if seqlen is not None:
            # Inputs arrived flattened as (b*s, ...); restore the batch axis.
            qkv = rearrange(qkv, "(b s) ... -> b s ...", s=seqlen)

        seqlen_offset = (
            0
            if inference_params is None
            else (
                inference_params.lengths_per_sample
                if inference_params.lengths_per_sample is not None
                else inference_params.seqlen_offset
            )
        )
        rotary_max_seqlen = inference_params.max_seqlen if inference_params is not None else None
        batch, seqlen = x.shape[:2]

        if self.num_heads_kv == self.num_heads:
            # Self-attention, no MQA/GQA
            qkv = rearrange(qkv, "b s (three h d) -> b s three h d", three=3, d=self.head_dim)

            if self.rotary_emb_dim > 0:
                qkv = self.rotary_emb(
                    qkv, seqlen_offset=seqlen_offset, max_seqlen=rotary_max_seqlen
                )
            if inference_params is None or inference_params.seqlen_offset == 0:
                # Inference stage without inference_params, prefill stage
                if inference_params is not None:
                    # Update kv cache during prefill
                    kv = self._update_kv_cache(qkv[:, :, 1:], inference_params)

                context = self.inner_attn(qkv, **kwargs)
            else:
                # Generation stage

                # apply rotary embeddings
                # NOTE(review): rotary was already applied above, so the decode
                # path applies it a second time here — confirm intended (this is
                # a no-op only when rotary_emb_dim == 0, as in the benchmark).
                if self.rotary_emb_dim > 0:
                    qkv = self.rotary_emb(
                        qkv, seqlen_offset=seqlen_offset, max_seqlen=rotary_max_seqlen
                    )

                # Apply select attention with kv cache update
                context = self._update_kvcache_select_attn(qkv[:, :, 0], qkv[:, :, 1:], inference_params, batch_head_idx)
        else:  # cross-attention, MQA/GQA
            raise NotImplementedError("SelectMHA currently supports only self-attention without MQA/GQA.")

        context = rearrange(context, "b s h d -> b s (h d)")
        if seqlen is not None:
            context = rearrange(context, "b s d -> (b s) d")
        # out = self.out_proj(rearrange(context, "... h d -> ... (h d)"))
        # out = self.out_proj(context)

        # Bias-fused local matmul; the all_reduce below completes the
        # row-parallel output projection across ranks.
        out = fused_dense_func(context, self.out_proj.weight, self.out_proj.bias)

        # (disabled) experiment overlapping the MLP router with the all_reduce
        # on separate streams; kept in git history.
        out = all_reduce(out, self.process_group)
        return out if not self.return_residual else (out, x)
        # return out

    def _update_kvcache_select_attn(self, q, kv, inference_params, batch_head_idx=None):
        """
        Apply select attention during generation stage.

        q: (batch_size, seqlen=1, n_heads, head_dim)
        kv: (batch_size, seqlen=1, 2, n_heads, head_dim)
        batch_head_idx: Tensor of indices specifying which batch and head indices to select.
            Shape: (batch_size, top_k)
        """
        # update kv cache; the returned view already contains the token just written
        kv_cache = self._update_kv_cache(kv, inference_params)

        batch = q.shape[0]
        # kv_cache = inference_params.key_value_memory_dict[self.layer_idx][:batch]
        cache_seqlens = (
            inference_params.lengths_per_sample[:batch]
            if inference_params.lengths_per_sample is not None
            else inference_params.seqlen_offset + 1  # +1 for the current token
        )
        # need to reshape or view keys and value with shape (batch_size, seqlen, 1, n_heads, head_dim)
        q = q.unsqueeze(2)
        k_cache = kv_cache[:, :, 0].unsqueeze(2)
        v_cache = kv_cache[:, :, 1].unsqueeze(2)

        # Wait until the router (running on sparse_stream) has produced
        # batch_head_idx before launching the select kernel.
        self.current_stream.wait_event(self.mha_router_event)

        assert batch_head_idx is not None, "batch_head_idx must not be None"
        # Call select_attn
        context = select_attn(
            q,
            k_cache,
            v_cache,
            self.softmax_scale,
            batch_head_idx,
            cache_seqlens
        )

        # context: (batch_size, seqlen_q=1, G=1, H, head_dim)
        # context = context.squeeze(2)  # Remove G dimension
        # NOTE(review): views with num_heads_kv_per_rank; identical to
        # num_heads_per_rank here since MQA/GQA is rejected in forward().
        context = context.view(batch, 1, self.num_heads_kv_per_rank, self.head_dim)
        return context
1498
+
1499
+ '''
1500
+ PYTHONWARNINGS="ignore" python -m HybridTensor.modules.SelectiveMHA --batch_size 8 --in_features 8192 --seq_len 512 --head_density 0.25
1501
+ '''
1502
+
1503
+
1504
if __name__ == "__main__":
    # Micro-benchmark: compare one decode step of standard MHA vs SelectMHA
    # on synthetic data with a pre-populated kv cache.
    args = arg_parser()

    max_seqlen = args.seq_len + 128  # headroom beyond the prefill length
    max_batch_size = args.batch_size
    device = torch.device(f"cuda:{args.device}")

    # simulates SelectiveMHA inference generation stage
    inference_params = InferenceParams(max_seqlen=max_seqlen, max_batch_size=max_batch_size)
    nheads = args.in_features // 128  # head_dim is fixed at 128 throughout
    softmax_scale = 1 / (128 ** 0.5)
    rotary_emb_dim = 0

    # build SelectiveMHA
    select_mha = SelectMHA(
        embed_dim=args.in_features,
        num_heads=nheads,
        num_heads_kv=None,
        causal=True,
        layer_idx=0,
        use_flash_attn=True,
        softmax_scale=softmax_scale,
        return_residual=False,
        rotary_emb_dim=rotary_emb_dim,
        device=device,
        dtype=torch.float16,
    )

    # Dense baseline with identical configuration.
    standard_mha = SMHA(
        embed_dim=args.in_features,
        num_heads=nheads,
        num_heads_kv=None,
        causal=True,
        layer_idx=0,
        use_flash_attn=True,
        softmax_scale=softmax_scale,
        return_residual=False,
        rotary_emb_dim=rotary_emb_dim,
        device=device,
        dtype=torch.float16,
    )
    torch.cuda.empty_cache()
    torch.cuda.reset_max_memory_allocated()

    with torch.no_grad():
        # prefill stage to generate kv cache for all batches
        og_x = torch.randn(args.batch_size, args.seq_len, args.in_features, device=device, dtype=torch.float16, requires_grad=False)

        # out, time_ms = cuda_profiler(select_mha, og_x, inference_params=inference_params)
        # print(f"MHA Prefill time: {time_ms:.3f} ms")
        # out = select_mha(og_x, inference_params=inference_params)

        # simulate kv cache, bug in flash_attn for larger batches
        kv = torch.randn(args.batch_size, args.seq_len, 2, nheads, 128, device=device, dtype=torch.float16, requires_grad=False)
        _ = _update_kv_cache(kv, inference_params, 0)

        # increment the sequence length to move to the generation stage
        inference_params.seqlen_offset += args.seq_len

        input_x = torch.randn(args.batch_size, 1, args.in_features, device=device, dtype=torch.float16, requires_grad=False)
        # Number of heads each batch element attends to, per the density knob.
        selected_heads = math.ceil(nheads * args.head_density)

        # generate batch_head_idx for SelectiveMHA
        # batch_head_index = generate_BH_index(args.batch_size, nheads, selected_heads, device=device)
        batch_head_index = generate_random_BH_index(args.batch_size, nheads, selected_heads, device=device)
        # generation stage: standard MHA baseline
        out, standard_time_ms = cuda_profiler(standard_mha, input_x, inference_params=inference_params)
        print(f"Standard MHA time: {standard_time_ms:.3f} ms")

        # generation stage: SelectiveMHA
        out, select_time_ms = cuda_profiler(select_mha, input_x, inference_params=inference_params, batch_head_idx=batch_head_index)
        print(f"SelectMHA time: {select_time_ms:.3f} ms")

        speedup = standard_time_ms / select_time_ms
        print(f"Speedup: {speedup:.3f}")
1579
+
HybridTensor/modules/SelectiveMLP.py ADDED
@@ -0,0 +1,580 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # python -m HybridTensor.modules.SelectiveMLP --batch_size 8 --index_size 512
2
+ from typing import Optional
3
+ from functools import partial
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ from torch import Tensor
9
+ from torch.distributed import ProcessGroup
10
+ import torch.distributed as dist
11
+
12
+ # import fused_dense_cuda # from apex
13
+
14
+ import fused_dense_lib as fused_dense_cuda
15
+
16
+ from flash_attn.utils.distributed import reduce_scatter, all_reduce
17
+ from einops import rearrange
18
+
19
+ # from HybridTensor.modules.MLP import SelectiveMLPFunc
20
+ from HybridTensor.modules.references.fused_dense import ColumnParallelLinear, RowParallelLinear, fused_mlp_func
21
+ from HybridTensor.modules.references.MLP import SelectiveMLPTriton
22
+ from HybridTensor.utils.utils import arg_parser, sparse_index
23
+ from HybridTensor.utils.profiling import cuda_profiler
24
+
25
+ # compiles the kernels for the first time, takes time
26
+ from HybridTensor.triton.gather_gemm_col import gather_matmul_col
27
+ from HybridTensor.triton.gather_gemm_row import gather_matmul_row
28
+
29
+ # needs to be compiled before running
30
+ from HybridTensor.triton.heuristics.gather_gemm_col_h import gather_matmul_col as gather_matmul_col_h
31
+ from HybridTensor.triton.heuristics.gather_gemm_row_h import gather_matmul_row as gather_matmul_row_h
32
+
33
+ # from HybridTensor.triton.cg_safe.gather_gemm_col_cg import gather_matmul_col
34
+ # from HybridTensor.triton.cg_safe.gather_gemm_row_cg import gather_matmul_row
35
+
36
+
37
def SelectiveMLPFunc(x, fc1_w, fc2_w, index_vec, bias1=None, bias2=None, activation='relu', use_heuristic=True):
    """Sparse two-layer MLP over the hidden neurons named by ``index_vec``.

    Gathers the selected fc1 columns (fused with the activation), then gathers
    the matching fc2 rows. ``use_heuristic`` selects the heuristic-tuned
    kernels over the plain ones.
    """
    # Pick the kernel pair once, then run the col-gather -> row-gather pipeline.
    col_mm = gather_matmul_col_h if use_heuristic else gather_matmul_col
    row_mm = gather_matmul_row_h if use_heuristic else gather_matmul_row
    hidden = col_mm(x, fc1_w, index_vec, bias=bias1, activations=activation)
    return row_mm(hidden, fc2_w, index_vec, bias=bias2)
45
+
46
+
47
+ # cg safe version
48
+ # def SelectiveMLPFunc(x, fc1_w, fc2_w, index_vec, index_size, bias1 = None, bias2 = None, activation='relu', use_heuristic=True):
49
+ # out = gather_matmul_col(x, fc1_w, index_vec, index_size, bias = bias1, activations=activation)
50
+ # out = gather_matmul_row(out, fc2_w, index_vec, index_size, bias = bias2)
51
+ # return out
52
+
53
class MLPRouter(nn.Module):
    """Low-rank router that predicts which MLP hidden neurons will activate.

    Scores each of ``out_dim`` neurons through a rank-``low_rank_dim``
    bottleneck (two bias-free linear layers); selection helpers then pick
    neurons by top-k or by thresholding.
    """

    def __init__(self, embed_dim, low_rank_dim, out_dim, act_th, device=None, dtype=None):
        """
        Initializes the MLPRouter class.

        Args:
            embed_dim (int): Dimensionality of the input embeddings.
            low_rank_dim (int): Dimensionality of the intermediate layer.
            out_dim (int): Number of neurons.
            act_th: Activation threshold used by the threshold-based selectors.
        """
        super(MLPRouter, self).__init__()
        factory_kwargs = {"device": device, "dtype": dtype}
        self.fc1 = nn.Linear(embed_dim, low_rank_dim, bias=False, **factory_kwargs)
        self.fc2 = nn.Linear(low_rank_dim, out_dim, bias=False, **factory_kwargs)
        self.act_th = act_th
        self.num_neurons = out_dim
        # Sentinel index strictly larger than any valid neuron id; used to push
        # unselected slots to the end after sorting (see _select_neurons_cuda_safe).
        self.largest = self.num_neurons + 1

    def forward(self, x):
        """
        Forward pass of the MLPRouter.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, embed_dim).

        Returns:
            torch.Tensor: Neuron scores of shape (batch_size, num_neurons).
        """
        x = self.fc1(x)
        x = self.fc2(x)
        return x

    def _select_neurons_topk(self, x, topk=None):
        """Pick the ``topk`` neurons with the largest batch-summed ReLU scores.
        Indices are returned unsorted (CUDA-graph friendly fixed size)."""
        neurons = self.forward(x)

        neurons_nonzero = torch.nn.ReLU()(neurons)
        _, index_vec = neurons_nonzero.sum(dim=0).topk(topk, dim=0, sorted=False)
        # index_vec, _ = index_vec.sort()
        return index_vec

    def _select_neurons(self, x, th=None):
        '''
        Threshold based selection of neurons, not CG safe
        (output size depends on the data, so it cannot be CUDA-graph captured).
        '''
        if th is None:
            th = self.act_th

        neurons = self.forward(x)
        # A neuron is kept if its score exceeds th for at least one batch element.
        activated = (neurons > th).sum(dim=0)
        index_vec = activated.nonzero().flatten()
        return index_vec

    def _select_neurons_cuda_safe(self, x, th=None):
        '''
        This function is used with threshold and is used for CG safe version of the code:
        it always returns a fixed-size, sorted index vector (unselected slots hold
        the `largest` sentinel) plus the count of valid entries.
        '''
        if th is None:
            th = self.act_th
        neurons = self.forward(x)
        activated = (neurons > th).sum(dim=0)

        indices = torch.arange(self.num_neurons, device=activated.device)
        # NOTE(review): this compares the per-neuron activation *count* against
        # the activation threshold th, whereas _select_neurons keeps any neuron
        # with count > 0 — confirm the intended semantics.
        selected = torch.where(activated > th, indices, torch.full_like(indices, self.largest))

        # Sort so valid indices come first; index_size counts the valid prefix.
        index_vec, _ = torch.sort(selected)
        index_size = ((index_vec < self.largest).sum()).to(torch.int32)

        return index_size, index_vec
121
+
122
+
123
+
124
class ParallelMLPRouter(nn.Module):
    """
    Parallel Sparse Predictor for the MLP layer: tensor-parallel variant of
    MLPRouter. fc1 is replicated on every rank; fc2 is column-parallel, so each
    rank scores (and selects among) only its own shard of the neurons.
    """

    def __init__(
        self,
        embed_dim,
        low_rank_dim,
        out_dim,
        act_th,
        process_group,
        sequence_parallel=False,
        device=None,
        dtype=None,
    ):
        """
        Initializes the ParallelMLPRouter class.

        Args:
            embed_dim (int): Dimensionality of the input embeddings.
            low_rank_dim (int): Dimensionality of the intermediate layer.
            out_dim (int): Output dimensionality (typically number of neurons).
            act_th: Activation threshold for the threshold-based selector.
            process_group (torch.distributed.ProcessGroup): Process group for parallelism.
            sequence_parallel (bool, optional): Whether to use sequence parallelism. Defaults to False.
            device (torch.device, optional): Device to run the module on. Defaults to None.
            dtype (torch.dtype, optional): Data type of the module parameters. Defaults to None.
        """
        super(ParallelMLPRouter, self).__init__()
        assert process_group is not None, "ParallelMHARouter requires a process group."

        factory_kwargs = {"device": device, "dtype": dtype}
        self.process_group = process_group
        self.embed_dim = embed_dim
        self.act_th = act_th

        # fc1 is replicated; fc2 shards the neuron dimension across ranks.
        self.fc1 = nn.Linear(
            embed_dim, low_rank_dim, bias=False, **factory_kwargs
        )
        self.fc2 = ColumnParallelLinear(
            low_rank_dim,
            out_dim,
            process_group,
            bias=False,
            sequence_parallel=sequence_parallel,
            **factory_kwargs,
        )

    def forward(self, x):
        """
        Forward pass of the ParallelMLPRouter.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, seq_len, embed_dim).

        Returns:
            torch.Tensor: This rank's shard of neuron scores,
                shape (batch_size, seq_len, out_dim // world_size).
        """
        x = self.fc1(x)
        x = self.fc2(x)
        return x

    def _select_neurons(self, x, th=None):
        """Threshold-based selection over this rank's neuron shard
        (variable-size output; not CUDA-graph safe)."""
        if th is None:
            th = self.act_th

        neurons = self.forward(x)
        # Keep a neuron if its score exceeds th for at least one batch element.
        activated = (neurons > th).sum(dim=0)
        index_vec = activated.nonzero().flatten()
        return index_vec

    def _select_neurons_topk(self, x, topk=None):
        """Top-k selection (by batch-summed ReLU score) over this rank's shard.
        Indices are local to the shard and returned unsorted."""
        neurons = self.forward(x)

        neurons_nonzero = torch.nn.ReLU()(neurons)  # .squeeze(1)
        # print(f"neurons_nonzero shape: {neurons_nonzero.shape}")
        # print(f"Top k neurons: {topk}")
        _, index_vec = neurons_nonzero.sum(dim=0).topk(topk, dim=0, sorted=False)
        # index_vec, _ = index_vec.sort()
        return index_vec
211
+
212
class SelectiveMLP(nn.Module):
    """Two-layer MLP that can run densely or sparsely over a subset of hidden
    neurons selected by ``index_vec`` (project gather-GEMM kernels).

    On the first sparse call, fc2's weight is cached transposed
    (``fc2_weight_t``, row-major over hidden neurons) and the original
    parameter is freed to save memory; subsequent dense calls must therefore
    use the transposed copy.
    """

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        activation='relu',
        layer_idx=None,
        bias1=True,
        bias2=True,
        return_residual=False,
        checkpoint_lvl=0,
        use_heuristic=True,
        device=None,
        dtype=None,
    ):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        out_features = out_features if out_features is not None else in_features
        hidden_features = hidden_features if hidden_features is not None else in_features * 4
        self.return_residual = return_residual
        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias1, **factory_kwargs)
        self.activation = activation
        self.activation_fn = nn.ReLU()
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2, **factory_kwargs)
        # Transposed fc2 weight, built lazily (see _init_weights / forward).
        self.fc2_weight_t = None
        self.use_heuristic = use_heuristic

    def _init_weights(self):
        # if weights are updated, we need to update the transpose
        self.fc2_weight_t = self.fc2.weight.t().contiguous()

    def forward(self, x, index_vec=None, index_size=None):
        """Run the MLP.

        Args:
            x: input tensor; flattened to 2-D on the sparse path.
            index_vec: 1-D tensor of active hidden-neuron indices for the
                sparse path, or None for a dense forward.
            index_size: unused here (kept for CUDA-graph-safe kernel variants).
        """
        if index_vec is not None:
            # sparse forward

            # On the first sparse run, cache the transposed weight and free the
            # original fc2 parameter to save memory.
            if self.fc2_weight_t is None:
                self.fc2_weight_t = self.fc2.weight.t().contiguous()
                self.fc2.weight = None
                del self.fc2._parameters['weight']

            x = x.view(-1, x.size(-1))
            y = SelectiveMLPFunc(x=x, fc1_w=self.fc1.weight,
                                 fc2_w=self.fc2_weight_t, index_vec=index_vec,
                                 bias1=self.fc1.bias, bias2=self.fc2.bias,
                                 activation=self.activation, use_heuristic=self.use_heuristic)

        else:
            # dense forward
            y = self.fc1(x)
            y = self.activation_fn(y)

            if self.fc2_weight_t is not None:
                # fc2.weight may have been freed by the sparse path, so use the
                # transposed copy. BUGFIX: fc2's bias was previously dropped on
                # this branch, making dense results differ after _init_weights()
                # or any sparse call.
                y = torch.matmul(y, self.fc2_weight_t)
                if self.fc2.bias is not None:
                    y = y + self.fc2.bias
            else:
                y = self.fc2(y)

        return y if not self.return_residual else (y, x)
277
+
278
class ParallelSelectiveMLP(nn.Module):
    """Tensor-parallel MLP with an optional sparse (neuron-selective) decode
    path driven by ``index_vec``. Dense path uses the fused MLP kernel; both
    paths finish with an all-reduce (or reduce-scatter) across the group.
    """

    def __init__(
        self,
        in_features,
        hidden_features,
        out_features=None,
        activation="relu",
        layer_idx=None,
        process_group: ProcessGroup = None,
        bias1=True,
        bias2=True,
        return_residual=False,
        sequence_parallel=False,
        use_heuristic=True,
        checkpoint_lvl=0,
        heuristic="auto",
        device=None,
        dtype=None,
    ):
        """
        process_group is required. We're doing Tensor Parallel with sequence parallelism:
        we do an all_gather of x before doing the matmul, gelu, then matmul.
        Finally we do a reduce_scatter of the output.

        checkpoint_lvl (increasing lvl means slower but more memory saving):
            0: no recomputation in the bwd
            1: recompute gelu_out in the bwd
            2: recompute pre_act and gelu_out in the bwd
        heuristic:
            -1: don't fuse gemm + gelu (separate kernel)
            0..4: use this heuristic for the algo section in the fused gemm + gelu
            'auto': heuristic will be picked automatically:
                For CUDA >= 11.8, we set heuristic=0 for both fp16 and bf16 for best perf.
                For CUDA <= 11.7, we set heuristic=1 for fp16 and heuristic=-1 for bf16.
        """
        assert checkpoint_lvl in [0, 1, 2]
        assert activation in ["gelu_approx", "relu"]
        assert process_group is not None
        # assert sp_kwargs != None, "sparse predictor parameters are not passed in."
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        if out_features is None:
            out_features = in_features
        self.activation = activation
        self.process_group = process_group
        self.sequence_parallel = sequence_parallel
        self.checkpoint_lvl = checkpoint_lvl
        self.heuristic = heuristic
        self.fc1 = ColumnParallelLinear(
            in_features, hidden_features, process_group, bias=bias1, **factory_kwargs
        )
        self.fc2 = RowParallelLinear(
            hidden_features, out_features, process_group, bias=bias2, **factory_kwargs
        )
        self.layer_idx = layer_idx

        # NOTE(review): register_buffer returns None (and the buffer name has a
        # typo, "fc2_weigth_t"); this assignment is effectively a no-op and is
        # overwritten two lines below — confirm the buffer was intended.
        self.fc2_weight_t = self.register_buffer("fc2_weigth_t", None)
        self.return_residual = return_residual
        self.fc2_weight_t = None
        self.use_heuristic = use_heuristic
        self.reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce

        # self._init_weights()

    def _init_weights(self):
        # ffn2 weights needs to be in row major format to select from rows
        self.fc2_weight_t = self.fc2.weight.t().contiguous()

    def forward(self, x, residual=None, index_vec=None):
        """Dense forward when index_vec is None; otherwise sparse gather-GEMM
        over the selected hidden neurons. Ends with an all_reduce."""

        # do_token_generation = x.size(1) == 1
        # index_vec = None
        # with torch.cuda.stream(self.curr_stream):
        if index_vec is not None:
            # assert x.size(1) == 1
            # Lazily cache the transposed fc2 weight for row gathering.
            if self.fc2_weight_t is None:
                self.fc2_weight_t = self.fc2.weight.t().contiguous()

            x = x.view(-1, x.size(-1))
            # x = rearrange(x, "b 1 d -> b d")  # slightly more expensive to use rearrange

            out = SelectiveMLPFunc(x=x, fc1_w=self.fc1.weight,
                                   fc2_w=self.fc2_weight_t, index_vec=index_vec,
                                   bias1=self.fc1.bias, bias2=self.fc2.bias,
                                   activation=self.activation, use_heuristic=self.use_heuristic)
            # out = rearrange(out, "b d -> b 1 d")
            # out = out.view(-1, 1, out.size(-1))

        else:  # normal mlp
            if self.heuristic == "auto":
                # Pick the fused-kernel heuristic from dtype and CUDA version
                # (see the class docstring).
                dtype = (
                    x.dtype
                    if not torch.is_autocast_enabled()
                    else torch.get_autocast_gpu_dtype()
                )
                if self.activation == "gelu_approx":
                    cuda_ver = tuple(map(int, torch.version.cuda.split(".")))
                    heuristic = (
                        0 if cuda_ver >= (11, 8) else (1 if dtype == torch.float16 else -1)
                    )
                else:
                    heuristic = 0
            else:
                heuristic = self.heuristic
            out = fused_mlp_func(
                x,
                self.fc1.weight,
                self.fc2.weight,
                self.fc1.bias,
                self.fc2.bias,
                activation=self.activation,
                save_pre_act=self.training,
                checkpoint_lvl=self.checkpoint_lvl,
                heuristic=heuristic,
                process_group=self.process_group,
                sequence_parallel=self.sequence_parallel,
            )

        if self.process_group.size() > 1:
            # out = self.reduce_fn(out, self.process_group)  # has some overhead,
            dist.all_reduce(out, op=dist.ReduceOp.SUM, group=self.process_group)

        return out if not self.return_residual else (out, x)
        # return out

    def sp_forward(self, x, residual=None, index_vec=None):
        """Variant of forward that overlaps the sparse-predictor (router) with
        the cross-rank reduction on a side stream.

        NOTE(review): relies on self.sp_router, self.sp, self.sp_stream,
        self.event_mlp and self.event_mlp_sp, none of which are set in
        __init__ — confirm they are attached externally before use.
        """
        if self.heuristic == "auto":
            dtype = (
                x.dtype
                if not torch.is_autocast_enabled()
                else torch.get_autocast_gpu_dtype()
            )
            if self.activation == "gelu_approx":
                cuda_ver = tuple(map(int, torch.version.cuda.split(".")))
                heuristic = (
                    0 if cuda_ver >= (11, 8) else (1 if dtype == torch.float16 else -1)
                )
            else:
                heuristic = 0
        else:
            heuristic = self.heuristic
        curr_stream = torch.cuda.current_stream()
        do_token_generation = x.size(1) == 1
        # mlp_logit = None

        # with torch.cuda.stream(self.curr_stream):
        if index_vec != None:
            assert x.size(1) == 1

            if self.fc2_weight_t is None:
                self.fc2_weight_t = self.fc2.weight.t().contiguous()

            out = SelectiveMLPFunc(
                rearrange(x, "b 1 d -> b d"),
                self.fc1.weight,
                self.fc2_weight_t,
                index_vec,
                self.fc1.bias,
                self.fc2.bias,
                activation=self.activation,
            )
            out = rearrange(out, "b d -> b 1 d")
        else:
            out = fused_mlp_func(
                x,
                self.fc1.weight,
                self.fc2.weight,
                self.fc1.bias,
                self.fc2.bias,
                activation=self.activation,
                save_pre_act=self.training,
                checkpoint_lvl=self.checkpoint_lvl,
                heuristic=heuristic,
                process_group=self.process_group,
                sequence_parallel=self.sequence_parallel,
            )

        reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce
        if self.sp_router:
            # Mark the point the router stream must wait for before it can
            # safely read the residual.
            curr_stream.record_event(self.event_mlp)

        # handle = torch.distributed.all_reduce(out, op=torch.distributed.ReduceOp.SUM, group=self.process_group, async_op=True)
        out = reduce_fn(out, self.process_group)

        if self.sp_router:
            # Run the sparse predictor on a side stream, overlapped with the
            # reduction above.
            with torch.cuda.stream(self.sp_stream):
                self.sp_stream.wait_event(self.event_mlp)
                if do_token_generation:
                    mlp_logit = self.sp(rearrange(residual, "b 1 d -> b d"))
                self.sp_stream.record_event(self.event_mlp_sp)

            # check this again, we might not have to synchronize here, we can synchronize in the next layer
            curr_stream.wait_event(self.event_mlp_sp)

        return out
475
+
476
class SimpleMLP(nn.Module):
    """Minimal two-layer MLP: fc1 -> activation -> fc2.

    Fix: ``forward`` previously hard-coded ``F.relu`` and silently ignored the
    ``activation`` constructor argument. It now applies the configured
    activation; the default ("relu") keeps the original behavior.

    Args:
        in_features: input dimension.
        hidden_features: hidden dimension of fc1.
        out_features: output dimension of fc2.
        bias: whether the linear layers carry biases.
        activation: "relu" (default) or "gelu".
    """

    def __init__(self, in_features, hidden_features, out_features, bias=False, activation="relu"):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
        self.activation = activation

    def forward(self, x):
        # Respect the configured activation instead of unconditionally using ReLU.
        h = self.fc1(x)
        if self.activation == "gelu":
            h = F.gelu(h)
        else:  # default: "relu" (matches the original hard-coded behavior)
            h = F.relu(h)
        return self.fc2(h)
488
if __name__ == "__main__":
    args = arg_parser()

    # Fix: `True if args.bias > 0 else False` was redundant — the comparison
    # already yields a bool.
    bias = args.bias > 0
    x = torch.randn(args.batch_size, args.in_features, device="cuda", dtype=torch.float16)
    # Hidden dimension is 4x the model dimension throughout this benchmark.
    index_vec, _ = sparse_index(args.index_size, args.in_features * 4)

    '''
    selective_mlp = SelectiveMLPTriton(args.in_features, args.hidden_features, bias=bias, device="cuda", dtype=torch.float16, activation="relu")

    out, mlp_time = cuda_profiler(selective_mlp, x, index_vec)

    out_col, col_time = cuda_profiler(gather_matmul_col, x, selective_mlp.fc1_w, index_vec, activations=selective_mlp.activation)
    out_row, row_time = cuda_profiler(gather_matmul_row, out_col, selective_mlp.fc2_w, index_vec)
    sum_time = col_time + row_time

    print(f"Index size {args.index_size}, Activated {args.index_size/(args.in_features * 4)*100}% neurons")

    print(f"Gather Col Time: {col_time} ms")
    print(f"Gather Row Time: {row_time} ms")
    # print(f"Sum Time: {sum_time} ms")

    print(f"SelectiveMLP Time: {mlp_time} ms")
    '''

    in_features = args.in_features
    hidden_features = in_features * 4
    out_features = in_features
    device = torch.device("cuda")

    model = SelectiveMLP(
        in_features, hidden_features, out_features, device=device, dtype=torch.float16, activation="relu", use_heuristic=True
    ).to(device)

    router = MLPRouter(in_features, 1024, hidden_features, act_th=0.5, device=device, dtype=torch.float16).to(device)

    # Warm-up GPU so timed runs are not skewed by lazy kernel compilation /
    # first-touch allocations.
    def warmup():
        for _ in range(10):
            _ = model(x, index_vec)
            _ = model(x, None)
            _ = router._select_neurons_topk(x, args.index_size)

    warmup()

    # Measure SelectiveMLPFunc speed
    _, router_time = cuda_profiler(router._select_neurons_topk, x, args.index_size)
    _, selective_time = cuda_profiler(model, x, index_vec)
    # Measure dense forward speed
    _, dense_time = cuda_profiler(model, x, None)

    print(f"Router time per run: {router_time:.6f} ms")
    print(f"SelectiveMLPFunc time per run: {selective_time:.6f} ms")
    print(f"Dense forward time per run: {dense_time:.6f} ms")
    print(f"Speedup: {dense_time / selective_time:.2f}x")
    router_selective_time = router_time + selective_time
    print(f"Router + SelectiveMLPFunc time per run: {router_selective_time:.6f} ms")
    print(f"Speedup: {dense_time / router_selective_time:.2f}x")
    ############################################
    # CUDA Graph capture tests for the MLP model
    ############################################
    print("\n=== CUDA Graph Tests ===")
    # --- Selective forward (sparse mode) ---
    print("Testing CUDA Graph for Selective forward (with index_vec)...")
    static_x = x.clone()
    static_index_vec = index_vec.clone()
    # Warm-up run to allocate memory before capture (graph capture cannot
    # allocate from the default caching-allocator path).
    static_out_sel = model(static_x, index_vec=static_index_vec)
    torch.cuda.synchronize()

    # Capture on a non-default stream, as required by CUDA graph capture.
    capture_stream = torch.cuda.Stream()
    with torch.cuda.stream(capture_stream):
        g_sel = torch.cuda.CUDAGraph()
        g_sel.capture_begin()
        static_out_sel = model(static_x, index_vec=static_index_vec)
        g_sel.capture_end()
    torch.cuda.synchronize()

    # Replay and compare against an eager-mode run for correctness.
    g_sel.replay()
    torch.cuda.synchronize()
    cuda_sel_out = static_out_sel.clone()
    regular_sel_out = model(x, index_vec=index_vec)
    if torch.allclose(cuda_sel_out, regular_sel_out, atol=1e-3):
        print("Selective forward CUDA Graph output matches regular output")
    else:
        print("Selective forward CUDA Graph output does NOT match regular output")

    def replay_sel():
        g_sel.replay()
    _, selective_time_cuda = cuda_profiler(replay_sel)
    print(f"Selective forward CUDA Graph time per run: {selective_time_cuda:.6f} ms")
HybridTensor/modules/SelectiveRouters.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import torch
4
+ from collections import OrderedDict
5
+
6
def create_mlp_router_state_dict(router_files_dir):
    """
    Loads all mlp_router weight files from the specified directory and creates a router_state_dict
    with keys formatted as 'transformer.layers.{layer_num}.mlp_router.{param_name}'.

    Filenames must match ``mlp_router_{layer}-{a}-{b}-{c}.pt``.

    Args:
        router_files_dir (str): Path to the directory containing mlp_router_*.pt files.

    Returns:
        OrderedDict: A state dictionary suitable for loading into a transformer model,
        or None when the directory is missing or contains no matching files.
    """
    # Regular expression to extract layer number from filename
    router_file_pattern = re.compile(r'mlp_router_(\d+)-[\d.]+-[\d.]+-[\d.]+\.pt$')

    router_state_dict = OrderedDict()

    # List all files in the directory
    try:
        all_files = os.listdir(router_files_dir)
    except FileNotFoundError:
        print(f"Error: Directory '{router_files_dir}' does not exist.")
        return None

    # Match once per file; filtering and sorting reuse the same match objects
    # (the old code re-ran the regex in the filter, the sort key, and the loop).
    router_files = [
        (m, f) for m, f in ((router_file_pattern.match(f), f) for f in all_files) if m
    ]

    if not router_files:
        print(f"No router files found in directory '{router_files_dir}'.")
        return None

    loaded_layers = set()
    for match, file_name in sorted(router_files, key=lambda mf: int(mf[0].group(1))):
        layer_num = int(match.group(1))
        file_path = os.path.join(router_files_dir, file_name)

        try:
            # Load the router's state dict.
            # NOTE(review): torch.load on untrusted files can execute arbitrary
            # code; consider weights_only=True if the torch version supports it.
            router_weights = torch.load(file_path, map_location='cpu')
            if not isinstance(router_weights, dict):
                print(f"Warning: The file '{file_path}' does not contain a state dictionary. Skipping.")
                continue
        except Exception as e:
            print(f"Error loading '{file_path}': {e}")
            continue

        # Re-key each parameter under the transformer layer namespace.
        for param_name, param_tensor in router_weights.items():
            new_key = f"transformer.layers.{layer_num}.mlp_router.{param_name}"
            router_state_dict[new_key] = param_tensor
        loaded_layers.add(layer_num)

    # Fix: the old count divided the key total by a guessed params-per-router
    # (2, while the adjacent comment claimed 4); counting distinct layers is
    # exact regardless of how many parameters each router has.
    print(f"Total routers loaded: {len(loaded_layers)}")
    return router_state_dict
67
def create_attn_router_state_dict(router_files_dir):
    """Collect attn_router weight files from a directory into one state dict.

    Each file named ``attn_router_{layer}-{x}-{y}.pt`` is loaded and its
    parameters are re-keyed as
    ``transformer.layers.{layer}.mha_router.{param_name}``. When several files
    exist for the same layer, only the first (in layer-sorted order) is kept.

    Args:
        router_files_dir (str): Path to the directory containing attn_router_*.pt files.

    Returns:
        OrderedDict: A state dictionary suitable for loading into a transformer
        model, or None when the directory is missing or holds no matching files.
    """
    # Filename pattern: attn_router_{layer_num}-{value1}-{value2}.pt
    pattern = re.compile(r'attn_router_(\d+)-[\d.]+-[\d.]+\.pt$')
    router_state_dict = OrderedDict()

    try:
        candidates = os.listdir(router_files_dir)
    except FileNotFoundError:
        print(f"Error: Directory '{router_files_dir}' does not exist.")
        return None

    router_files = [name for name in candidates if pattern.match(name)]
    if not router_files:
        print(f"No attn_router files found in directory '{router_files_dir}'.")
        return None

    # Process in ascending layer order; remember layers already loaded so
    # duplicate files are skipped.
    router_files.sort(key=lambda name: int(pattern.match(name).group(1)))
    loaded_layers = set()

    for file_name in router_files:
        match = pattern.match(file_name)
        if not match:
            print(f"Skipping file '{file_name}' as it does not match the pattern.")
            continue

        layer_num = int(match.group(1))
        if layer_num in loaded_layers:
            print(f"Warning: Multiple router files found for layer {layer_num}. Skipping '{file_name}'.")
            continue

        file_path = os.path.join(router_files_dir, file_name)
        try:
            router_weights = torch.load(file_path, map_location='cpu')
        except Exception as e:
            print(f"Error loading '{file_path}': {e}")
            continue
        if not isinstance(router_weights, dict):
            print(f"Warning: The file '{file_path}' does not contain a state dictionary. Skipping.")
            continue

        # Prefix every parameter with the transformer-layer namespace.
        key_prefix = f"transformer.layers.{layer_num}.mha_router."
        for param_name, param_tensor in router_weights.items():
            router_state_dict[key_prefix + param_name] = param_tensor
        loaded_layers.add(layer_num)

    print(f"Total MHA routers loaded: {len(loaded_layers)}")
    return router_state_dict
HybridTensor/modules/__init__.py ADDED
File without changes
HybridTensor/modules/__pycache__/MLP.cpython-39.pyc ADDED
Binary file (5.3 kB). View file
 
HybridTensor/modules/__pycache__/ParallelMLP.cpython-39.pyc ADDED
Binary file (5.61 kB). View file
 
HybridTensor/modules/__pycache__/SelectiveBlock.cpython-39.pyc ADDED
Binary file (10.6 kB). View file
 
HybridTensor/modules/__pycache__/SelectiveBlock_v1.cpython-310.pyc ADDED
Binary file (6.56 kB). View file
 
HybridTensor/modules/__pycache__/SelectiveBlock_v1.cpython-39.pyc ADDED
Binary file (6.77 kB). View file
 
HybridTensor/modules/__pycache__/SelectiveMHA.cpython-310.pyc ADDED
Binary file (23.3 kB). View file
 
HybridTensor/modules/__pycache__/SelectiveMHA.cpython-39.pyc ADDED
Binary file (30.1 kB). View file
 
HybridTensor/modules/__pycache__/SelectiveMLP.cpython-310.pyc ADDED
Binary file (11.8 kB). View file
 
HybridTensor/modules/__pycache__/SelectiveMLP.cpython-39.pyc ADDED
Binary file (14.1 kB). View file
 
HybridTensor/modules/__pycache__/SelectiveRouters.cpython-310.pyc ADDED
Binary file (3.96 kB). View file
 
HybridTensor/modules/__pycache__/SelectiveRouters.cpython-39.pyc ADDED
Binary file (4 kB). View file
 
HybridTensor/modules/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (198 Bytes). View file