{
  "sglang": {
    "llama": {
      "gemm|nvjet": "gemm",
      "fused_moe_kernel|GroupProblemShape|group_gemm_starts|bmm_|GemmUniversal": "moe_gemm",
      "moe|sigmoid": "moe",
      "CatArrayBatched|prepare_inputs": "prepare_next",
      "ncclDevKernel|cross_device_reduce": "nccl_and_custom_ar",
      "_norm_|Norm": "norm",
      "topk": "topk",
      "act_and_mul_": "activation",
      "Rotary": "rope",
      "SoftMax": "softmax",
      "flash|fmha": "attn",
      "elementwise": "elementwise",
      "fp8_quant|cvt_|quantize": "quantize",
      "reduce_kernel": "reduce",
      "triton": "triton_kernel",
      "CUDA mem": "non-gpu-H_D_memops",
      ".*": "misc"
    },
    "ds": {
      "block_fp8_matmul": "block_fp8_gemm",
      "gemm|matmul|nvjet": "gemm",
      "fused_moe_kernel": "moe_gemm",
      "moe|expert|sigmoid": "moe",
      "CatArrayBatched|write_req_to": "prepare_next",
      "ncclDevKernel|cross_device_reduce|all_gather": "nccl_and_custom_ar",
      "Norm": "norm",
      "topk": "topk",
      "activation|act_and_mul": "activation",
      "compute_position_kernel": "rope",
      "elementwise": "elementwise",
      "fp8_quant|quant_fp8|quantize": "quantize",
      "SoftMax": "softmax",
      "reduce": "reduce",
      "_fwd_|create_flash|::mla::|KVCache": "attn",
      "CUDA mem": "non-gpu-H_D_memops",
      ".*": "misc"
    },
    "gpt-oss": {
      "gemm|nvjet": "gemm",
      "fused_moe_kernel|_group_gemm|GroupProblemShape|GemmUniversal|bmm_|matmul_ogs_|_topk_forward|_combined_routing|_sum_bitmatrix_rows|_compute_writeback_idx": "moe_gemm",
      "moe|sigmoid": "moe",
      "CatArrayBatched|prepare_inputs": "prepare_next",
      "_norm_|Norm": "norm",
      "ncclDevKernel|cross_device_reduce|allreduce": "nccl_and_custom_ar",
      "topk|TopK": "topk",
      "act_and_mul_": "activation",
      "Rotary": "rope",
      "SoftMax": "softmax",
      "flash|fmha": "attn",
      "elementwise": "elementwise",
      "fp8_quant|cvt_|quantize": "quantize",
      "reduce_kernel": "reduce",
      "triton": "triton_kernel",
      "CUDA mem": "non-gpu-H_D_memops",
      ".*": "misc"
    }
  }
}
|