import sys

import torch
import torch.nn as nn
import triton
import triton.language as tl

sys.path.append("/models/blitz/crates/blitz-kernels/src/cuda")

# Elements handled by one Triton program. Must be a power of two because it is
# used as the extent of ``tl.arange`` inside the kernel.
_BLOCK_SIZE = 1024


@triton.jit
def blitz_speed_kernel(X, Y, N, BLOCK_SIZE: tl.constexpr):
    """Cast ``N`` elements of ``X`` to fp8 (``tl.float8e4nv``) and store the raw bytes in ``Y``.

    Each program instance handles one ``BLOCK_SIZE``-wide slice of the flat
    index space; lanes at or beyond ``N`` are masked out of both the load and
    the store.

    Args:
        X: Pointer to the input elements, addressed linearly.
        Y: Pointer to the int8 output buffer, addressed linearly.
        N: Number of elements to convert.
        BLOCK_SIZE: Compile-time number of elements per program instance.
    """
    pid = tl.program_id(0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < N
    x = tl.load(X + offsets, mask=mask)
    y = x.to(tl.float8e4nv)
    # Reinterpret the fp8 bit pattern as int8 (no numeric conversion) so it can
    # be written into an int8 buffer.
    tl.store(Y + offsets, y.to(tl.int8, bitcast=True), mask=mask)


class ModelNew(nn.Module):
    """Module that returns the fp8 byte code of every input element as float32."""

    def __init__(self):
        super().__init__()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Convert ``x`` to fp8 on the GPU and return the byte codes as float32.

        Args:
            x: Input tensor of any shape. It is expected to live on a CUDA
                device, since it is handed to a Triton kernel.

        Returns:
            A float32 tensor with the same shape and device as ``x``. Each
            value is the unsigned 8-bit code (0..255) of the corresponding
            fp8 result, not the decoded fp8 value.
        """
        n = x.numel()
        if n == 0:
            # Nothing to launch; a zero-sized grid/block is not a valid launch.
            return torch.empty(x.shape, device=x.device, dtype=torch.float32)

        # The kernel indexes memory linearly, so the logical element order must
        # match the memory order. This is a no-op for contiguous inputs.
        src = x.contiguous()
        # Allocate next to the input rather than on the default "cuda" device.
        y = torch.empty(src.shape, device=src.device, dtype=torch.int8)

        # Tile the flat index space with fixed-size blocks instead of a single
        # program whose block size equals numel: that form only compiles for
        # power-of-two sizes, recompiles for every new size and runs on a
        # single program instance.
        grid = (triton.cdiv(n, _BLOCK_SIZE),)
        blitz_speed_kernel[grid](src, y, n, BLOCK_SIZE=_BLOCK_SIZE)

        # Reinterpret the stored bytes as unsigned before widening to float32.
        return y.view(torch.uint8).to(torch.float32)


def get_inputs() -> list:
    """Return the positional inputs for ``ModelNew.forward``: one 8192-element CUDA tensor."""
    return [torch.randn(8192, device="cuda")]


def get_init_inputs() -> list:
    """Return the positional arguments for ``ModelNew.__init__`` (none)."""
    return []