#!/usr/bin/env bash
# Build libtrit_gemv.so — standalone CUDA kernel library.
# No PyTorch, no Python, no framework dependency.
# Just nvcc + CUDA runtime.
#
# Fat binary: compiles for all major GPU architectures.
# The right kernel is selected at runtime based on the GPU.
set -euo pipefail

# Run from the script's own directory so relative paths resolve.
cd "$(dirname "$0")"

# Detect nvcc: prefer whatever is on PATH, fall back to the default
# CUDA toolkit install location.
NVCC=$(command -v nvcc || echo "/usr/local/cuda/bin/nvcc")
if [[ ! -x "$NVCC" ]]; then
  echo "ERROR: nvcc not found. Install CUDA toolkit." >&2
  exit 1
fi

echo "Using nvcc: $NVCC"
"$NVCC" --version | head -1

# Architecture targets (fat binary).
# An array keeps each -gencode flag a separate argument without relying
# on unquoted word-splitting.
ARCHS=(
  -gencode=arch=compute_70,code=sm_70    # Volta  (V100)
  -gencode=arch=compute_75,code=sm_75    # Turing (2080)
  -gencode=arch=compute_80,code=sm_80    # Ampere (A100, 3080)
  -gencode=arch=compute_86,code=sm_86    # Ampere (3090)
  -gencode=arch=compute_89,code=sm_89    # Ada    (4080, 4090)
  -gencode=arch=compute_90,code=sm_90    # Hopper (H100)
)

# Blackwell — only if this nvcc supports it (CUDA 12.8+).
# Capture the help text first: piping straight into `grep -q` can
# SIGPIPE nvcc and, under pipefail, make the check fail spuriously.
nvcc_help=$("$NVCC" --help 2>&1 || true)
if grep -q "compute_120" <<<"$nvcc_help"; then
  ARCHS+=(-gencode=arch=compute_120,code=sm_120)
  echo "Including Blackwell (sm_120)"
fi

echo "Building libtrit_gemv.so..."
"$NVCC" -O3 --use_fast_math \
  -shared -Xcompiler -fPIC \
  "${ARCHS[@]}" \
  -o libtrit_gemv.so \
  trit_gemv_standalone.cu

ls -la libtrit_gemv.so
echo "Done! Library ready at $(pwd)/libtrit_gemv.so"
echo ""
echo "Usage from Python:"
echo "  from trit_gemv_lib import TritGEMV"
echo "  lib = TritGEMV()"
echo "  lib.gemv_d2(weights, scales, x_int8, x_scales, output, K, M, ng)"