# Kernels
# activation/benchmarks/benchmark_profiler.yaml
# Last commit 7d51e61 (wyldecat):
#   feat: replace triton do_bench with torch.profiler for kernel timing
---
# TrainJob that rebuilds the `_activation` extension from the shared /mair
# volume and then runs the grouped_mul_poly bf16 benchmark (run_cases.py),
# writing logs and results under benchmarks/results/<timestamp>/.
apiVersion: trainer.kubeflow.org/v1alpha1
kind: TrainJob
metadata:
  name: jeesoo-grouped-polynorm-profiler-bench
  namespace: kbm-g-np-motif
spec:
  managedBy: trainer.kubeflow.org/trainjob-controller
  podTemplateOverrides:
    - spec:
        containers:
          - name: node
            volumeMounts:
              - mountPath: /dev/shm
                name: shm
              - mountPath: /mair
                name: mair
        volumes:
          # Memory-backed /dev/shm, capped at 64Gi.
          - emptyDir:
              medium: Memory
              sizeLimit: 64Gi
            name: shm
          # Shared PVC holding the source tree and benchmark results.
          - name: mair
            persistentVolumeClaim:
              claimName: mair
      targetJobs:
        - name: node
  runtimeRef:
    apiGroup: trainer.kubeflow.org
    kind: ClusterTrainingRuntime
    name: torch-distributed
  suspend: false
  trainer:
    args:
      - /bin/bash
      - '-c'
      # Literal block scalar (`|`): every script line must stay on its own
      # line. A folded scalar (`>`) would join consecutive lines with spaces
      # and turn the whole script into a single command.
      #
      # NOTE(review): the script exits 0 both when the build check fails and
      # at the end, so the container's exit status never reflects a failure;
      # check the pod log / build.log instead. Confirm this is intentional.
      - |
        ACTIVATIONPATH=/mair/team-sys/jeesoo/activation
        BUILDLOG=$ACTIVATIONPATH/benchmarks/results/build.log
        mkdir -p $ACTIVATIONPATH/benchmarks/results
        pip install triton matplotlib pandas
        echo "=== Building with setup.py ==="
        cd $ACTIVATIONPATH
        rm -f $ACTIVATIONPATH/_activation*.so
        pip install --no-build-isolation -e . -v 2>&1 | tee $BUILDLOG | tail -200
        python -c "import _activation; print('Build OK:', _activation)" || { echo "BUILD FAILED"; exit 0; }
        echo "=== Build success. Running profiler benchmarks ==="
        cd $ACTIVATIONPATH/benchmarks
        DATESTAMP=$(date +'%y_%m_%d_%H_%M')
        SAVE_PATH=$ACTIVATIONPATH/benchmarks/results/${DATESTAMP}
        mkdir -p $SAVE_PATH/bench/grouped_mul_poly/bf16
        nvidia-smi | tee $SAVE_PATH/nvidia_smi.txt
        python -c "import torch; x=torch.randn(8192,1280,device='cuda',dtype=torch.bfloat16); [torch.mm(x.T,x) for _ in range(100)]; torch.cuda.synchronize(); print('warmup done')"
        echo "=== Benchmark (torch.profiler) ==="
        python run_cases.py --case grouped_mul_poly --dtype bf16 --save-path ${SAVE_PATH}/bench 2>&1 | tee ${SAVE_PATH}/bench_log.txt
        echo "=== Done ==="
        exit 0;
    env:
      - name: PYTHONUNBUFFERED
        value: '1'
      - name: PYTORCH_ALLOC_CONF
        value: 'expandable_segments:True'
      - name: CUDA_LAUNCH_BLOCKING
        value: '0'
      - name: OMP_NUM_THREADS
        value: '1'
      - name: HF_HOME
        value: /mair/llm-dataset/hf_cache
    image: ghcr.io/motiftechnologies/llm-training:v0.1.3
    # Single node, single process, single GPU.
    numNodes: 1
    numProcPerNode: 1
    resourcesPerNode:
      limits:
        cpu: '16'
        memory: 128Gi
        nvidia.com/gpu: '1'
      requests:
        cpu: '16'
        memory: 128Gi
        nvidia.com/gpu: '1'