activation / benchmarks / benchmark_profiler.yaml

feat: replace triton do_bench with torch.profiler for kernel timing

7d51e61 about 2 months ago

2.67 kB

	# Kubeflow Trainer TrainJob: builds the `_activation` extension from the shared
	# /mair volume, then runs the `grouped_mul_poly` bf16 benchmark case and stores
	# the results under benchmarks/results/<timestamp>/.
	apiVersion: trainer.kubeflow.org/v1alpha1
	kind: TrainJob
	metadata:
	  name: jeesoo-grouped-polynorm-profiler-bench
	  namespace: kbm-g-np-motif
	spec:
	  managedBy: trainer.kubeflow.org/trainjob-controller
	  podTemplateOverrides:
	    # Applies to the `node` job only (see targetJobs below).
	    - spec:
	        containers:
	          - name: node
	            volumeMounts:
	              - mountPath: /dev/shm
	                name: shm
	              - mountPath: /mair
	                name: mair
	        volumes:
	          # Memory-backed emptyDir mounted at /dev/shm, capped at 64Gi.
	          - emptyDir:
	              medium: Memory
	              sizeLimit: 64Gi
	            name: shm
	          # Persistent volume holding the source tree and benchmark results.
	          - name: mair
	            persistentVolumeClaim:
	              claimName: mair
	      targetJobs:
	        - name: node
	  runtimeRef:
	    apiGroup: trainer.kubeflow.org
	    kind: ClusterTrainingRuntime
	    name: torch-distributed
	  suspend: false
	  trainer:
	    args:
	      - /bin/bash
	      - '-c'
	      # Script steps, in order:
	      #   1. install Python dependencies and rebuild the extension in editable
	      #      mode, teeing the verbose build output to $BUILDLOG;
	      #   2. verify the build by importing `_activation`;
	      #   3. record `nvidia-smi` output and run a 100-iteration bf16 matmul
	      #      warm-up on the GPU;
	      #   4. run `run_cases.py` and tee its output to bench_log.txt.
	      # A literal block scalar (`|`) is used so every line break below is a
	      # real newline in the script.
	      # NOTE(review): the build-failure branch and the last line both
	      # `exit 0`, and the benchmark's status is hidden behind `tee`, so the
	      # pod's exit code never reflects a failure - confirm this is intended.
	      - |
	        ACTIVATIONPATH=/mair/team-sys/jeesoo/activation
	        BUILDLOG=$ACTIVATIONPATH/benchmarks/results/build.log
	        mkdir -p $ACTIVATIONPATH/benchmarks/results
	        pip install triton matplotlib pandas
	        echo "=== Building with setup.py ==="
	        cd $ACTIVATIONPATH
	        rm -f $ACTIVATIONPATH/_activation*.so
	        pip install --no-build-isolation -e . -v 2>&1 | tee $BUILDLOG | tail -200
	        python -c "import _activation; print('Build OK:', _activation)" || { echo "BUILD FAILED"; exit 0; }
	        echo "=== Build success. Running profiler benchmarks ==="
	        cd $ACTIVATIONPATH/benchmarks
	        DATESTAMP=$(date +'%y_%m_%d_%H_%M')
	        SAVE_PATH=$ACTIVATIONPATH/benchmarks/results/${DATESTAMP}
	        mkdir -p $SAVE_PATH/bench/grouped_mul_poly/bf16
	        nvidia-smi | tee $SAVE_PATH/nvidia_smi.txt
	        python -c "import torch; x=torch.randn(8192,1280,device='cuda',dtype=torch.bfloat16); [torch.mm(x.T,x) for _ in range(100)]; torch.cuda.synchronize(); print('warmup done')"
	        echo "=== Benchmark (torch.profiler) ==="
	        python run_cases.py --case grouped_mul_poly --dtype bf16 --save-path ${SAVE_PATH}/bench 2>&1 | tee ${SAVE_PATH}/bench_log.txt
	        echo "=== Done ==="
	        exit 0;
	    env:
	      # Environment values are quoted so they stay strings.
	      - name: PYTHONUNBUFFERED
	        value: '1'
	      # NOTE(review): older PyTorch releases read PYTORCH_CUDA_ALLOC_CONF;
	      # confirm the PyTorch version in the image honors this variable name.
	      - name: PYTORCH_ALLOC_CONF
	        value: 'expandable_segments:True'
	      - name: CUDA_LAUNCH_BLOCKING
	        value: '0'
	      - name: OMP_NUM_THREADS
	        value: '1'
	      - name: HF_HOME
	        value: /mair/llm-dataset/hf_cache
	    image: ghcr.io/motiftechnologies/llm-training:v0.1.3
	    numNodes: 1
	    numProcPerNode: 1
	    # Requests equal limits for CPU, memory, and GPU.
	    resourcesPerNode:
	      limits:
	        cpu: '16'
	        memory: 128Gi
	        nvidia.com/gpu: '1'
	      requests:
	        cpu: '16'
	        memory: 128Gi
	        nvidia.com/gpu: '1'