---
# TrainJob that builds the `_activation` extension from the shared /mair
# volume and then runs the grouped_mul_poly bf16 profiler benchmark on a
# single node with one process and one GPU.
apiVersion: trainer.kubeflow.org/v1alpha1
kind: TrainJob
metadata:
  name: jeesoo-grouped-polynorm-profiler-bench
  namespace: kbm-g-np-motif
spec:
  managedBy: trainer.kubeflow.org/trainjob-controller
  podTemplateOverrides:
    # Applied to the `node` job: adds a memory-backed /dev/shm and the
    # `mair` PVC mounted at /mair.
    - spec:
        containers:
          - name: node
            volumeMounts:
              - mountPath: /dev/shm
                name: shm
              - mountPath: /mair
                name: mair
        volumes:
          - emptyDir:
              medium: Memory
              sizeLimit: 64Gi
            name: shm
          - name: mair
            persistentVolumeClaim:
              claimName: mair
      targetJobs:
        - name: node
  runtimeRef:
    apiGroup: trainer.kubeflow.org
    kind: ClusterTrainingRuntime
    name: torch-distributed
  suspend: false
  trainer:
    args:
      - /bin/bash
      - '-c'
      # Literal block scalar (`|`) so every command stays on its own line;
      # a folded scalar (`>`) would join adjacent lines with spaces.
      # NOTE(review): the script exits 0 even when the build fails --
      # presumably so the job is not reported as failed; confirm intent.
      - |
        ACTIVATIONPATH=/mair/team-sys/jeesoo/activation
        BUILDLOG="$ACTIVATIONPATH/benchmarks/results/build.log"

        pip install triton matplotlib pandas

        echo "=== Building with setup.py ==="
        cd "$ACTIVATIONPATH"
        # Drop stale extension binaries so the import check below can only
        # succeed against a fresh build.
        rm -f "$ACTIVATIONPATH"/_activation*.so
        # Make sure the log directory exists before tee writes into it.
        mkdir -p "$(dirname "$BUILDLOG")"
        pip install --no-build-isolation -e . -v 2>&1 | tee "$BUILDLOG" | tail -200
        python -c "import _activation; print('Build OK:', _activation)" || { echo "BUILD FAILED"; exit 0; }

        echo "=== Build success. Running profiler benchmarks ==="
        cd "$ACTIVATIONPATH/benchmarks"
        DATESTAMP=$(date +'%y_%m_%d_%H_%M')
        SAVE_PATH="$ACTIVATIONPATH/benchmarks/results/${DATESTAMP}"
        mkdir -p "$SAVE_PATH/bench/grouped_mul_poly/bf16"
        nvidia-smi | tee "$SAVE_PATH/nvidia_smi.txt"

        # Run 100 bf16 matmuls on the GPU before the benchmark starts.
        python -c "import torch; x=torch.randn(8192,1280,device='cuda',dtype=torch.bfloat16); [torch.mm(x.T,x) for _ in range(100)]; torch.cuda.synchronize(); print('warmup done')"

        echo "=== Benchmark (torch.profiler) ==="
        python run_cases.py --case grouped_mul_poly --dtype bf16 --save-path "${SAVE_PATH}/bench" 2>&1 | tee "${SAVE_PATH}/bench_log.txt"

        echo "=== Done ==="
        exit 0
    env:
      - name: PYTHONUNBUFFERED
        value: '1'
      - name: PYTORCH_ALLOC_CONF
        value: 'expandable_segments:True'
      - name: CUDA_LAUNCH_BLOCKING
        value: '0'
      - name: OMP_NUM_THREADS
        value: '1'
      - name: HF_HOME
        value: /mair/llm-dataset/hf_cache
    image: ghcr.io/motiftechnologies/llm-training:v0.1.3
    numNodes: 1
    numProcPerNode: 1
    resourcesPerNode:
      limits:
        cpu: '16'
        memory: 128Gi
        nvidia.com/gpu: '1'
      requests:
        cpu: '16'
        memory: 128Gi
        nvidia.com/gpu: '1'