Spaces:
Build error
Build error
# Copyright (C) 2020 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions
# and limitations under the License.
# Kill all subprocesses and exit when ctrl-c is pressed.
#
# stop_all terminates the metric collectors (top and pcm, matched by process
# name) and the training job (any process whose command line matches
# "tools/train.py").
# Globals:   none
# Arguments: none
# Outputs:   writes "Stopping processes" to stdout
function stop_all() {
  local -a train_pids=()

  echo "Stopping processes"
  pkill top
  pkill pcm

  # pgrep -f matches against the full command line and never reports itself,
  # so no "grep -v grep" filtering is required.
  mapfile -t train_pids < <(pgrep -f tools/train.py)

  # Only signal when something matched: calling kill with an empty argument
  # is an error. Every match is signalled (not just the first one listed) so
  # that additional processes sharing the training command line, if any, are
  # stopped as well.
  if ((${#train_pids[@]} > 0)); then
    kill "${train_pids[@]}"
  fi
}

# Single quotes keep the handler text literal until the signal is delivered.
trap 'echo; echo Killing processes...; stop_all; exit 0' SIGINT SIGTERM
# Install PCM if it is not already present in ./pcm, then load the msr
# kernel module in either case.
if [[ -d "./pcm" ]]; then
  echo PCM installed
  modprobe msr
else
  if git clone https://github.com/opcm/pcm.git; then
    # Build inside a subshell: the script's own working directory is never
    # changed, so there is no pushd/popd pair to get out of balance.
    (
      cd pcm || exit 1
      # Do not attempt an install when the build itself failed.
      make && make install
    )
  else
    # Without this guard a failed clone would let make run in the current
    # directory instead of ./pcm.
    echo "git clone of PCM failed; skipping PCM build" >&2
  fi
  # -y answers apt's confirmation prompt so an unattended run cannot stall.
  apt install -y sysstat
  modprobe msr
fi
# Create a per-run output directory named after the current UTC time
# (year, abbreviated month, day, then hour and minute, e.g. 2020Jan31_1530)
# and keep its absolute path in rundir.
DATETIME=$(date -u +"%Y%b%d_%H%M")
rundir="$PWD/output/$DATETIME"
# Every log written later lands in rundir, so stop if it cannot be created.
mkdir -p -- "${rundir}" || exit 1
# Quoted so a path containing spaces or glob characters is printed verbatim.
echo "${rundir}"
# Run training in the background from the parent directory, forwarding the
# script's arguments and appending combined stdout/stderr to train.log in
# the run directory.
# Abort if the directory change fails; otherwise tools/train.py would be
# resolved relative to the wrong directory.
pushd .. || exit 1
# rundir is quoted so a path containing whitespace stays a single tee argument.
python tools/train.py "$@" 2>&1 | tee -a "${rundir}/train.log" &
popd || exit 1
# Wait 10 seconds before the collectors start (presumably to let training
# get going first).
sleep 10
# Collect system-level metrics into the run directory.
# The PID of each collector is recorded so that cleanup can stop exactly the
# processes started here instead of every top/pcm process on the machine.
top -b -i -o %CPU > "${rundir}/top.log" &
top_pid=$!
echo TOP STARTED
./pcm/pcm.x > "${rundir}/pcm.log" &
pcm_pid=$!
echo PCM STARTED

# Sample GPU utilization, GPU memory utilization and GPU temperature
# 60 times, sleeping one second between rounds.
echo NVIDIA LOOP STARTING
for ((i = 1; i <= 60; i++)); do
  nvidia-smi --query-gpu=utilization.gpu --format=csv >> "${rundir}/gpu_utilization.log"
  nvidia-smi --query-gpu=utilization.memory --format=csv >> "${rundir}/gpu_mem.log"
  nvidia-smi --query-gpu=temperature.gpu --format=csv >> "${rundir}/gpu_temp.log"
  sleep 1
done

# Cleanup when done.
# Errors are silenced on purpose: a collector that already exited (or never
# started) is not a failure at this point.
kill "${top_pid}" "${pcm_pid}" 2>/dev/null