File size: 1,641 Bytes
c6dfc69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/bin/bash
#
# Starts ref-avs training as a background job (via nohup) with the fixed
# hyper-parameters defined below. The only optional argument is the GPU count.

set -euo pipefail

# Fixed hyper-parameters and runtime defaults.
OMP_THREADS=8
DEFAULT_LR=1e-4
DEFAULT_EPOCHS=50
DEFAULT_GPUS=4

# Resolve the absolute directory of this script and work from there, so that
# relative paths used later resolve next to the script regardless of the
# caller's working directory.
SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
SCRIPT_DIR="$(cd "${SCRIPT_DIR}" && pwd)"
cd "${SCRIPT_DIR}"

# The repository root is the parent of the script directory; the training
# code lives in its ref-avs.code subdirectory.
REPO_ROOT="$(cd .. && pwd)"
CODE_DIR="${REPO_ROOT}/ref-avs.code"

#######################################
# Prints a small ASCII table of the training hyper-parameters.
# Globals:   DEFAULT_EPOCHS, DEFAULT_LR, DEFAULT_GPUS (read)
# Arguments: none
# Outputs:   seven table lines on stdout
#######################################
print_table() {
  local border="+-------------+----------------+"

  printf '%s\n' \
    "${border}" \
    "| hyper-param |   ref-avs      |" \
    "${border}"
  # printf re-applies the format to each (name, value) pair, one row per pair.
  printf '| %-11s | %-14s |\n' \
    "epoch" "${DEFAULT_EPOCHS}" \
    "lr" "${DEFAULT_LR}" \
    "gpus(def)" "${DEFAULT_GPUS}"
  printf '%s\n' "${border}"
}

#######################################
# Prints the command-line synopsis and two example invocations.
# Arguments: none (reads $0 for the script name)
# Outputs:   three lines on stdout
#######################################
usage() {
  printf '%s\n' \
    "Usage: $0 [gpus]" \
    "Example: $0" \
    "Example: $0 8"
}

# Accept at most one positional argument (the GPU count). Diagnostics go to
# stderr so they are not mixed into stdout when the script is piped.
if (( $# > 1 )); then
  usage >&2
  print_table >&2
  exit 1
fi

# Fall back to the default GPU count when no (or an empty) argument is given.
GPUS="${1:-${DEFAULT_GPUS}}"

# Validate by pattern only: digits with at least one non-zero digit. An
# arithmetic comparison is deliberately avoided because bash treats numbers
# with a leading zero as octal, so inputs such as "08" would raise an
# arithmetic error instead of being checked. Leading zeros are stripped so
# the value handed to the training command is plain decimal.
if [[ "${GPUS}" =~ ^0*([1-9][0-9]*)$ ]]; then
  GPUS="${BASH_REMATCH[1]}"
else
  echo "Error: gpus must be a positive integer, got: ${GPUS}" >&2
  exit 1
fi

# Refuse to continue when the training entry point is missing.
if [[ ! -f "${CODE_DIR}/main.py" ]]; then
  echo "Error: training entry not found: ${CODE_DIR}/main.py" >&2
  exit 1
fi

# Exported so the training process inherits the thread limit.
export OMP_NUM_THREADS="${OMP_THREADS}"

# Relative path: the log is written to the current working directory.
LOG_FILE="train_ref_avs.log"

# The command is kept as an array so each argument stays a single word even if
# a path contains spaces.
CMD=(
  python3 "${CODE_DIR}/main.py"
  --epochs="${DEFAULT_EPOCHS}"
  --gpus="${GPUS}"
  --lr="${DEFAULT_LR}"
)

# Check the interpreter up front: a missing executable would otherwise only
# fail inside the background job, after this script has already reported
# success.
if ! command -v "${CMD[0]}" > /dev/null 2>&1; then
  echo "Error: ${CMD[0]} not found in PATH" >&2
  exit 1
fi

# Summarize the job before launching it.
echo "Training job is about to start:"
echo "  dataset: ref-avs (REFAVS)"
echo "  code:    ${CODE_DIR}/main.py"
echo "  epochs:  ${DEFAULT_EPOCHS}"
echo "  lr:      ${DEFAULT_LR}"
echo "  gpus:    ${GPUS}"
echo "  log:     ${SCRIPT_DIR}/${LOG_FILE}"
echo
print_table
echo
echo "Command: nohup ${CMD[*]} > ${LOG_FILE} 2>&1 &"

# Run detached from the terminal; stdout and stderr both go to the log file,
# which is truncated on every launch.
nohup "${CMD[@]}" > "${LOG_FILE}" 2>&1 &
echo "Training started in background, PID: $!"