CodonTranslator / slurm /rebuild_data_v3_cpu.sbatch
alegendaryfish's picture
Refine CodonTranslator docs and public scripts
75c84f0 verified
#!/bin/bash
#
# Slurm batch job that runs `resplit_data_v3.py all` from the repository root
# on a single CPU node.
#
# Every setting assigned after the directives (REPO_ROOT, PYTHON_BIN, MODE,
# INPUT_GLOB, ...) takes its value from an environment variable of the same
# name when one is set, and falls back to the default written in this file.
#
# Scheduler placement: partition and quality-of-service level.
#SBATCH --partition=beacon
#SBATCH --qos=high
# Resource shape: one task on one node, with 16 CPUs and 240G of memory.
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=16
#SBATCH --mem=240G
# Wall-clock limit in days-hours:minutes:seconds (three days).
#SBATCH --time=3-00:00:00
#SBATCH --job-name=data_v3_rebuild
# In the log file names, %x expands to the job name and %j to the job ID.
#SBATCH --output=%x_%j.out
#SBATCH --error=%x_%j.err
# Abort on command failure, on use of an unset variable, and on a failure in
# any stage of a pipeline.
set -euo pipefail

#######################################
# Work out the default repository root.
#
# When the job is started through sbatch, Slurm runs a copy of this script
# from its spool directory, so the directory of BASH_SOURCE[0] is no longer
# inside the repository. The first candidate that contains the marker file
# wins: the parent of the script directory (direct `bash` invocation), then
# the sbatch submission directory, then the parent of the submission
# directory (job submitted from inside slurm/). If no candidate contains the
# marker, the parent of the script directory is returned, which is the value
# this script always used before.
#
# Globals:   SLURM_SUBMIT_DIR (read, optional)
# Arguments: $1 - directory that holds the running script
#            $2 - file expected at the repository root
#                 (default: resplit_data_v3.py)
# Outputs:   absolute path of the chosen directory on stdout
# Returns:   non-zero when the chosen directory cannot be entered
#######################################
resolve_default_repo_root() {
  local script_dir=$1
  local marker=${2:-resplit_data_v3.py}
  local -a candidates=("${script_dir}/..")
  local candidate

  if [[ -n "${SLURM_SUBMIT_DIR:-}" ]]; then
    candidates+=("${SLURM_SUBMIT_DIR}" "${SLURM_SUBMIT_DIR}/..")
  fi

  for candidate in "${candidates[@]}"; do
    if [[ -f "${candidate}/${marker}" ]]; then
      (cd "${candidate}" && pwd)
      return
    fi
  done

  # Nothing matched: keep the historical script-relative default.
  (cd "${script_dir}/.." && pwd)
}

# --- Locations -------------------------------------------------------------
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
DEFAULT_REPO_ROOT=$(resolve_default_repo_root "${SCRIPT_DIR}")
# An explicit REPO_ROOT from the environment always takes precedence.
REPO_ROOT=${REPO_ROOT:-${DEFAULT_REPO_ROOT}}
PYTHON_BIN=${PYTHON_BIN:-python}
MMSEQS_BIN=${MMSEQS_BIN:-$REPO_ROOT/MMseqs2/build/bin/mmseqs}

# --- Inputs and outputs ----------------------------------------------------
# The job changes into REPO_ROOT before running, so relative values here are
# taken relative to that directory.
INPUT_GLOB=${INPUT_GLOB:-data_v2/*/*.parquet}
HELDOUT_GLOB=${HELDOUT_GLOB:-data_v2/test/*.parquet}
SEQ_SPACE=${SEQ_SPACE:-protein}
SPECIES_KEY_MODE=${SPECIES_KEY_MODE:-binomial}
MAX_INPUT_SEQ_LEN=${MAX_INPUT_SEQ_LEN:-20000}
# MODE=pilot swaps the untouched defaults of LIMIT_FILES, OUTPUT_ROOT and
# NUM_SHARDS for smaller pilot values further down in this script.
MODE=${MODE:-full}
OUTPUT_ROOT=${OUTPUT_ROOT:-data_v3_rebuild}
LIMIT_FILES=${LIMIT_FILES:-0}
NUM_SHARDS=${NUM_SHARDS:-256}
VAL_FRAC=${VAL_FRAC:-0.01}
# Follow the Slurm CPU allocation when there is one; otherwise use 16.
THREADS=${THREADS:-${SLURM_CPUS_PER_TASK:-16}}

# --- Values forwarded to the matching resplit_data_v3.py options -----------
MIN_SEQ_ID=${MIN_SEQ_ID:-0.90}
COVERAGE=${COVERAGE:-0.80}
COV_MODE=${COV_MODE:-2}
CLUSTER_MODE=${CLUSTER_MODE:-2}
MAX_SEQ_LEN=${MAX_SEQ_LEN:-200000}
# Forwarded as --split-memory-limit only while it is non-empty. Because the
# default uses ':-', an empty value from the environment is replaced by 180G.
SPLIT_MEMORY_LIMIT=${SPLIT_MEMORY_LIMIT:-180G}
SEED=${SEED:-13}
# The value 1 adds --overwrite to the command; any other value leaves it off.
OVERWRITE=${OVERWRITE:-1}
#######################################
# Swap a setting for its pilot value, but only while the setting still holds
# the stock default; any other value is left untouched.
# Arguments: $1 - name of the variable to inspect and possibly rewrite
#            $2 - stock default to compare against (literal string match)
#            $3 - value to assign when the stock default is still in place
#######################################
apply_pilot_default() {
  local setting_name=$1
  local stock_value=$2
  local pilot_value=$3
  if [[ "${!setting_name}" == "${stock_value}" ]]; then
    printf -v "${setting_name}" '%s' "${pilot_value}"
  fi
}

# Pilot runs read fewer files, write to their own output directory and use
# fewer shards.
case "${MODE}" in
  pilot)
    apply_pilot_default LIMIT_FILES 0 4
    apply_pilot_default OUTPUT_ROOT data_v3_rebuild data_v3_pilot
    apply_pilot_default NUM_SHARDS 256 16
    ;;
esac
# resplit_data_v3.py is invoked by relative name further down, so work from
# the repository root.
cd "${REPO_ROOT}"

# Put the project's conda library directories ahead of whatever library search
# path was inherited (an unset LD_LIBRARY_PATH contributes an empty entry).
conda_lib_path="/beacon-projects/codon-lm/miniconda3/envs/dna/lib:/beacon-projects/codon-lm/miniconda3/lib"
export LD_LIBRARY_PATH="${conda_lib_path}:${LD_LIBRARY_PATH:-}"

# If MMSEQS_BIN is a regular file without the execute bit, grant it to the
# owner. Any other value is left alone.
if [[ -f "${MMSEQS_BIN}" ]]; then
  if [[ ! -x "${MMSEQS_BIN}" ]]; then
    chmod u+x "${MMSEQS_BIN}"
  fi
fi

# Preflight: fail before the main command if duckdb or pyarrow cannot be
# imported, and record their versions in the job log.
"${PYTHON_BIN}" - <<'PYCHECK'
import duckdb
import pyarrow

for label, module in (("duckdb", duckdb), ("pyarrow", pyarrow)):
    print(label, module.__version__)
PYCHECK
# Assemble the resplit_data_v3.py invocation as an array so that every value
# stays a single argument, whatever characters it contains.
CMD=("${PYTHON_BIN}" resplit_data_v3.py all)

# Inputs, output location and sequence handling.
CMD+=(--input-glob "${INPUT_GLOB}")
CMD+=(--heldout-test-glob "${HELDOUT_GLOB}")
CMD+=(--output-root "${OUTPUT_ROOT}")
CMD+=(--seq-space "${SEQ_SPACE}")
CMD+=(--species-key-mode "${SPECIES_KEY_MODE}")
CMD+=(--max-input-seq-len "${MAX_INPUT_SEQ_LEN}")

# Parallelism and sharding.
CMD+=(--threads "${THREADS}")
CMD+=(--limit-files "${LIMIT_FILES}")
CMD+=(--num-shards "${NUM_SHARDS}")

# MMseqs binary and the clustering values passed along with it.
CMD+=(--mmseqs "${MMSEQS_BIN}")
CMD+=(--min-seq-id "${MIN_SEQ_ID}")
CMD+=(--coverage "${COVERAGE}")
CMD+=(--cov-mode "${COV_MODE}")
CMD+=(--cluster-mode "${CLUSTER_MODE}")
CMD+=(--max-seq-len "${MAX_SEQ_LEN}")

# Split parameters.
CMD+=(--val-frac "${VAL_FRAC}")
CMD+=(--seed "${SEED}")

# Optional flags: the memory limit is passed only when non-empty, and
# --overwrite only when OVERWRITE is exactly 1.
[[ -z "${SPLIT_MEMORY_LIMIT}" ]] || CMD+=(--split-memory-limit "${SPLIT_MEMORY_LIMIT}")
case "${OVERWRITE}" in
  1) CMD+=(--overwrite) ;;
esac

# Log the command in shell-quoted form so it can be pasted back into a
# terminal, then run it.
printf -v rendered_cmd ' %q' "${CMD[@]}"
printf 'Running command:%s\n' "${rendered_cmd}"
"${CMD[@]}"