#!/bin/bash
#
# Wrapper that runs `resplit_data_v3.py all` from the repository root. Every
# option is taken from an environment variable so the same script can be used
# for full and pilot runs without editing it.
#
# Environment overrides (default in parentheses):
#   REPO_ROOT           directory to cd into before running
#                       (parent of the directory containing this script)
#   PYTHON_BIN          Python interpreter (python)
#   MMSEQS_BIN          passed as --mmseqs
#                       (${REPO_ROOT}/MMseqs2/build/bin/mmseqs)
#   MODE                "full" or "pilot" (full); see the pilot section below
#   THREADS             passed as --threads (SLURM_CPUS_PER_TASK, else 16)
#   SPLIT_MEMORY_LIMIT  passed as --split-memory-limit (180G); set it to the
#                       empty string to omit the flag entirely
#   OVERWRITE           "1" appends --overwrite (1)
#   INPUT_GLOB, HELDOUT_GLOB, OUTPUT_ROOT, SEQ_SPACE, SPECIES_KEY_MODE,
#   MAX_INPUT_SEQ_LEN, LIMIT_FILES, NUM_SHARDS, VAL_FRAC, MIN_SEQ_ID, COVERAGE,
#   COV_MODE, CLUSTER_MODE, MAX_SEQ_LEN, SEED
#                       forwarded verbatim; the CMD array below shows which
#                       flag each one feeds.

set -euo pipefail

# Resolve the repository root relative to this file so the script works no
# matter which directory it is launched from. REPO_ROOT can still be forced
# from the environment.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
DEFAULT_REPO_ROOT=$(cd "${SCRIPT_DIR}/.." && pwd)
REPO_ROOT=${REPO_ROOT:-${DEFAULT_REPO_ROOT}}

PYTHON_BIN=${PYTHON_BIN:-python}
MMSEQS_BIN=${MMSEQS_BIN:-$REPO_ROOT/MMseqs2/build/bin/mmseqs}
INPUT_GLOB=${INPUT_GLOB:-data_v2/*/*.parquet}
HELDOUT_GLOB=${HELDOUT_GLOB:-data_v2/test/*.parquet}
SEQ_SPACE=${SEQ_SPACE:-protein}
SPECIES_KEY_MODE=${SPECIES_KEY_MODE:-binomial}
MAX_INPUT_SEQ_LEN=${MAX_INPUT_SEQ_LEN:-20000}
MODE=${MODE:-full}
OUTPUT_ROOT=${OUTPUT_ROOT:-data_v3_rebuild}
LIMIT_FILES=${LIMIT_FILES:-0}
NUM_SHARDS=${NUM_SHARDS:-256}
VAL_FRAC=${VAL_FRAC:-0.01}
THREADS=${THREADS:-${SLURM_CPUS_PER_TASK:-16}}
MIN_SEQ_ID=${MIN_SEQ_ID:-0.90}
COVERAGE=${COVERAGE:-0.80}
COV_MODE=${COV_MODE:-2}
CLUSTER_MODE=${CLUSTER_MODE:-2}
MAX_SEQ_LEN=${MAX_SEQ_LEN:-200000}
# `-` rather than `:-`: the default applies only when the variable is unset, so
# an explicitly empty SPLIT_MEMORY_LIMIT reaches the -n check further down and
# drops the flag. With `:-` that check could never be false.
SPLIT_MEMORY_LIMIT=${SPLIT_MEMORY_LIMIT-180G}
SEED=${SEED:-13}
OVERWRITE=${OVERWRITE:-1}

# Pilot mode shrinks the run. Each setting is swapped only while it still holds
# the full-mode default, so any other explicit value is kept.
# NOTE: an explicit value that equals the full-mode default (for example
# LIMIT_FILES=0) is indistinguishable from "not provided" and is swapped too.
case "${MODE}" in
  full) ;;
  pilot)
    if [[ "${LIMIT_FILES}" == "0" ]]; then
      LIMIT_FILES=4
    fi
    if [[ "${OUTPUT_ROOT}" == "data_v3_rebuild" ]]; then
      OUTPUT_ROOT=data_v3_pilot
    fi
    if [[ "${NUM_SHARDS}" == "256" ]]; then
      NUM_SHARDS=16
    fi
    ;;
  *)
    # Only "pilot" changes anything; surface typos instead of silently doing a
    # non-pilot run.
    printf 'warning: unrecognized MODE=%q; no pilot overrides applied\n' \
      "${MODE}" >&2
    ;;
esac

# Relative globs and the resplit_data_v3.py path below are resolved from here.
cd "${REPO_ROOT}"

# Prepend the conda library directories. The existing value is appended only
# when it is non-empty: a trailing ":" would add an empty entry, which the
# dynamic loader treats as the current working directory.
export LD_LIBRARY_PATH="/beacon-projects/codon-lm/miniconda3/envs/dna/lib:/beacon-projects/codon-lm/miniconda3/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

# If the mmseqs file exists but is not executable by this user, add the owner
# execute bit.
if [[ -f "${MMSEQS_BIN}" && ! -x "${MMSEQS_BIN}" ]]; then
  chmod u+x "${MMSEQS_BIN}"
fi

# Preflight: import duckdb and pyarrow and print their versions. Under `set -e`
# a failed import stops the script before the main command starts.
"${PYTHON_BIN}" - <<'PY'
import duckdb
import pyarrow

print("duckdb", duckdb.__version__)
print("pyarrow", pyarrow.__version__)
PY

# Build the command as an array so every value stays a single argument
# regardless of spaces or glob characters in it.
CMD=(
  "${PYTHON_BIN}" resplit_data_v3.py all
  --input-glob "${INPUT_GLOB}"
  --heldout-test-glob "${HELDOUT_GLOB}"
  --output-root "${OUTPUT_ROOT}"
  --seq-space "${SEQ_SPACE}"
  --species-key-mode "${SPECIES_KEY_MODE}"
  --max-input-seq-len "${MAX_INPUT_SEQ_LEN}"
  --threads "${THREADS}"
  --limit-files "${LIMIT_FILES}"
  --num-shards "${NUM_SHARDS}"
  --mmseqs "${MMSEQS_BIN}"
  --min-seq-id "${MIN_SEQ_ID}"
  --coverage "${COVERAGE}"
  --cov-mode "${COV_MODE}"
  --cluster-mode "${CLUSTER_MODE}"
  --max-seq-len "${MAX_SEQ_LEN}"
  --val-frac "${VAL_FRAC}"
  --seed "${SEED}"
)

# Optional flags.
if [[ -n "${SPLIT_MEMORY_LIMIT}" ]]; then
  CMD+=(--split-memory-limit "${SPLIT_MEMORY_LIMIT}")
fi

if [[ "${OVERWRITE}" == "1" ]]; then
  CMD+=(--overwrite)
fi

# Log the exact command in shell-quoted form (%q) so it can be copied and
# re-run, then execute it.
printf 'Running command:'
printf ' %q' "${CMD[@]}"
printf '\n'
"${CMD[@]}"