deepgenopix / scripts /remote_train.sh
vedatonuryilmaz's picture
Update scripts/remote_train.sh from ml-intern branch
c5efd3d verified
#!/bin/bash
# Runs one DeepGenopix experiment preset on HF compute: fetches the code,
# installs dependencies, downloads and checks the dataset, then trains.
#
# Usage: ./remote_train.sh <preset_name>
set -euo pipefail

# The preset name is the only argument; "${1:-}" keeps `set -u` from aborting
# before the usage message can be shown when it is missing.
PRESET="${1:-}"
[ -n "$PRESET" ] || {
  echo "Usage: $0 <preset_name>"
  exit 1
}

# Fixed locations used by the steps below.
REPO_URL="https://github.com/vedatonuryilmaz/dev_genopix.git"
REPO_DIR="deepgenopix"
HF_DATASET="vedatonuryilmaz/te-seqdata-v1"
RAW_PARQUET="te_seqdata.parquet"

# Start-up banner; the trailing empty argument emits the blank separator line.
printf '%s\n' \
  "=== DeepGenopix Remote Training ===" \
  "Preset: $PRESET" \
  "Repo: $REPO_URL (branch: ml-intern)" \
  "Dataset: $HF_DATASET" \
  ""
# 1. Get the source tree for the ml-intern branch.
# A missing checkout is cloned shallowly (--depth 1); an existing directory is
# reused and updated in place with a pull from origin.
printf '%s\n' ">>> [1/5] Cloning DeepGenopix (ml-intern branch)..."
if [ ! -d "$REPO_DIR" ]; then
  git clone --branch ml-intern --depth 1 "$REPO_URL" "$REPO_DIR"
else
  printf '%s\n' "Repository already exists, pulling latest..."
  git -C "$REPO_DIR" pull origin ml-intern
fi
# 2. Install the project's dependencies with uv.
# The directory change is deliberate and persists: every later step uses paths
# relative to the repository root (e.g. data/raw).
printf '%s\n' ">>> [2/5] Syncing dependencies with uv..."
cd "$REPO_DIR"
uv sync
# 3. Download parquet from HF Hub
#
# The file is written directly into data/raw through hf_hub_download's
# local_dir argument. The path hf_hub_download returns by default lives in the
# HF cache and is normally a relative symlink to a blob; moving that link (as
# shutil.move does) leaves a dangling symlink at the destination and removes
# the entry from the cache, so the cache path must not be moved.
echo ">>> [3/5] Downloading raw dataset from HF Hub..."
mkdir -p data/raw
# Repo id, filename and target directory are passed as arguments rather than
# spliced into the Python source, so quotes or other special characters in the
# values cannot alter the code. The quoted heredoc delimiter keeps the shell
# from expanding anything inside the Python snippet.
# NOTE(review): this runs whichever `python` is first on PATH, not necessarily
# the environment created by `uv sync` -- confirm huggingface_hub is available
# there.
if python - "$HF_DATASET" "$RAW_PARQUET" data/raw <<'PY'
import sys

from huggingface_hub import hf_hub_download

repo_id, filename, target_dir = sys.argv[1:4]
hf_hub_download(
    repo_id=repo_id,
    filename=filename,
    repo_type='dataset',
    local_dir=target_dir,
)
PY
then
  echo "Dataset downloaded to data/raw/$RAW_PARQUET"
else
  # Diagnostics go to stderr so they are not mixed into regular output.
  echo "ERROR: Failed to download dataset from HF Hub" >&2
  exit 1
fi
# 4. Verify parquet
#
# Loads the downloaded file with pandas and prints its row count, column list
# and the length of the first row's "sequence" value. An unreadable file, a
# missing "sequence" column or an empty table aborts the script with an
# explicit error instead of a bare Python traceback.
echo ">>> [4/5] Verifying dataset..."
# The path is passed as an argument rather than interpolated into the Python
# source; the quoted heredoc delimiter disables shell expansion in the snippet.
# NOTE(review): this runs whichever `python` is first on PATH, not necessarily
# the environment created by `uv sync` -- confirm pandas is available there.
if ! python - "data/raw/$RAW_PARQUET" <<'PY'
import sys

import pandas as pd

parquet_path = sys.argv[1]
df = pd.read_parquet(parquet_path)
print(f'Rows: {len(df):,}')
print(f'Columns: {df.columns.tolist()}')
# sys.exit(<str>) prints the message to stderr and exits with status 1.
if 'sequence' not in df.columns:
    sys.exit(f"{parquet_path}: required column 'sequence' is missing")
if len(df) == 0:
    sys.exit(f'{parquet_path}: dataset contains no rows')
print(f'Sequence sample length: {len(df.iloc[0]["sequence"])}')
PY
then
  echo "ERROR: Dataset verification failed for data/raw/$RAW_PARQUET" >&2
  exit 1
fi
# 5. Launch training for the requested preset through the project's CLI.
# Because the script runs under `set -e`, the completion banner is printed
# only when the training command exits successfully.
printf '%s\n' \
  ">>> [5/5] Starting training for preset: $PRESET" \
  "========================================"
uv run deepgenopix train --preset "$PRESET" --run-mode train --json
printf '%s\n' "=== Training complete ==="