#!/bin/bash
# Remote training script for DeepGenopix experiment presets on HF compute.
#
# Usage: ./remote_train.sh <preset>
#
# Steps:
#   1. Clone the repository (or pull if it is already checked out).
#   2. Sync dependencies with uv.
#   3. Download the raw parquet dataset from the Hugging Face Hub.
#   4. Sanity-check the downloaded parquet file.
#   5. Launch training for the requested preset.
#
# Exits non-zero as soon as any step fails.
set -euo pipefail

readonly REPO_URL="https://github.com/vedatonuryilmaz/dev_genopix.git"
readonly REPO_BRANCH="ml-intern"
readonly REPO_DIR="deepgenopix"
readonly HF_DATASET="vedatonuryilmaz/te-seqdata-v1"
readonly RAW_PARQUET="te_seqdata.parquet"
readonly RAW_DIR="data/raw"

# Print a message to stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Step 1: clone the training branch, or update an existing checkout.
clone_repo() {
  echo ">>> [1/5] Cloning DeepGenopix (${REPO_BRANCH} branch)..."
  if [[ -d "$REPO_DIR" ]]; then
    echo "Repository already exists, pulling latest..."
    git -C "$REPO_DIR" pull origin "$REPO_BRANCH"
  else
    git clone --branch "$REPO_BRANCH" --depth 1 "$REPO_URL" "$REPO_DIR"
  fi
}

# Step 2: enter the checkout and install dependencies.
# Every later step runs with the repository as the working directory.
sync_deps() {
  echo ">>> [2/5] Syncing dependencies with uv..."
  cd "$REPO_DIR" || die "ERROR: cannot cd to $REPO_DIR"
  uv sync
}

# Step 3: fetch the parquet file straight into $RAW_DIR.
# The file is downloaded with local_dir= instead of being moved out of the
# HF cache: the cached path is typically a relative symlink into the cache's
# blob store, so moving it leaves a dangling link and damages the cache.
# Values are passed as argv rather than interpolated into the Python source.
# NOTE(review): this uses whichever `python` is on PATH, not `uv run python`;
# confirm that interpreter has huggingface_hub available.
download_dataset() {
  echo ">>> [3/5] Downloading raw dataset from HF Hub..."
  mkdir -p "$RAW_DIR"
  if python - "$HF_DATASET" "$RAW_PARQUET" "$RAW_DIR" <<'PY'
import sys

from huggingface_hub import hf_hub_download

repo_id, filename, local_dir = sys.argv[1:4]
hf_hub_download(
    repo_id=repo_id,
    filename=filename,
    repo_type="dataset",
    local_dir=local_dir,
)
PY
  then
    # -f follows symlinks, so a dangling link is reported as a failure too.
    [[ -f "$RAW_DIR/$RAW_PARQUET" ]] \
      || die "ERROR: Failed to download dataset from HF Hub"
    echo "Dataset downloaded to $RAW_DIR/$RAW_PARQUET"
  else
    die "ERROR: Failed to download dataset from HF Hub"
  fi
}

# Step 4: load the parquet file and print basic statistics.
# Fails with an explicit message when the table is empty or lacks the
# "sequence" column that the sample-length check reads.
# NOTE(review): same interpreter caveat as the download step (needs pandas).
verify_dataset() {
  echo ">>> [4/5] Verifying dataset..."
  python - "$RAW_DIR/$RAW_PARQUET" <<'PY'
import sys

import pandas as pd

df = pd.read_parquet(sys.argv[1])
print(f"Rows: {len(df):,}")
print(f"Columns: {df.columns.tolist()}")
if df.empty:
    sys.exit("ERROR: dataset contains no rows")
if "sequence" not in df.columns:
    sys.exit("ERROR: dataset has no 'sequence' column")
print(f"Sequence sample length: {len(df.iloc[0]['sequence'])}")
PY
}

# Step 5: run training for the given preset.
# Arguments: $1 - preset name forwarded to `deepgenopix train --preset`.
run_training() {
  local preset=$1
  echo ">>> [5/5] Starting training for preset: $preset"
  echo "========================================"
  uv run deepgenopix train --preset "$preset" --run-mode train --json
  echo "=== Training complete ==="
}

# Entry point. Arguments: $1 - preset name (required).
main() {
  local preset="${1:-}"
  if [[ -z "$preset" ]]; then
    printf 'Usage: %s <preset>\n' "$0" >&2
    exit 1
  fi

  echo "=== DeepGenopix Remote Training ==="
  echo "Preset: $preset"
  echo "Repo: $REPO_URL (branch: $REPO_BRANCH)"
  echo "Dataset: $HF_DATASET"
  echo

  clone_repo
  sync_deps
  download_dataset
  verify_dataset
  run_training "$preset"
}

main "$@"