#!/bin/bash
#
# Remote training bootstrap for DeepGenopix.
#
# Steps:
#   1. Clone (or fast-forward) the repository on the configured branch.
#   2. Sync the project's dependencies with uv.
#   3. Download the raw parquet dataset from the Hugging Face Hub.
#   4. Sanity-check the dataset (row count, columns, one sequence length).
#   5. Launch training for the requested preset.
#
# Usage: <this script> <preset_name>
# Requires: git and uv on PATH.

set -euo pipefail

readonly REPO_URL="https://github.com/vedatonuryilmaz/dev_genopix.git"
readonly REPO_BRANCH="ml-intern"
readonly REPO_DIR="deepgenopix"
readonly HF_DATASET="vedatonuryilmaz/te-seqdata-v1"
readonly RAW_PARQUET="te_seqdata.parquet"
readonly RAW_DIR="data/raw"

# Print a message to stderr and abort the script with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Step 1: make sure an up-to-date checkout of REPO_BRANCH exists in REPO_DIR.
sync_repo() {
  echo ">>> [1/5] Cloning DeepGenopix ($REPO_BRANCH branch)..."
  # Test for .git rather than the bare directory so that a leftover
  # non-repository folder is not mistaken for an existing checkout.
  if [[ -e "$REPO_DIR/.git" ]]; then
    echo "Repository already exists, pulling latest..."
    # --ff-only: fail loudly instead of creating a merge commit if the local
    # checkout has diverged from the remote branch.
    git -C "$REPO_DIR" pull --ff-only origin "$REPO_BRANCH"
  else
    git clone --branch "$REPO_BRANCH" --depth 1 "$REPO_URL" "$REPO_DIR"
  fi
}

# Step 2: enter the checkout and sync dependencies with uv.
# Every later step runs with REPO_DIR as the working directory.
install_deps() {
  echo ">>> [2/5] Syncing dependencies with uv..."
  cd "$REPO_DIR"
  uv sync
}

# Step 3: fetch RAW_PARQUET from the HF dataset repo into RAW_DIR.
download_dataset() {
  echo ">>> [3/5] Downloading raw dataset from HF Hub..."
  mkdir -p "$RAW_DIR"
  # Run inside the uv-managed environment (the same one training uses).
  # NOTE(review): assumes huggingface_hub is a project dependency -- confirm.
  # Values are passed as arguments instead of being spliced into the Python
  # source, so quoting in the shell variables cannot break the script.
  if uv run python - "$HF_DATASET" "$RAW_PARQUET" "$RAW_DIR/$RAW_PARQUET" <<'PY'
import os
import shutil
import sys

from huggingface_hub import hf_hub_download

repo_id, filename, dest = sys.argv[1:4]
cached = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="dataset")

# Drop any stale destination first (it may be a dangling symlink left by an
# earlier run), so the copy below never writes through an old link.
if os.path.lexists(dest):
    os.remove(dest)

# The returned path lives inside the hub cache and is normally a relative
# symlink to a blob. Moving it would leave a broken link at the destination
# and damage the cache, so copy the file contents instead.
shutil.copyfile(cached, dest)
PY
  then
    echo "Dataset downloaded to $RAW_DIR/$RAW_PARQUET"
  else
    die "ERROR: Failed to download dataset from HF Hub"
  fi
}

# Step 4: load the parquet file and print basic facts about it.
# Any failure here aborts the script (set -e) before training starts.
verify_dataset() {
  echo ">>> [4/5] Verifying dataset..."
  # NOTE(review): assumes pandas (with a parquet engine) is a project
  # dependency -- confirm.
  uv run python - "$RAW_DIR/$RAW_PARQUET" <<'PY'
import sys

import pandas as pd

df = pd.read_parquet(sys.argv[1])
print(f"Rows: {len(df):,}")
print(f"Columns: {df.columns.tolist()}")
if df.empty or "sequence" not in df.columns:
    sys.exit("ERROR: dataset is empty or has no 'sequence' column")
print(f"Sequence sample length: {len(df.iloc[0]['sequence'])}")
PY
}

# Step 5: start training.
# Arguments: $1 - preset name forwarded to `deepgenopix train --preset`.
run_training() {
  local preset=$1
  echo ">>> [5/5] Starting training for preset: $preset"
  echo "========================================"
  uv run deepgenopix train --preset "$preset" --run-mode train --json
}

# Entry point.
# Arguments: $1 - preset name (required).
main() {
  local preset="${1:-}"
  if [[ -z "$preset" ]]; then
    echo "Usage: $0 <preset_name>" >&2
    exit 1
  fi

  # Fail early with a clear message instead of midway through the run.
  local tool
  for tool in git uv; do
    command -v "$tool" >/dev/null || die "ERROR: required tool not found: $tool"
  done

  echo "=== DeepGenopix Remote Training ==="
  echo "Preset: $preset"
  echo "Repo: $REPO_URL (branch: $REPO_BRANCH)"
  echo "Dataset: $HF_DATASET"
  echo

  sync_repo
  install_deps
  download_dataset
  verify_dataset
  run_training "$preset"

  echo "=== Training complete ==="
}

main "$@"