#!/bin/bash
# Strict mode: stop on a failing command, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Required positional argument: the preset name. Missing/empty aborts with the
# usage message.
PRESET="${1:?usage: $0 <preset>}"

# Repository that gets cloned; SOURCE_REPO_URL overrides the default URL.
REPO_URL="${SOURCE_REPO_URL:-https://huggingface.co/vedatonuryilmaz/deepgenopix}"

# Optional settings: a non-empty value from the environment wins, otherwise the
# default applies (empty for the branch, i.e. no --branch is passed to clone).
: "${REPO_BRANCH:=}"
: "${DATA_DATASET:=vedatonuryilmaz/te-seqdata-v1}"

# Required environment variable; aborts when unset or empty.
: "${ARTIFACT_REPO:?must be set}"
# Obtain the project checkout unless the caller declares (via
# DEEPGENOPIX_ALREADY_CLONED=1) that the current directory already is one.
if [[ "${DEEPGENOPIX_ALREADY_CLONED:-0}" != "1" ]]; then
  # Build the clone options once; --branch is only added for a non-empty
  # REPO_BRANCH, and the clone is always shallow.
  clone_opts=()
  if [[ -n "$REPO_BRANCH" ]]; then
    clone_opts+=(--branch "$REPO_BRANCH")
  fi
  clone_opts+=(--depth 1)
  git clone "${clone_opts[@]}" "$REPO_URL" deepgenopix
  cd deepgenopix
else
  echo "using existing DeepGenopix checkout at $(pwd)"
fi

# Install the project environment including the "hf" extra.
uv sync --extra hf
# Resolve the dataset revision to download.
# If DATA_REVISION is not provided, fall back to the dataset repo's current
# sha and warn, because that HEAD is mutable.
if [[ -z "${DATA_REVISION:-}" ]]; then
  # The dataset id is handed to Python as an argument (sys.argv[1]) rather than
  # being pasted into the Python source, so quotes or other special characters
  # in DATA_DATASET cannot break or alter the program.
  DATA_REVISION="$(uv run python -c '
import sys
from huggingface_hub import HfApi

print(HfApi().dataset_info(sys.argv[1]).sha)
' "$DATA_DATASET")"
  # dataset_info().sha is optional; a missing sha would print "None" and then
  # be used as a revision name further down. Fail here with a clear message.
  if [[ -z "$DATA_REVISION" || "$DATA_REVISION" == "None" ]]; then
    echo "ERROR: could not resolve a revision sha for dataset $DATA_DATASET" >&2
    exit 1
  fi
  echo "WARNING: DATA_REVISION was not set; using mutable dataset HEAD $DATA_REVISION" >&2
  echo "WARNING: production HF jobs should pin DATA_REVISION to the immutable split-input commit" >&2
fi

# First 12 characters of the revision; used as the "_ds" suffix of the cache key.
DATA_REVISION_SHORT="${DATA_REVISION:0:12}"
echo "using DATA_DATASET=$DATA_DATASET revision=$DATA_REVISION"
mkdir -p data/raw/te_split
# Download the split inputs (train/val/test parquet files and
# split_summary.json) of DATA_DATASET at DATA_REVISION into data/raw/te_split.
# Dataset id and revision are passed as arguments (sys.argv) instead of being
# interpolated into the Python source, so their contents cannot break or alter
# the program.
uv run python -c '
import sys
from huggingface_hub import snapshot_download

repo_id, revision = sys.argv[1:3]
snapshot_download(
    repo_id=repo_id,
    repo_type="dataset",
    revision=revision,
    local_dir="data/raw/te_split",
    allow_patterns=["train/*.parquet", "val/*.parquet", "test/*.parquet", "split_summary.json"],
)
' "$DATA_DATASET" "$DATA_REVISION"
# When RESUME=1, first pull the artifacts for this preset from ARTIFACT_REPO
# into data/output.
if [[ "${RESUME:-0}" == "1" ]]; then
  uv run python scripts/hf_artifacts.py pull \
    --repo "$ARTIFACT_REPO" \
    --preset "$PRESET" \
    --dest data/output
fi

# Environment exported to the commands that follow. DEEPGENOPIX_USE_AMP keeps a
# non-empty value from the caller and otherwise defaults to 0.
export DEEPGENOPIX_HF_SYNC=1 \
  DEEPGENOPIX_HF_ARTIFACT_REPO="$ARTIFACT_REPO" \
  DEEPGENOPIX_NUM_WORKERS=4 \
  DEEPGENOPIX_USE_AMP="${DEEPGENOPIX_USE_AMP:-0}"
printf '%s\n' "using DEEPGENOPIX_USE_AMP=$DEEPGENOPIX_USE_AMP"

# Run mode handed to the trainer: "resume" when RESUME=1, otherwise "train".
if [[ "${RESUME:-0}" == "1" ]]; then
  MODE=resume
else
  MODE=train
fi
# Local processed directory signature used by the trainer.
# Built from the preset's experiment config as
#   px<bp_per_pixel>_ps<pixel_stride_bp>_fl<flank_up_bp>-<flank_down_bp>_vf<val_frac>_tf<test_frac>_seed<split_seed>_mf<min_family_size>
# with "." in the fractions replaced by "p".
# The preset name is passed as sys.argv[1] rather than interpolated into the
# Python source, so its contents cannot break or alter the program.
LOCAL_PROCESSED_SIGNATURE="$(uv run python -c '
import sys
from deepgenopix.notebook_support import build_experiment_config

cfg = build_experiment_config(sys.argv[1])
parts = [
    f"px{cfg.bp_per_pixel}",
    f"ps{cfg.pixel_stride_bp}",
    f"fl{cfg.flank_up_bp}-{cfg.flank_down_bp}",
    "vf" + str(cfg.val_frac).replace(".", "p"),
    "tf" + str(cfg.test_frac).replace(".", "p"),
    f"seed{cfg.split_seed}",
    f"mf{cfg.min_family_size}",
]
print("_".join(parts))
' "$PRESET")"

# HF dataset cache key. DATA_REVISION must mean the immutable split-input
# revision, not the mutable dataset repo HEAD after LMDB/cache uploads.
PROCESSED_SIGNATURE="${LOCAL_PROCESSED_SIGNATURE}_ds${DATA_REVISION_SHORT}"

# Pixel stride of the preset; forwarded to `deepgenopix etl` and recorded in
# the cache manifest.
PIXEL_STRIDE="$(uv run python -c '
import sys
from deepgenopix.notebook_support import build_experiment_config

print(build_experiment_config(sys.argv[1]).pixel_stride_bp)
' "$PRESET")"

# The local directory is keyed by the local signature only (no "_ds" suffix).
PROCESSED_DIR="data/processed/te_visuals/$LOCAL_PROCESSED_SIGNATURE"
mkdir -p "$PROCESSED_DIR"
# Try to pull an existing ETL cache (LMDB + registry + classes) by signature.
# Cache hit ⇒ skip `deepgenopix etl` entirely; cache miss ⇒ materialize and
# push the result so the next job hits the cache.
if [[ "${FORCE_ETL_REBUILD:-0}" == "1" ]]; then
  echo "FORCE_ETL_REBUILD=1; skipping LMDB cache pull for $PROCESSED_SIGNATURE"
  # Drop every local cache file, including the manifest, so a manifest from an
  # earlier build cannot outlive the data it described if the rebuild fails.
  rm -f -- \
    "${PROCESSED_DIR:?}/tensors.lmdb" \
    "${PROCESSED_DIR:?}/tensors.lmdb-lock" \
    "${PROCESSED_DIR:?}/registry.csv" \
    "${PROCESSED_DIR:?}/classes.json" \
    "${PROCESSED_DIR:?}/etl_cache_manifest.json"
else
  # Best effort: a failed pull must not abort the job, but report it so that a
  # real failure is visible in the logs instead of being silently swallowed.
  pull_status=0
  uv run python scripts/hf_artifacts.py pull-etl \
    --repo "$DATA_DATASET" \
    --signature "$PROCESSED_SIGNATURE" \
    --processed-root "$PROCESSED_DIR" || pull_status=$?
  if (( pull_status != 0 )); then
    echo "WARNING: pull-etl for $PROCESSED_SIGNATURE exited with status $pull_status; continuing without a pulled cache" >&2
  fi
fi
# A cache hit requires all four files in PROCESSED_DIR: the LMDB, the registry,
# the class list and the cache manifest. Otherwise run the ETL, write a fresh
# manifest and push the result.
if [[ -f "$PROCESSED_DIR/tensors.lmdb" \
  && -f "$PROCESSED_DIR/registry.csv" \
  && -f "$PROCESSED_DIR/classes.json" \
  && -f "$PROCESSED_DIR/etl_cache_manifest.json" ]]; then
  echo "etl cache hit for $PROCESSED_SIGNATURE"
else
  echo "etl cache miss for $PROCESSED_SIGNATURE — materializing"
  uv run deepgenopix etl \
    --split-root data/raw/te_split \
    --output-dir "$PROCESSED_DIR" \
    --pixel-stride-bp "$PIXEL_STRIDE"

  # Write etl_cache_manifest.json next to the ETL output. All values are passed
  # as arguments (sys.argv) instead of being interpolated into the Python
  # source, so their contents cannot break or alter the program.
  uv run python -c '
import json
import sys
from pathlib import Path

(
    data_dataset,
    data_revision,
    data_revision_short,
    processed_signature,
    preset,
    pixel_stride,
    processed_dir,
) = sys.argv[1:8]

manifest = {
    "data_dataset": data_dataset,
    "data_revision": data_revision,
    "data_revision_short": data_revision_short,
    "processed_signature": processed_signature,
    "preset": preset,
    "pixel_stride_bp": int(pixel_stride),
    "split_files": [
        "train/te_seqdata.parquet",
        "val/te_seqdata.parquet",
        "test/te_seqdata.parquet",
        "split_summary.json",
    ],
    "source_policy": "split parquet only; root te_seqdata.parquet is intentionally ignored",
}
Path(processed_dir, "etl_cache_manifest.json").write_text(json.dumps(manifest, indent=2))
' "$DATA_DATASET" "$DATA_REVISION" "$DATA_REVISION_SHORT" \
    "$PROCESSED_SIGNATURE" "$PRESET" "$PIXEL_STRIDE" "$PROCESSED_DIR"

  # Best effort: a failed upload must not abort the job (the local cache is
  # complete), but report it instead of swallowing the failure silently.
  push_status=0
  uv run python scripts/hf_artifacts.py push-etl \
    --repo "$DATA_DATASET" \
    --signature "$PROCESSED_SIGNATURE" \
    --processed-root "$PROCESSED_DIR" \
    --message "etl cache from preset $PRESET" || push_status=$?
  if (( push_status != 0 )); then
    echo "WARNING: push-etl for $PROCESSED_SIGNATURE exited with status $push_status; the ETL cache was not uploaded" >&2
  fi
fi
# Run the trainer for the preset in the selected run mode ("train" or
# "resume"). ETL is disabled here (--no-etl) because the processed data was
# prepared earlier in this script.
uv run deepgenopix train \
  --preset "$PRESET" \
  --run-mode "$MODE" \
  --no-etl \
  --no-export-embeddings \
  --json

# Upload data/output for this preset to ARTIFACT_REPO.
uv run python scripts/hf_artifacts.py push \
  --repo "$ARTIFACT_REPO" \
  --preset "$PRESET" \
  --source data/output