#!/usr/bin/env bash set -euo pipefail WORK_ROOT="${WORK_ROOT:-/home/bagel/bridge_processed_full}" DOWNLOAD_DIR="${DOWNLOAD_DIR:-${WORK_ROOT}/download}" ZIP_PATH="${ZIP_PATH:-${WORK_ROOT}/bridge_processed_full.zip}" EXTRACT_DIR="${EXTRACT_DIR:-${WORK_ROOT}/extracted}" DATASET_ROOT="${DATASET_ROOT:-${EXTRACT_DIR}/bridge_processed}" R3M_CACHE="${R3M_CACHE:-${WORK_ROOT}/r3m_resnet34_features_bs256}" PYTHON_BIN="${PYTHON_BIN:-/home/bagel/miniforge3/envs/rold/bin/python}" REPO_ID="${REPO_ID:-bageldotcom/vla-bridge-dataset}" DEVICE="${DEVICE:-cuda}" R3M_MODEL_ID="${R3M_MODEL_ID:-resnet34}" R3M_BATCH_SIZE="${R3M_BATCH_SIZE:-256}" NUM_WORKERS="${NUM_WORKERS:-8}" VERIFY_SAMPLES="${VERIFY_SAMPLES:-128}" LOG_DIR="${LOG_DIR:-${WORK_ROOT}/logs}" mkdir -p "${DOWNLOAD_DIR}" "${EXTRACT_DIR}" "${LOG_DIR}" stage_log() { local stage="$1" printf '[bridge-full-prepare] %s\n' "${stage}" } run_stage() { local stage="$1" shift local log_path="${LOG_DIR}/${stage}.log" local exit_path="${WORK_ROOT}/${stage}.exit" rm -f "${exit_path}" stage_log "start ${stage}" set +e "$@" > "${log_path}" 2>&1 local status=$? set -e echo "${status}" > "${exit_path}" if [[ "${status}" != "0" ]]; then stage_log "failed ${stage}; see ${log_path}" exit "${status}" fi stage_log "done ${stage}" } if [[ ! -f "${WORK_ROOT}/download.exit" ]]; then run_stage download \ "${PYTHON_BIN}" -m ddm_actions.data.download_hf \ --repo-id "${REPO_ID}" \ --output-dir "${DOWNLOAD_DIR}" \ --allow-patterns 'bridge_processed_full.zip.part-*' else stage_log "skip download; ${WORK_ROOT}/download.exit exists" fi if [[ ! -f "${WORK_ROOT}/parts.exit" ]]; then run_stage parts bash -lc " set -euo pipefail ls -lh '${DOWNLOAD_DIR}'/bridge_processed_full.zip.part-* test -f '${DOWNLOAD_DIR}/bridge_processed_full.zip.part-aa' test -f '${DOWNLOAD_DIR}/bridge_processed_full.zip.part-ab' test -f '${DOWNLOAD_DIR}/bridge_processed_full.zip.part-ac' test -f '${DOWNLOAD_DIR}/bridge_processed_full.zip.part-ad' " else stage_log "skip parts; ${WORK_ROOT}/parts.exit exists" fi if [[ ! -f "${WORK_ROOT}/concat.exit" ]]; then run_stage concat bash -lc " set -euo pipefail cat '${DOWNLOAD_DIR}'/bridge_processed_full.zip.part-* > '${ZIP_PATH}' ls -lh '${ZIP_PATH}' " else stage_log "skip concat; ${WORK_ROOT}/concat.exit exists" fi if [[ ! -f "${WORK_ROOT}/zip_test.exit" ]]; then run_stage zip_test unzip -t "${ZIP_PATH}" else stage_log "skip zip_test; ${WORK_ROOT}/zip_test.exit exists" fi if [[ ! -f "${WORK_ROOT}/extract.exit" ]]; then run_stage extract unzip -o "${ZIP_PATH}" -d "${EXTRACT_DIR}" else stage_log "skip extract; ${WORK_ROOT}/extract.exit exists" fi if [[ ! -f "${WORK_ROOT}/locate.exit" ]]; then run_stage locate bash -lc " set -euo pipefail find '${EXTRACT_DIR}' -maxdepth 4 -type f -name mapping.json -print find '${EXTRACT_DIR}' -maxdepth 4 -type d -name train -print if [[ ! -d '${DATASET_ROOT}/train' ]]; then candidate=\"\$(find '${EXTRACT_DIR}' -maxdepth 4 -type f -name mapping.json -printf '%h\n' | head -1)\" if [[ -z \"\${candidate}\" ]]; then echo 'Could not locate processed dataset root with mapping.json' >&2 exit 1 fi echo \"\${candidate}\" > '${WORK_ROOT}/dataset_root.txt' else echo '${DATASET_ROOT}' > '${WORK_ROOT}/dataset_root.txt' fi root=\"\$(cat '${WORK_ROOT}/dataset_root.txt')\" test -d \"\${root}/train\" test -d \"\${root}/val\" test -d \"\${root}/test\" test -f \"\${root}/mapping.json\" echo \"DATASET_ROOT=\${root}\" " else stage_log "skip locate; ${WORK_ROOT}/locate.exit exists" fi RESOLVED_DATASET_ROOT="$(cat "${WORK_ROOT}/dataset_root.txt")" if [[ ! -f "${WORK_ROOT}/dataset_smoke.exit" ]]; then run_stage dataset_smoke \ "${PYTHON_BIN}" -m ddm_actions.scripts.debug_dataset \ --dataset_root "${RESOLVED_DATASET_ROOT}" else stage_log "skip dataset_smoke; ${WORK_ROOT}/dataset_smoke.exit exists" fi if [[ ! -f "${WORK_ROOT}/r3m_precompute.exit" ]]; then run_stage r3m_precompute \ "${PYTHON_BIN}" -u -m ddm_actions.scripts.precompute_r3m_features \ --dataset_root "${RESOLVED_DATASET_ROOT}" \ --output_dir "${R3M_CACHE}" \ --r3m_model_id "${R3M_MODEL_ID}" \ --batch_size "${R3M_BATCH_SIZE}" \ --num_workers "${NUM_WORKERS}" \ --device "${DEVICE}" else stage_log "skip r3m_precompute; ${WORK_ROOT}/r3m_precompute.exit exists" fi if [[ ! -f "${WORK_ROOT}/verify.exit" ]]; then run_stage verify \ "${PYTHON_BIN}" -m ddm_actions.scripts.verify_r3m_cache \ --dataset_root "${RESOLVED_DATASET_ROOT}" \ --cache_dir "${R3M_CACHE}" \ --device "${DEVICE}" \ --num_samples "${VERIFY_SAMPLES}" else stage_log "skip verify; ${WORK_ROOT}/verify.exit exists" fi cat > "${WORK_ROOT}/ready.env" <