#!/usr/bin/env bash
#
# Prepares the full "bridge_processed" dataset and its R3M feature cache.
#
# Stages, in order: download, parts, concat, zip_test, extract, locate,
# dataset_smoke, r3m_precompute, verify. Each stage logs to
# ${LOG_DIR}/<stage>.log and records its exit status in
# ${WORK_ROOT}/<stage>.exit.
#
# Every setting below can be overridden from the environment; the value
# shown is used only when the variable is unset or empty.
set -euo pipefail

# Base directory for markers, dataset_root.txt, ready.env and the defaults below.
: "${WORK_ROOT:=/home/bagel/bridge_processed_full}"
# Where the split archive parts are downloaded to.
: "${DOWNLOAD_DIR:=${WORK_ROOT}/download}"
# Path of the re-assembled zip archive.
: "${ZIP_PATH:=${WORK_ROOT}/bridge_processed_full.zip}"
# Directory the archive is unpacked into.
: "${EXTRACT_DIR:=${WORK_ROOT}/extracted}"
# Preferred dataset root; the locate stage searches EXTRACT_DIR if it has no train/.
: "${DATASET_ROOT:=${EXTRACT_DIR}/bridge_processed}"
# Output directory of the precompute stage / cache directory of the verify stage.
: "${R3M_CACHE:=${WORK_ROOT}/r3m_resnet34_features_bs256}"
# Python interpreter used for every "python -m ..." stage.
: "${PYTHON_BIN:=/home/bagel/miniforge3/envs/rold/bin/python}"
# Repository id handed to the downloader (--repo-id).
: "${REPO_ID:=bageldotcom/vla-bridge-dataset}"
# Passed as --device to the precompute and verify stages.
: "${DEVICE:=cuda}"
# Passed as --r3m_model_id, --batch_size and --num_workers to the precompute stage.
: "${R3M_MODEL_ID:=resnet34}"
: "${R3M_BATCH_SIZE:=256}"
: "${NUM_WORKERS:=8}"
# Passed as --num_samples to the verify stage.
: "${VERIFY_SAMPLES:=128}"

# Per-stage log files live here.
: "${LOG_DIR:=${WORK_ROOT}/logs}"

mkdir -p \
  "${DOWNLOAD_DIR}" \
  "${EXTRACT_DIR}" \
  "${LOG_DIR}"

#######################################
# Print a progress message prefixed with the pipeline tag.
# Arguments: $@ - message words; they are joined with single spaces, so
#                 both `stage_log "skip x"` and `stage_log skip x` work.
#                 Calling with no arguments prints just the prefix.
# Outputs:   one line on stdout: "[bridge-full-prepare] <message>"
#######################################
stage_log() {
  # Pin IFS locally so "$*" always joins with a space, whatever the caller set.
  local IFS=' '
  # The message is passed as an argument, never as the format string, so a
  # literal '%' in it is printed verbatim.
  printf '[bridge-full-prepare] %s\n' "$*"
}

#######################################
# Run one pipeline stage, capturing its output and recording its status.
# Globals:   LOG_DIR (read), WORK_ROOT (read)
# Arguments: $1   - stage name; used for the log and marker file names
#            $2.. - command (and arguments) to execute
# Outputs:   "start"/"done"/"failed" progress lines via stage_log; the
#            command's stdout and stderr go to ${LOG_DIR}/<stage>.log
# Side effects:
#            writes the command's exit status to ${WORK_ROOT}/<stage>.exit.
#            The marker is written on failure as well, so its mere
#            existence does not mean the stage succeeded.
# Returns:   0 on success; on failure the whole script exits with the
#            command's status.
#######################################
run_stage() {
  local stage="$1"
  shift
  local log_path="${LOG_DIR}/${stage}.log"
  local exit_path="${WORK_ROOT}/${stage}.exit"
  local status=0

  # Drop any stale marker first so an interrupted run leaves none behind.
  rm -f "${exit_path}"
  stage_log "start ${stage}"

  # `cmd || status=$?` captures the status without tripping errexit and,
  # unlike toggling `set +e` / `set -e`, leaves the caller's errexit
  # setting exactly as it was.
  "$@" > "${log_path}" 2>&1 || status=$?

  printf '%s\n' "${status}" > "${exit_path}"
  if (( status != 0 )); then
    stage_log "failed ${stage}; see ${log_path}"
    exit "${status}"
  fi
  stage_log "done ${stage}"
}

# Stage "download": fetch the split archive parts into DOWNLOAD_DIR.
#
# run_stage writes the marker even when the command fails, so an existing
# marker alone is not proof of success. The stage is skipped only when the
# marker exists and does not record a non-zero status (an empty, hand-made
# marker still counts as done); otherwise it is (re)run.
download_marker="${WORK_ROOT}/download.exit"
if [[ -f "${download_marker}" ]] && [[ "$(<"${download_marker}")" =~ ^0?$ ]]; then
  stage_log "skip download; ${download_marker} exists"
else
  run_stage download \
    "${PYTHON_BIN}" -m ddm_actions.data.download_hf \
    --repo-id "${REPO_ID}" \
    --output-dir "${DOWNLOAD_DIR}" \
    --allow-patterns 'bridge_processed_full.zip.part-*'
fi

# Stage "parts": list the downloaded parts and require parts aa..ad.
#
# run_stage writes the marker even when the command fails, so the stage is
# skipped only when the marker exists and does not record a non-zero status
# (an empty, hand-made marker still counts as done).
#
# The directory is handed to the inner shell as "$1" instead of being
# spliced into the script text, so a path containing quotes cannot break
# or alter the script.
parts_marker="${WORK_ROOT}/parts.exit"
if [[ -f "${parts_marker}" ]] && [[ "$(<"${parts_marker}")" =~ ^0?$ ]]; then
  stage_log "skip parts; ${parts_marker} exists"
else
  # shellcheck disable=SC2016 # expansions belong to the inner shell
  run_stage parts bash -lc '
set -euo pipefail
download_dir="$1"
ls -lh "${download_dir}"/bridge_processed_full.zip.part-*
for suffix in aa ab ac ad; do
  test -f "${download_dir}/bridge_processed_full.zip.part-${suffix}"
done
' parts "${DOWNLOAD_DIR}"
fi

# Stage "concat": join the archive parts (in glob order) into ZIP_PATH.
#
# run_stage writes the marker even when the command fails, so the stage is
# skipped only when the marker exists and does not record a non-zero status
# (an empty, hand-made marker still counts as done).
#
# Paths are handed to the inner shell as "$1"/"$2" instead of being spliced
# into the script text, so a path containing quotes cannot break the script.
concat_marker="${WORK_ROOT}/concat.exit"
if [[ -f "${concat_marker}" ]] && [[ "$(<"${concat_marker}")" =~ ^0?$ ]]; then
  stage_log "skip concat; ${concat_marker} exists"
else
  # shellcheck disable=SC2016 # expansions belong to the inner shell
  run_stage concat bash -lc '
set -euo pipefail
download_dir="$1"
zip_path="$2"
cat "${download_dir}"/bridge_processed_full.zip.part-* > "${zip_path}"
ls -lh "${zip_path}"
' concat "${DOWNLOAD_DIR}" "${ZIP_PATH}"
fi

# Stage "zip_test": integrity-check the re-assembled archive with `unzip -t`.
#
# run_stage writes the marker even when the command fails, so the stage is
# skipped only when the marker exists and does not record a non-zero status
# (an empty, hand-made marker still counts as done).
zip_test_marker="${WORK_ROOT}/zip_test.exit"
if [[ -f "${zip_test_marker}" ]] && [[ "$(<"${zip_test_marker}")" =~ ^0?$ ]]; then
  stage_log "skip zip_test; ${zip_test_marker} exists"
else
  run_stage zip_test unzip -t "${ZIP_PATH}"
fi

# Stage "extract": unpack the archive into EXTRACT_DIR, overwriting any
# files already there (`unzip -o`).
#
# run_stage writes the marker even when the command fails, so the stage is
# skipped only when the marker exists and does not record a non-zero status
# (an empty, hand-made marker still counts as done).
extract_marker="${WORK_ROOT}/extract.exit"
if [[ -f "${extract_marker}" ]] && [[ "$(<"${extract_marker}")" =~ ^0?$ ]]; then
  stage_log "skip extract; ${extract_marker} exists"
else
  run_stage extract unzip -o "${ZIP_PATH}" -d "${EXTRACT_DIR}"
fi

# Stage "locate": determine the dataset root and store it in
# ${WORK_ROOT}/dataset_root.txt.
#
# DATASET_ROOT is used when it contains train/; otherwise the directory of
# the first mapping.json found within four levels of EXTRACT_DIR is used.
# The chosen root must contain train/, val/, test/ and mapping.json.
#
# run_stage writes the marker even when the command fails, so the stage is
# skipped only when the marker exists and does not record a non-zero status
# (an empty, hand-made marker still counts as done).
#
# Paths are handed to the inner shell as "$1".."$3" instead of being spliced
# into the script text, so a path containing quotes cannot break the script.
locate_marker="${WORK_ROOT}/locate.exit"
if [[ -f "${locate_marker}" ]] && [[ "$(<"${locate_marker}")" =~ ^0?$ ]]; then
  stage_log "skip locate; ${locate_marker} exists"
else
  # shellcheck disable=SC2016 # expansions belong to the inner shell
  run_stage locate bash -lc '
set -euo pipefail
extract_dir="$1"
dataset_root="$2"
root_file="$3"

# Diagnostic listing only; it ends up in the stage log.
find "${extract_dir}" -maxdepth 4 -type f -name mapping.json -print
find "${extract_dir}" -maxdepth 4 -type d -name train -print

if [[ -d "${dataset_root}/train" ]]; then
  root="${dataset_root}"
else
  # -quit stops after the first match; piping into `head -1` could kill
  # find with SIGPIPE and fail the stage under pipefail.
  root="$(find "${extract_dir}" -maxdepth 4 -type f -name mapping.json -printf "%h\n" -quit)"
  if [[ -z "${root}" ]]; then
    echo "Could not locate processed dataset root with mapping.json" >&2
    exit 1
  fi
fi
printf "%s\n" "${root}" > "${root_file}"

test -d "${root}/train"
test -d "${root}/val"
test -d "${root}/test"
test -f "${root}/mapping.json"
echo "DATASET_ROOT=${root}"
' locate "${EXTRACT_DIR}" "${DATASET_ROOT}" "${WORK_ROOT}/dataset_root.txt"
fi

# Load the dataset root recorded by the locate stage. The file can be
# missing or empty when locate was skipped or did not complete; fail with
# an explicit message instead of passing an empty --dataset_root downstream.
dataset_root_file="${WORK_ROOT}/dataset_root.txt"
if [[ ! -s "${dataset_root_file}" ]]; then
  printf '[bridge-full-prepare] missing or empty %s; rerun the locate stage\n' \
    "${dataset_root_file}" >&2
  exit 1
fi
RESOLVED_DATASET_ROOT="$(<"${dataset_root_file}")"

# Stage "dataset_smoke": run ddm_actions.scripts.debug_dataset against the
# resolved dataset root.
#
# run_stage writes the marker even when the command fails, so the stage is
# skipped only when the marker exists and does not record a non-zero status
# (an empty, hand-made marker still counts as done).
dataset_smoke_marker="${WORK_ROOT}/dataset_smoke.exit"
if [[ -f "${dataset_smoke_marker}" ]] \
  && [[ "$(<"${dataset_smoke_marker}")" =~ ^0?$ ]]; then
  stage_log "skip dataset_smoke; ${dataset_smoke_marker} exists"
else
  run_stage dataset_smoke \
    "${PYTHON_BIN}" -m ddm_actions.scripts.debug_dataset \
    --dataset_root "${RESOLVED_DATASET_ROOT}"
fi

# Stage "r3m_precompute": run ddm_actions.scripts.precompute_r3m_features,
# writing its output to R3M_CACHE. `python -u` keeps the stage log
# unbuffered so progress is visible while the stage runs.
#
# run_stage writes the marker even when the command fails, so the stage is
# skipped only when the marker exists and does not record a non-zero status
# (an empty, hand-made marker still counts as done).
r3m_precompute_marker="${WORK_ROOT}/r3m_precompute.exit"
if [[ -f "${r3m_precompute_marker}" ]] \
  && [[ "$(<"${r3m_precompute_marker}")" =~ ^0?$ ]]; then
  stage_log "skip r3m_precompute; ${r3m_precompute_marker} exists"
else
  r3m_precompute_args=(
    --dataset_root "${RESOLVED_DATASET_ROOT}"
    --output_dir "${R3M_CACHE}"
    --r3m_model_id "${R3M_MODEL_ID}"
    --batch_size "${R3M_BATCH_SIZE}"
    --num_workers "${NUM_WORKERS}"
    --device "${DEVICE}"
  )
  run_stage r3m_precompute \
    "${PYTHON_BIN}" -u -m ddm_actions.scripts.precompute_r3m_features \
    "${r3m_precompute_args[@]}"
fi

# Stage "verify": run ddm_actions.scripts.verify_r3m_cache on VERIFY_SAMPLES
# samples, comparing the dataset against the cache in R3M_CACHE.
# NOTE(review): what exactly the verifier compares is not visible here.
#
# run_stage writes the marker even when the command fails, so the stage is
# skipped only when the marker exists and does not record a non-zero status
# (an empty, hand-made marker still counts as done).
verify_marker="${WORK_ROOT}/verify.exit"
if [[ -f "${verify_marker}" ]] && [[ "$(<"${verify_marker}")" =~ ^0?$ ]]; then
  stage_log "skip verify; ${verify_marker} exists"
else
  verify_args=(
    --dataset_root "${RESOLVED_DATASET_ROOT}"
    --cache_dir "${R3M_CACHE}"
    --device "${DEVICE}"
    --num_samples "${VERIFY_SAMPLES}"
  )
  run_stage verify \
    "${PYTHON_BIN}" -m ddm_actions.scripts.verify_r3m_cache \
    "${verify_args[@]}"
fi

# All stages are complete: write the resolved locations to
# ${WORK_ROOT}/ready.env as plain KEY=VALUE lines (values are written
# verbatim, without quoting), then echo the file.
ready_env_path="${WORK_ROOT}/ready.env"
printf 'FULL_DATASET_ROOT=%s\nFULL_R3M_CACHE=%s\n' \
  "${RESOLVED_DATASET_ROOT}" \
  "${R3M_CACHE}" \
  > "${ready_env_path}"

stage_log "ready"
cat "${ready_env_path}"