#!/usr/bin/env bash #SBATCH --job-name=dememwm_full_eval #SBATCH --partition=gpu #SBATCH --time=1-00:00:00 #SBATCH --nodes=1 #SBATCH --ntasks=1 #SBATCH --cpus-per-task=16 #SBATCH --mem=256G #SBATCH --gres=gpu:1 #SBATCH --chdir=/share_1/users/bonan_ding/DeMemWM #SBATCH --output=/share_1/users/bonan_ding/DeMemWM/slurm_logs/%x_%j.out #SBATCH --error=/share_1/users/bonan_ding/DeMemWM/slurm_logs/%x_%j.err # Full DeMemWM evaluation script for DeMemWM/H200. # Submit from the remote repo after training has produced a checkpoint: # sbatch --export=ALL,CHECKPOINT=/share_1/users/bonan_ding/DeMemWM/outputs//train/checkpoints/last.ckpt scripts/dememwm_full_eval.slurm # or: # CHECKPOINT=/path/to/last.ckpt sbatch --export=ALL scripts/dememwm_full_eval.slurm set -euo pipefail CHECKPOINT=${CHECKPOINT:-${1:-}} if [[ -z "${CHECKPOINT}" ]]; then echo "ERROR: set CHECKPOINT=/path/to/dememwm.ckpt, e.g." >&2 echo " sbatch --export=ALL,CHECKPOINT=/share_1/users/bonan_ding/DeMemWM/outputs//train/checkpoints/last.ckpt scripts/dememwm_full_eval.slurm" >&2 exit 2 fi if [[ ! -s "${CHECKPOINT}" ]]; then echo "ERROR: checkpoint does not exist or is empty: ${CHECKPOINT}" >&2 exit 2 fi REPO=${REPO:-/share_1/users/bonan_ding/DeMemWM} DATA_DIR=${DATA_DIR:-/share_1/users/bonan_ding/worldmem_data/minecraft} FEATURE_DIR=${FEATURE_DIR:-/share_1/users/bonan_ding/worldmem_data/minecraft/vae_features} RUN_TAG=${RUN_TAG:-dememwm_full_eval_${SLURM_JOB_ID:-manual_$(date +%Y%m%d_%H%M%S)}} RUN_ROOT=${RUN_ROOT:-${REPO}/outputs/${RUN_TAG}} EVAL_OUT=${EVAL_OUT:-${RUN_ROOT}/eval} LOG_DIR=${LOG_DIR:-${REPO}/slurm_logs/${RUN_TAG}} mkdir -p "${EVAL_OUT}" "${LOG_DIR}" "${REPO}/slurm_logs" DATASET_N_FRAMES=${DATASET_N_FRAMES:-300} N_FRAMES_VALID=${N_FRAMES_VALID:-216} CONTEXT_FRAMES=${CONTEXT_FRAMES:-116} N_TOKENS=${N_TOKENS:-8} SAMPLING_TIMESTEPS=${SAMPLING_TIMESTEPS:-20} VAL_BATCH_SIZE=${VAL_BATCH_SIZE:-1} VAL_LIMIT=${VAL_LIMIT:-16} LOG_VIDEO=${LOG_VIDEO:-true} SEED=${SEED:-42} ABLATION_BRANCH=${ABLATION_BRANCH:-A_plus_D_plus_R_normal} # Consumed DeMemWM memory-shape knobs for current latent setup. # Anchor: ratio 6 over 18x32 -> 4 prefixes * 3x6 pooled slots = 72 tokens. # Revisit: ratio 3 over 18x32 -> 2 frames * 6x11 pooled slots = 132 tokens. ANCHOR_DOWNSAMPLE_RATIO=${ANCHOR_DOWNSAMPLE_RATIO:-6} REVISIT_MAX_FRAMES=${REVISIT_MAX_FRAMES:-2} REVISIT_DOWNSAMPLE_RATIO=${REVISIT_DOWNSAMPLE_RATIO:-3} cd "${REPO}" source ~/.bashrc >/dev/null 2>&1 || true if command -v conda >/dev/null 2>&1; then eval "$(conda shell.bash hook)" elif [[ -f "${HOME}/.conda/etc/profile.d/conda.sh" ]]; then source "${HOME}/.conda/etc/profile.d/conda.sh" elif [[ -f /share_0/conda/etc/profile.d/conda.sh ]]; then source /share_0/conda/etc/profile.d/conda.sh fi conda activate worldmem PY=$(which python) export PYTHONPATH="./:${PYTHONPATH:-}" export HYDRA_FULL_ERROR=1 export PYTHONWARNINGS=ignore export OMP_NUM_THREADS="${SLURM_CPUS_PER_TASK:-16}" export WANDB_MODE=offline export NCCL_P2P_DISABLE=1 wandb offline >/dev/null 2>&1 || true echo "JOB_ID=${SLURM_JOB_ID:-manual}" echo "RUN_TAG=${RUN_TAG}" echo "RUN_ROOT=${RUN_ROOT}" echo "CHECKPOINT=${CHECKPOINT}" echo "ABLATION_BRANCH=${ABLATION_BRANCH}" echo "HOST=$(hostname)" echo "START=$(date --iso-8601=seconds)" echo "PWD=$PWD" echo "PY=${PY}" "${PY}" --version nvidia-smi || true nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits > "${LOG_DIR}/gpu_memory_before_mb.txt" || true git branch --show-current || true git rev-parse HEAD || true EVAL_ARGS=( "+name=eval_${RUN_TAG}" "+output_dir=${EVAL_OUT}/" "experiment.tasks=[validation]" "wandb.mode=offline" "dataset.validation_multiplier=1" "+dataset.seed=${SEED}" "+customized_load=true" "+seperate_load=false" "algorithm=dememwm_memory_dit" "load=${CHECKPOINT}" "dataset=video_minecraft_latent" "dataset.save_dir=${DATA_DIR}" "dataset.precomputed_feature_dir=${FEATURE_DIR}" "dataset.n_frames=${DATASET_N_FRAMES}" "+dataset.n_frames_valid=${N_FRAMES_VALID}" "+dataset.customized_validation=true" "+dataset.memory_condition_length=0" "++dataset.angle_range=180" "++dataset.pos_range=1000000000" "++algorithm.n_tokens=${N_TOKENS}" "algorithm.x_shape=[16,18,32]" "++algorithm.context_frames=${CONTEXT_FRAMES}" "++algorithm.log_video=${LOG_VIDEO}" "++algorithm.diffusion.sampling_timesteps=${SAMPLING_TIMESTEPS}" "++algorithm.dememwm.debug_force_all_streams=false" "++algorithm.dememwm.training_stage=stage_2" "++algorithm.dememwm.anchor.enabled=true" "++algorithm.dememwm.anchor.anchor_indices=[0,1,2,3]" "++algorithm.dememwm.anchor.diverse_selection=true" "++algorithm.dememwm.anchor.compress.downsample_ratio=${ANCHOR_DOWNSAMPLE_RATIO}" "++algorithm.dememwm.anchor.allow_generated_as_anchor=false" "++algorithm.dememwm.dynamic.enabled=true" "++algorithm.dememwm.dynamic.exclude_latest_local_frames=4" "++algorithm.dememwm.dynamic.recent_frames=8" "++algorithm.dememwm.revisit.enabled=true" "++algorithm.dememwm.revisit.deterministic_pose_retrieval=true" "++algorithm.dememwm.revisit.fov_overlap_threshold=0.30" "++algorithm.dememwm.revisit.high_quality_fov_threshold=0.70" "++algorithm.dememwm.revisit.pose_preselect_topk=64" "++algorithm.dememwm.revisit.fov_yaw_samples=25" "++algorithm.dememwm.revisit.fov_pitch_samples=20" "++algorithm.dememwm.revisit.fov_depth_samples=20" "++algorithm.dememwm.revisit.plucker_weight=0.10" "++algorithm.dememwm.revisit.max_frames=${REVISIT_MAX_FRAMES}" "++algorithm.dememwm.revisit.compress.downsample_ratio=${REVISIT_DOWNSAMPLE_RATIO}" "++algorithm.dememwm.stage_policy.noise_bucket_logging=true" "++algorithm.dememwm.eval_ablation.enabled=true" "++algorithm.dememwm.eval_ablation.branch=${ABLATION_BRANCH}" "++algorithm.dememwm.cache.enabled=true" "++algorithm.dememwm.cache.device=cpu" "++algorithm.dememwm.cache.keep_raw_latents=all" "++algorithm.dememwm.cache.keep_compressed_records=true" "++algorithm.dememwm.cache.eviction_policy=none" "++algorithm.dememwm.cache.no_evict=true" "++algorithm.dememwm.cache.clear_between_videos=true" "++algorithm.dememwm.cache.max_records=null" "++algorithm.dememwm.cache.on_capacity_exceeded=warn" "experiment.validation.batch_size=${VAL_BATCH_SIZE}" "experiment.validation.limit_batch=${VAL_LIMIT}" ) printf '%s\n' "${EVAL_ARGS[@]}" > "${LOG_DIR}/eval_args.txt" echo "Launching evaluation..." SECONDS=0 srun "${PY}" -m main "${EVAL_ARGS[@]}" EVAL_DURATION_SECONDS=${SECONDS} nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits > "${LOG_DIR}/gpu_memory_after_mb.txt" || true cat > "${RUN_ROOT}/eval_manifest.txt" <