DeMemWM / scripts /dememwm_full_eval.slurm

Clean DeMemWM deterministic memory slot handling

93d7b0a 2 days ago

7.09 kB

	#!/usr/bin/env bash
	#SBATCH --job-name=dememwm_full_eval
	#SBATCH --partition=gpu
	#SBATCH --time=1-00:00:00
	#SBATCH --nodes=1
	#SBATCH --ntasks=1
	#SBATCH --cpus-per-task=16
	#SBATCH --mem=256G
	#SBATCH --gres=gpu:1
	#SBATCH --chdir=/share_1/users/bonan_ding/DeMemWM
	#SBATCH --output=/share_1/users/bonan_ding/DeMemWM/slurm_logs/%x_%j.out
	#SBATCH --error=/share_1/users/bonan_ding/DeMemWM/slurm_logs/%x_%j.err

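	# Full DeMemWM evaluation script for DeMemWM/H200.
	# Submit from the remote repo after training has produced a checkpoint.
	# The checkpoint path is read from $CHECKPOINT, falling back to the first
	# script argument when that variable is unset or empty:
	# sbatch --export=ALL,CHECKPOINT=/share_1/users/bonan_ding/DeMemWM/outputs/<run>/train/checkpoints/last.ckpt scripts/dememwm_full_eval.slurm
	# or:
	# CHECKPOINT=/path/to/last.ckpt sbatch --export=ALL scripts/dememwm_full_eval.slurm
	# or:
	# sbatch scripts/dememwm_full_eval.slurm /path/to/last.ckpt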

	# Strict mode: abort on command failure, unset variables and failed pipelines.
	set -euo pipefail

	# The checkpoint comes from $CHECKPOINT; the first script argument is the
	# fallback when that variable is unset or empty.
	CHECKPOINT=${CHECKPOINT:-${1:-}}

	# Guard clauses: both problems are reported on stderr and end the job with
	# status 2 before any output directory is created.
	[[ -n "${CHECKPOINT}" ]] || {
	  printf '%s\n' \
	    "ERROR: set CHECKPOINT=/path/to/dememwm.ckpt, e.g." \
	    " sbatch --export=ALL,CHECKPOINT=/share_1/users/bonan_ding/DeMemWM/outputs/<run>/train/checkpoints/last.ckpt scripts/dememwm_full_eval.slurm" >&2
	  exit 2
	}
	# -s is true only for a path that exists and has a size greater than zero.
	[[ -s "${CHECKPOINT}" ]] || {
	  printf '%s\n' "ERROR: checkpoint does not exist or is empty: ${CHECKPOINT}" >&2
	  exit 2
	}

	# Repository checkout, dataset root and precomputed feature directory.
	# Each keeps a value already present in the environment.
	REPO=${REPO:-/share_1/users/bonan_ding/DeMemWM}
	DATA_DIR=${DATA_DIR:-/share_1/users/bonan_ding/worldmem_data/minecraft}
	FEATURE_DIR=${FEATURE_DIR:-/share_1/users/bonan_ding/worldmem_data/minecraft/vae_features}

	# Per-run layout. RUN_TAG defaults to the Slurm job id, or to a timestamp
	# when the script is run outside Slurm; the other paths derive from it.
	RUN_TAG=${RUN_TAG:-dememwm_full_eval_${SLURM_JOB_ID:-manual_$(date +%Y%m%d_%H%M%S)}}
	RUN_ROOT=${RUN_ROOT:-${REPO}/outputs/${RUN_TAG}}
	EVAL_OUT=${EVAL_OUT:-${RUN_ROOT}/eval}
	LOG_DIR=${LOG_DIR:-${REPO}/slurm_logs/${RUN_TAG}}

	# RUN_ROOT is created explicitly: the manifest is written into it, and it is
	# only a parent of EVAL_OUT while EVAL_OUT keeps its default. Without this an
	# overridden EVAL_OUT would make the manifest write fail after the whole
	# evaluation has already run.
	mkdir -p "${RUN_ROOT}" "${EVAL_OUT}" "${LOG_DIR}" "${REPO}/slurm_logs"

	# Evaluation knobs. `: "${VAR:=default}"` assigns the default only when VAR is
	# unset or empty, so any value exported at submission time is kept.
	: "${DATASET_N_FRAMES:=300}"
	: "${N_FRAMES_VALID:=216}"
	: "${CONTEXT_FRAMES:=116}"
	: "${N_TOKENS:=8}"
	: "${SAMPLING_TIMESTEPS:=20}"
	: "${VAL_BATCH_SIZE:=1}"
	: "${VAL_LIMIT:=16}"
	: "${LOG_VIDEO:=true}"
	: "${SEED:=42}"
	: "${ABLATION_BRANCH:=A_plus_D_plus_R_normal}"

	# Consumed DeMemWM memory-shape knobs for current latent setup.
	# Anchor: ratio 6 over 18x32 -> 4 prefixes * 3x6 pooled slots = 72 tokens.
	# Revisit: ratio 3 over 18x32 -> 2 frames * 6x11 pooled slots = 132 tokens.
	: "${ANCHOR_DOWNSAMPLE_RATIO:=6}"
	: "${REVISIT_MAX_FRAMES:=2}"
	: "${REVISIT_DOWNSAMPLE_RATIO:=3}"

	cd "${REPO}"

	# Start-up files and conda activation hooks may reference unset variables.
	# Under nounset that aborts a non-interactive shell outright, and `|| true`
	# cannot catch it, so nounset is relaxed for this section only.
	set +u
	# Best effort: ~/.bashrc may be absent or may fail in a batch shell.
	source ~/.bashrc >/dev/null 2>&1 || true
	# Make the `conda` shell function available: prefer a conda already on PATH,
	# then the per-user install, then the shared install.
	if command -v conda >/dev/null 2>&1; then
	  eval "$(conda shell.bash hook)"
	elif [[ -f "${HOME}/.conda/etc/profile.d/conda.sh" ]]; then
	  source "${HOME}/.conda/etc/profile.d/conda.sh"
	elif [[ -f /share_0/conda/etc/profile.d/conda.sh ]]; then
	  source /share_0/conda/etc/profile.d/conda.sh
	fi
	conda activate worldmem
	set -u

	# `command -v` is a shell builtin, so this does not depend on `which` being
	# installed on the compute node.
	PY=$(command -v python)

	export PYTHONPATH="./:${PYTHONPATH:-}"
	export HYDRA_FULL_ERROR=1
	export PYTHONWARNINGS=ignore
	export OMP_NUM_THREADS="${SLURM_CPUS_PER_TASK:-16}"
	export WANDB_MODE=offline
	export NCCL_P2P_DISABLE=1
	# Best effort: WANDB_MODE above already requests offline mode, so a missing
	# or failing `wandb` CLI must not stop the job.
	wandb offline >/dev/null 2>&1 || true

	# Record the run identity and the execution environment in the job log.
	echo "JOB_ID=${SLURM_JOB_ID:-manual}"
	echo "RUN_TAG=${RUN_TAG}"
	echo "RUN_ROOT=${RUN_ROOT}"
	echo "CHECKPOINT=${CHECKPOINT}"
	echo "ABLATION_BRANCH=${ABLATION_BRANCH}"
	echo "HOST=$(hostname)"
	echo "START=$(date --iso-8601=seconds)"
	echo "PWD=$PWD"
	echo "PY=${PY}"
	"${PY}" --version

	# The probes below are informational only. `|| true` keeps errexit from
	# aborting the job when nvidia-smi or git is unavailable or fails.
	nvidia-smi || true
	nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits > "${LOG_DIR}/gpu_memory_before_mb.txt" || true
	git branch --show-current || true
	git rev-parse HEAD || true

	# Command-line overrides handed to `python -m main`, built group by group.
	# Element order is kept exactly as listed.
	# NOTE(review): the `+` / `++` prefixes look like Hydra override syntax
	# (HYDRA_FULL_ERROR is exported by this script) - confirm against main.
	EVAL_ARGS=()

	# Run identity, task selection and logging mode.
	EVAL_ARGS+=(
	  "+name=eval_${RUN_TAG}"
	  "+output_dir=${EVAL_OUT}/"
	  "experiment.tasks=[validation]"
	  "wandb.mode=offline"
	  "dataset.validation_multiplier=1"
	  "+dataset.seed=${SEED}"
	)

	# Algorithm selection and checkpoint loading.
	EVAL_ARGS+=(
	  "+customized_load=true"
	  "+seperate_load=false"
	  "algorithm=dememwm_memory_dit"
	  "load=${CHECKPOINT}"
	)

	# Dataset selection, locations and frame counts.
	EVAL_ARGS+=(
	  "dataset=video_minecraft_latent"
	  "dataset.save_dir=${DATA_DIR}"
	  "dataset.precomputed_feature_dir=${FEATURE_DIR}"
	  "dataset.n_frames=${DATASET_N_FRAMES}"
	  "+dataset.n_frames_valid=${N_FRAMES_VALID}"
	  "+dataset.customized_validation=true"
	  "+dataset.memory_condition_length=0"
	  "++dataset.angle_range=180"
	  "++dataset.pos_range=1000000000"
	)

	# Top-level algorithm and sampling settings.
	EVAL_ARGS+=(
	  "++algorithm.n_tokens=${N_TOKENS}"
	  "algorithm.x_shape=[16,18,32]"
	  "++algorithm.context_frames=${CONTEXT_FRAMES}"
	  "++algorithm.log_video=${LOG_VIDEO}"
	  "++algorithm.diffusion.sampling_timesteps=${SAMPLING_TIMESTEPS}"
	  "++algorithm.dememwm.debug_force_all_streams=false"
	  "++algorithm.dememwm.training_stage=stage_2"
	)

	# algorithm.dememwm.anchor.*
	EVAL_ARGS+=(
	  "++algorithm.dememwm.anchor.enabled=true"
	  "++algorithm.dememwm.anchor.anchor_indices=[0,1,2,3]"
	  "++algorithm.dememwm.anchor.diverse_selection=true"
	  "++algorithm.dememwm.anchor.compress.downsample_ratio=${ANCHOR_DOWNSAMPLE_RATIO}"
	  "++algorithm.dememwm.anchor.allow_generated_as_anchor=false"
	)

	# algorithm.dememwm.dynamic.*
	EVAL_ARGS+=(
	  "++algorithm.dememwm.dynamic.enabled=true"
	  "++algorithm.dememwm.dynamic.exclude_latest_local_frames=4"
	  "++algorithm.dememwm.dynamic.recent_frames=8"
	)

	# algorithm.dememwm.revisit.*
	EVAL_ARGS+=(
	  "++algorithm.dememwm.revisit.enabled=true"
	  "++algorithm.dememwm.revisit.deterministic_pose_retrieval=true"
	  "++algorithm.dememwm.revisit.fov_overlap_threshold=0.30"
	  "++algorithm.dememwm.revisit.high_quality_fov_threshold=0.70"
	  "++algorithm.dememwm.revisit.pose_preselect_topk=64"
	  "++algorithm.dememwm.revisit.fov_yaw_samples=25"
	  "++algorithm.dememwm.revisit.fov_pitch_samples=20"
	  "++algorithm.dememwm.revisit.fov_depth_samples=20"
	  "++algorithm.dememwm.revisit.plucker_weight=0.10"
	  "++algorithm.dememwm.revisit.max_frames=${REVISIT_MAX_FRAMES}"
	  "++algorithm.dememwm.revisit.compress.downsample_ratio=${REVISIT_DOWNSAMPLE_RATIO}"
	)

	# Stage policy logging and the ablation branch under evaluation.
	EVAL_ARGS+=(
	  "++algorithm.dememwm.stage_policy.noise_bucket_logging=true"
	  "++algorithm.dememwm.eval_ablation.enabled=true"
	  "++algorithm.dememwm.eval_ablation.branch=${ABLATION_BRANCH}"
	)

	# algorithm.dememwm.cache.*
	EVAL_ARGS+=(
	  "++algorithm.dememwm.cache.enabled=true"
	  "++algorithm.dememwm.cache.device=cpu"
	  "++algorithm.dememwm.cache.keep_raw_latents=all"
	  "++algorithm.dememwm.cache.keep_compressed_records=true"
	  "++algorithm.dememwm.cache.eviction_policy=none"
	  "++algorithm.dememwm.cache.no_evict=true"
	  "++algorithm.dememwm.cache.clear_between_videos=true"
	  "++algorithm.dememwm.cache.max_records=null"
	  "++algorithm.dememwm.cache.on_capacity_exceeded=warn"
	)

	# Validation loop size.
	EVAL_ARGS+=(
	  "experiment.validation.batch_size=${VAL_BATCH_SIZE}"
	  "experiment.validation.limit_batch=${VAL_LIMIT}"
	)

	# Keep the exact override list next to the logs, one override per line.
	printf '%s\n' "${EVAL_ARGS[@]}" > "${LOG_DIR}/eval_args.txt"
	echo "Launching evaluation..."
	# SECONDS is bash's built-in timer: after assigning 0 it reports the number
	# of seconds elapsed since the assignment.
	SECONDS=0
	srun "${PY}" -m main "${EVAL_ARGS[@]}"
	EVAL_DURATION_SECONDS=${SECONDS}
	# Best effort: a failing GPU memory probe must not fail a finished evaluation.
	nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits > "${LOG_DIR}/gpu_memory_after_mb.txt" || true

	# Summarise the run in a KEY=VALUE manifest. `<<-` strips leading tabs, so
	# the tab-indented terminator line is recognised and the entries are written
	# flush-left. The delimiter is unquoted so the variables and $(date) expand.
	cat > "${RUN_ROOT}/eval_manifest.txt" <<-MANIFEST
	RUN_TAG=${RUN_TAG}
	RUN_ROOT=${RUN_ROOT}
	EVAL_OUT=${EVAL_OUT}
	CHECKPOINT=${CHECKPOINT}
	ABLATION_BRANCH=${ABLATION_BRANCH}
	EVAL_DURATION_SECONDS=${EVAL_DURATION_SECONDS}
	GPU_MEMORY_BEFORE_MB_FILE=${LOG_DIR}/gpu_memory_before_mb.txt
	GPU_MEMORY_AFTER_MB_FILE=${LOG_DIR}/gpu_memory_after_mb.txt
	JOB_ID=${SLURM_JOB_ID:-manual}
	FINISHED=$(date --iso-8601=seconds)
	MANIFEST

	# Completion marker printed to the job's standard output.
	echo "DEMEMWM_FULL_EVAL_DONE $(date --iso-8601=seconds)"