# WorldMem_Repro / infer.sh
# BonanDing's picture
# Reproduce Training & Fix distributed eval
# 681f346
# Environment for the inference run. Everything below is exported so that
# the child processes started later in this script inherit it.

# Python warning filter, handed to the interpreter through the environment.
PYTHONWARNINGS=ignore
export PYTHONWARNINGS

# NCCL / torch.distributed diagnostic settings.
export NCCL_DEBUG=INFO NCCL_DEBUG_SUBSYS=COLL
export TORCH_NCCL_ASYNC_ERROR_HANDLING=1 TORCH_DISTRIBUTED_DEBUG=DETAIL

# Optional, but very helpful while debugging (at the cost of speed).
# NOTE(review): the original comment sat directly above
# TORCH_NCCL_BLOCKING_WAIT; whether it was also meant to cover
# NCCL_P2P_DISABLE is not clear from this file -- confirm before removing.
export TORCH_NCCL_BLOCKING_WAIT=1 NCCL_P2P_DISABLE=1
# Run the `wandb offline` CLI subcommand before launching, as the original
# script does. Its exit status is not checked, so the launch below proceeds
# even if this command fails.
wandb offline

# Launch the validation/inference run through the project's `main` module.
#
# The key=value and +key=value arguments look like Hydra-style config
# overrides -- TODO confirm against main.py. They are grouped as:
#   - run name and task selection
#   - checkpoint paths and load flags
#   - dataset.*   overrides
#   - algorithm.* overrides
#
# 'experiment.tasks=[validation]' is single-quoted on purpose: unquoted,
# the shell treats [validation] as a glob character class. It would then be
# replaced by any matching file name in the current directory (for example
# a file called "experiment.tasks=v"), or rejected outright by shells that
# error on unmatched globs. Quoting passes it to Python verbatim.
#
# NOTE(review): "seperate_load" is spelled as in the original; presumably
# it has to match the key the Python side reads, so it is left untouched.
#
# The checkpoint and dataset paths are absolute, machine-specific locations.
python -m main +name=infer \
  'experiment.tasks=[validation]' \
  dataset.validation_multiplier=1 \
  +diffusion_model_path=/share_1/users/bonan_ding/worldmem_ckpt/diffusion_only.ckpt \
  +vae_path=/share_1/users/bonan_ding/worldmem_ckpt/vae_only.ckpt \
  +customized_load=true \
  +seperate_load=true \
  dataset.n_frames=8 \
  dataset.save_dir=/share_1/users/bonan_ding/worldmem_data/minecraft \
  +dataset.n_frames_valid=700 \
  +dataset.memory_condition_length=8 \
  +dataset.customized_validation=true \
  +dataset.add_timestamp_embedding=true \
  +algorithm.n_tokens=8 \
  +algorithm.memory_condition_length=8 \
  algorithm.context_frames=600 \
  +algorithm.relative_embedding=true \
  +algorithm.log_video=true \
  +algorithm.add_timestamp_embedding=true