# File: DeMemWM/configurations/algorithm/dememwm_memory_dit.yaml
# Commit 93d7b0a (BonanDing): Clean DeMemWM deterministic memory slot handling
# Config composition list (presumably a Hydra defaults list — confirm):
# `base_video_dit` is listed first, then this file's own keys via `_self_`.
defaults:
  - base_video_dit
  - _self_
_name: dememwm_memory_dit

# Standalone Memory-DiT path; do not route through the old SSM-memory config.
memory_token_cross_attention: true
memory_cross_attn_layers: null
memory_condition_length: 0
pose_cond_dim: 5
log_video: false
# DeMemWM memory settings.
# NOTE(review): this section arrived with all indentation stripped, which
# turns `dememwm` into a null key and makes `enabled`, `compress` and
# `downsample_ratio` collide as duplicate top-level keys. The nesting below is
# reconstructed from the key grouping and from the `curriculum.enabled`
# reference in the comment on `training_stage`; confirm every parent/child
# boundary against the code that reads this config.
dememwm:
  enabled: true
  training_stage: stage_1  # fallback only when curriculum.enabled=false
  debug_force_all_streams: false

  curriculum:
    enabled: true
    full_stage_start_step: 60000

  # NOTE(review): `freeze_vae`, `dit_freeze` and `lr` are assumed to be
  # siblings of `curriculum`, not children of it — confirm.
  freeze_vae: true
  dit_freeze:
    enabled: true

  lr:
    dememwm_modules: 1.0e-4
    memory_adapters: 1.0e-4
    full_dit: 1.0e-5

  # Current Conv2D memory projectors preserve latent H,W=(18,32).
  # Pool sizes are resolved from projected spatial grid size and
  # downsample ratios.
  token_patch_size: 2

  anchor:
    enabled: true
    anchor_indices: [0, 1, 2, 3]
    allow_generated_as_anchor: false
    diverse_selection: true
    # NOTE(review): `compress` appears twice in the file, so it cannot live at
    # a shared level; it is assumed to belong to the preceding stream.
    compress:
      downsample_ratio: 4

  dynamic:
    enabled: true
    exclude_latest_local_frames: 4
    recent_frames: 8
    conv_kernel_t: 3
    conv_stride_t: 2

  revisit:
    enabled: true
    deterministic_pose_retrieval: true
    fov_overlap_threshold: 0.30
    high_quality_fov_threshold: 0.70
    plucker_weight: 0.10
    max_frames: 2
    # FoV geometry for coverage-based retrieval scoring.
    # fov_half_h/v: half-angles (degrees) of the horizontal/vertical field of
    #   view.
    # fov_radius: world-space radius of the sample sphere.
    # fov_{yaw,pitch,depth}_samples: grid resolution for FoV point sampling.
    fov_half_h: 52.5  # 105 deg total horizontal FoV
    fov_half_v: 37.5  # 75 deg total vertical FoV
    fov_radius: 30.0
    fov_yaw_samples: 25
    fov_pitch_samples: 20
    fov_depth_samples: 20
    pose_preselect_topk: 64
    # Plucker descriptor grid for secondary pose-similarity scoring.
    plucker_grid_h: 4
    plucker_grid_w: 4
    plucker_focal_length: 0.35
    compress:
      downsample_ratio: 4

  # NOTE(review): the sections below are assumed to sit directly under
  # `dememwm`; `checkpoint` in particular could be a top-level key — confirm.
  stage_policy:
    noise_bucket_logging: true

  eval_ablation:
    enabled: false
    branch: A_plus_D_plus_R_normal

  generated_history_proxy:
    enabled: false
    start_step: 0
    ramp_steps: 1
    max_prob: 0.0
    noise_std: 0.25
    dropout_prob: 0.0

  injection:
    dit_hidden_size: 1024
    anchor_gate: 1.0
    dynamic_gate: 1.0
    revisit_gate: 1.0

  cache:
    enabled: true
    device: cpu
    keep_raw_latents: all
    keep_compressed_records: true
    keep_prefix_anchors: true
    eviction_policy: none
    no_evict: true
    clear_between_videos: true
    max_records: null
    on_capacity_exceeded: warn

  checkpoint:
    strict_dememwm_eval_load: true
# Override of the diffusion backbone size.
# NOTE(review): nesting restored from a flattened `diffusion:` /
# `architecture:` / `network_size` run; presumably this overrides a value
# inherited from `base_video_dit` — confirm.
diffusion:
  architecture:
    network_size: 64