# Hydra algorithm config for the standalone DeMemWM Memory-DiT path.
#
# NOTE(review): the nesting below was reconstructed from key names and the
# existing comments (e.g. the reference to `curriculum.enabled`). Confirm the
# hierarchy against the code that consumes this config before relying on it.
defaults:
  - base_video_dit
  - _self_

_name: dememwm_memory_dit

# Standalone Memory-DiT path. Do not route through old SSM-memory config.
memory_token_cross_attention: true
memory_cross_attn_layers: null
memory_condition_length: 0
pose_cond_dim: 5
log_video: false

dememwm:
  enabled: true
  training_stage: stage_1  # fallback only when curriculum.enabled=false
  debug_force_all_streams: false

  curriculum:
    enabled: true
    full_stage_start_step: 60000

  # NOTE(review): freeze_vae, dit_freeze and lr are assumed to be direct
  # children of `dememwm` rather than of `curriculum` -- confirm.
  freeze_vae: true
  dit_freeze:
    enabled: true
  lr:
    dememwm_modules: 1.0e-4
    memory_adapters: 1.0e-4
    full_dit: 1.0e-5

  # Current Conv2D memory projectors preserve latent H,W=(18,32).
  # Pool sizes are resolved from projected spatial grid size and downsample ratios.
  token_patch_size: 2

  anchor:
    enabled: true
    anchor_indices: [0, 1, 2, 3]
    allow_generated_as_anchor: false
    diverse_selection: true
    compress:
      downsample_ratio: 4

  dynamic:
    enabled: true
    exclude_latest_local_frames: 4
    recent_frames: 8
    conv_kernel_t: 3
    conv_stride_t: 2

  revisit:
    enabled: true
    deterministic_pose_retrieval: true
    fov_overlap_threshold: 0.30
    high_quality_fov_threshold: 0.70
    plucker_weight: 0.10
    max_frames: 2
    # FoV geometry for coverage-based retrieval scoring.
    # fov_half_h/v: half-angles (degrees) of the horizontal/vertical field of view.
    # fov_radius: world-space radius of the sample sphere.
    # fov_{yaw,pitch,depth}_samples: grid resolution for FoV point sampling.
    fov_half_h: 52.5  # 105 deg total horizontal FoV
    fov_half_v: 37.5  # 75 deg total vertical FoV
    fov_radius: 30.0
    fov_yaw_samples: 25
    fov_pitch_samples: 20
    fov_depth_samples: 20
    pose_preselect_topk: 64
    # Plucker descriptor grid for secondary pose-similarity scoring.
    plucker_grid_h: 4
    plucker_grid_w: 4
    plucker_focal_length: 0.35
    compress:
      downsample_ratio: 4

  stage_policy:
    noise_bucket_logging: true

  eval_ablation:
    enabled: false
    branch: A_plus_D_plus_R_normal

  # NOTE(review): assumed to be a sibling of `eval_ablation`, not nested
  # inside it -- confirm.
  generated_history_proxy:
    enabled: false
    start_step: 0
    ramp_steps: 1
    max_prob: 0.0
    noise_std: 0.25
    dropout_prob: 0.0

  injection:
    dit_hidden_size: 1024
    anchor_gate: 1.0
    dynamic_gate: 1.0
    revisit_gate: 1.0

  cache:
    enabled: true
    device: cpu
    keep_raw_latents: all
    keep_compressed_records: true
    keep_prefix_anchors: true
    eviction_policy: none
    no_evict: true
    clear_between_videos: true
    max_records: null
    on_capacity_exceeded: warn

  checkpoint:
    strict_dememwm_eval_load: true

# NOTE(review): assumed to be a top-level override (sibling of `dememwm`)
# -- confirm.
diffusion:
  architecture:
    network_size: 64