# Viewer header kept from the original paste:
# file size 2,675 bytes, ref b47a1ce.

# Hydra-style defaults list. Entries are composed in order, so placing `_self_`
# after `base_video_dit` lets the keys in this file take precedence over the
# values coming from the base config.
defaults:
  - base_video_dit
  - _self_

_name: 'dememwm_memory_dit'

# Memory-DiT runs as a standalone path here; it must not be routed through the
# old SSM-memory config.
memory_token_cross_attention: true
memory_cross_attn_layers: null
memory_condition_length: 0
pose_cond_dim: 5
log_video: false

dememwm:
  enabled: true
  # Fallback value: used only when curriculum.enabled=false.
  training_stage: 'stage_1'
  debug_force_all_streams: false
  # Curriculum settings. While `enabled` is true here, the sibling
  # `training_stage` key is only a fallback (per that key's own comment).
  curriculum:
    enabled: true
    # NOTE(review): presumably the step at which the full stage begins —
    # confirm against the trainer.
    full_stage_start_step: 60000
    freeze_vae: true
    dit_freeze:
      enabled: true
    # Per-group values; `full_dit` is set 10x lower than the other two groups.
    # NOTE(review): `lr` is presumably the learning rate — confirm.
    lr:
      dememwm_modules: 1.0e-4
      memory_adapters: 1.0e-4
      full_dit: 1.0e-5
  # The current Conv2D memory projectors preserve the latent spatial size
  # H,W=(18,32).
  # Pool sizes are not set directly: they are resolved from the projected
  # spatial grid size and the downsample ratios.
  token_patch_size: 2
  # Settings for the `anchor` component.
  anchor:
    enabled: true
    # Indices 0 through 3, one entry per line.
    anchor_indices:
      - 0
      - 1
      - 2
      - 3
    allow_generated_as_anchor: false
    diverse_selection: true
    compress:
      downsample_ratio: 4
  # Settings for the `dynamic` component.
  # NOTE(review): the `_t` suffix on the conv keys presumably denotes the
  # temporal axis — confirm against the consumer.
  dynamic:
    enabled: true
    exclude_latest_local_frames: 4
    recent_frames: 8
    conv_kernel_t: 3
    conv_stride_t: 2
  # Settings for the `revisit` component.
  revisit:
    enabled: true
    deterministic_pose_retrieval: true
    fov_overlap_threshold: 0.3
    high_quality_fov_threshold: 0.7
    plucker_weight: 0.1
    max_frames: 2
    # Field-of-view (FoV) geometry used by coverage-based retrieval scoring:
    #   fov_half_h / fov_half_v        half-angles (degrees) of the horizontal /
    #                                  vertical field of view
    #   fov_radius                     world-space radius of the sample sphere
    #   fov_{yaw,pitch,depth}_samples  grid resolution for FoV point sampling
    # Half-angle of 52.5 deg, i.e. 105 deg total horizontal FoV.
    fov_half_h: 52.5
    # Half-angle of 37.5 deg, i.e. 75 deg total vertical FoV.
    fov_half_v: 37.5
    fov_radius: 30.0
    fov_yaw_samples: 25
    fov_pitch_samples: 20
    fov_depth_samples: 20
    pose_preselect_topk: 64
    # Plucker descriptor grid, used for secondary pose-similarity scoring.
    plucker_grid_h: 4
    plucker_grid_w: 4
    plucker_focal_length: 0.35
    compress:
      downsample_ratio: 4
  # Stage policy options; noise-bucket logging is the only key set here and it
  # is switched on.
  stage_policy:
    noise_bucket_logging: true
  # Evaluation ablation is disabled in this config.
  eval_ablation:
    enabled: false
    branch: 'A_plus_D_plus_R_normal'
  # Disabled in this config: `enabled` is false and `max_prob` is 0.0.
  generated_history_proxy:
    enabled: false
    start_step: 0
    ramp_steps: 1
    max_prob: 0.0
    noise_std: 0.25
    dropout_prob: 0.0
  # All three per-component gates (anchor, dynamic, revisit) are set to 1.0.
  # NOTE(review): `dit_hidden_size` presumably has to match the DiT width
  # configured elsewhere — confirm.
  injection:
    dit_hidden_size: 1024
    anchor_gate: 1.0
    dynamic_gate: 1.0
    revisit_gate: 1.0
  # Cache settings. String-valued options are quoted so they cannot be mistaken
  # for booleans or nulls.
  cache:
    enabled: true
    device: 'cpu'
    keep_raw_latents: 'all'
    keep_compressed_records: true
    keep_prefix_anchors: true
    # This is the string "none", not a YAML null.
    eviction_policy: 'none'
    no_evict: true
    clear_between_videos: true
    # No record cap is set (explicit null).
    max_records: null
    on_capacity_exceeded: 'warn'
  # NOTE(review): presumably requests strict loading of the dememwm weights at
  # evaluation time — confirm against the checkpoint loader.
  checkpoint:
    strict_dememwm_eval_load: true

# Because `_self_` comes last in the defaults list, this value takes precedence
# over any `diffusion.architecture.network_size` supplied by `base_video_dit`.
diffusion:
  architecture:
    network_size: 64