File size: 12,255 Bytes
32f1686
 
 
 
 
 
 
10a2918
32f1686
 
 
 
10a2918
32f1686
 
 
 
 
 
 
10a2918
32f1686
 
 
 
10a2918
32f1686
10a2918
 
32f1686
 
10a2918
 
32f1686
 
10a2918
32f1686
 
 
10a2918
32f1686
 
 
10a2918
32f1686
 
 
 
 
 
 
 
10a2918
 
 
 
 
 
32f1686
 
 
 
10a2918
 
 
 
 
 
32f1686
 
 
10a2918
32f1686
10a2918
32f1686
 
 
 
 
 
 
 
 
 
 
10a2918
32f1686
10a2918
32f1686
 
 
 
 
 
 
10a2918
32f1686
 
 
 
 
 
 
 
 
 
10a2918
 
32f1686
10a2918
32f1686
 
 
 
 
 
 
 
 
 
 
 
 
10a2918
 
 
 
 
32f1686
 
 
 
 
 
 
 
 
 
10a2918
32f1686
 
 
 
 
 
 
 
 
 
 
 
 
 
10a2918
32f1686
 
 
 
 
 
 
 
 
 
10a2918
32f1686
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10a2918
 
 
 
32f1686
 
 
10a2918
32f1686
 
 
 
 
 
 
 
 
10a2918
32f1686
10a2918
32f1686
 
 
 
 
 
 
10a2918
32f1686
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10a2918
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
### BASELINE: CONVERGES AFTER LONG

parameters:

  ### MODEL ARCHITECTURE                                                       
  MODEL:
    value:
      MODEL_CLASS: "UnReflect_Model"  # Main model class name (must match class in models.py)  # <<<<<<<<< DECODER PRETRAINING: NOT USING TOKEN INPAINTER (DIRECT FROM DINO)
      MODEL_MODULE: "models"  # Module name to import model classes from (default: "models")
      RGB_ENCODER:
        ENCODER: "facebook/dinov3-vitl16-pretrain-lvd1689m"  # DINOv3 encoder model name (HuggingFace format)
        IMAGE_SIZE: 896  # Input image size (height and width in pixels)
        RETURN_SELECTED_LAYERS: [3, 6, 9, 12]  # Transformer layer indices to extract features from (0-indexed)
        RGB_ENCODER_LR: 0.0  # Learning rate for RGB encoder (0.0 = frozen, must be explicitly set)
      DECODERS:
        highlight:
          FEATURE_DIM: 1024  # Feature dimension for highlight decoder
          REASSEMBLE_OUT_CHANNELS: [96,192,384,768]  # Output channels for each decoder stage
          REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5]  # Spatial upsampling factors for each stage
          READOUT_TYPE: "ignore"  # Readout type for DPT decoder
          # FROM_PRETRAINED: "highlight_decoder.pt"  # Path to pretrained token inpainter weights (optional)
          USE_BN: False  # Use batch normalization in decoder
          DROPOUT: 0.1  # Dropout rate in decoder layers
          OUTPUT_IMAGE_SIZE: [896,896]  # Output image resolution [height, width]
          OUTPUT_CHANNELS: 1  # Number of output channels (1 for highlight mask)
          DECODER_LR: 1.0e-6  # Custom learning rate for decoder (0.0 = frozen, 1.0 = same as base LR)
          NUM_FUSION_BLOCKS_TRAINABLE: null  # Number of fusion blocks to train (0-4, null = train all if DECODER_LR != 0)

      TOKEN_INPAINTER: # <<<<<<<<<<<< DOESNT MATTER, MODEL CLASS IS NOT TOKEN INPAINTER
        TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior"  # Token inpainter class name
        TOKEN_INPAINTER_MODULE: "token_inpainters"  # Module name to import token inpainter from
        FROM_PRETRAINED: "token_inpainter.pth"  # Path to pretrained token inpainter weights (optional)
        TOKEN_INPAINTER_LR: 1.0e-5  # Learning rate for token inpainter (can differ from base LR)
        DEPTH: 6  # Number of transformer blocks
        HEADS: 16  # Number of attention heads
        DROP: 0  # Dropout rate
        USE_POSITIONAL_ENCODING: True  # Enable 2D sinusoidal positional encodings
        USE_FINAL_NORM: True  # Enable final LayerNorm before output projection
        USE_LOCAL_PRIOR: True  # Blend local mean prior for masked seeds
        LOCAL_PRIOR_WEIGHT: 0.5  # Weight for local prior blending (1.0 = only mask_token, 0.0 = only local mean)
        LOCAL_PRIOR_KERNEL: 5  # Kernel size for local prior blending (> 1)
        SEED_NOISE_STD: 0.02  # Standard deviation of noise added to masked seeds during training  
  INPAINT_MASK_DILATION:
    value: 3  # Dilation kernel size (pixels) for inpaint mask - Must be odd
  USE_TORCH_COMPILE:  # Enable PyTorch 2.0 torch.compile for faster training (experimental)
    value: False
  DISTRIBUTE:
    value: "ddp"

  ### DATA                                                                      
  DATASETS:
    value:
      SHIQ:
        VAL_SCENES: ["test"]
        RESIZE_MODE: "resize+crop"
        TARGET_SIZE: [896,896]
        SAMPLE_EVERY_N: 2
      
      SCARED:
        VAL_SCENES: ["v22","v23","v24","v25","v26","v27","v28","v29","v30","v31","v32","v33","v34"]  # Validation scene names
        RESIZE_MODE: "resize+crop"  # Image resizing mode
        SAMPLE_EVERY_N: 4  # Load every Nth frame
              
      ALL_DATASETS:
        FEW_IMAGES: False
        TARGET_SIZE: [896,896]
        LOAD_RGB_ONLY: True
        LOAD_HIGHLIGHT: True


  BATCH_SIZE: # Max batch size with img size 896 is 32
    value: 20 # Number of samples per batch (adjust based on GPU memory)
  NUM_WORKERS:
    value: 12  # Number of data loading worker processes (0 = main process only, "auto" = 90% of CPU affinity)
  SHUFFLE:
    value: True  # Shuffle training data each epoch (False for validation/test)
  PIN_MEMORY:
    value: True  # Pin memory in DataLoader for faster GPU transfer (recommended: True)
  PREFETCH_FACTOR:
    value: 2  # Number of batches to prefetch per worker (higher = more memory usage)
  
  ### HIGHLIGHTS                                                        
  MOGE_MODEL:
    value: "Ruicheng/moge-2-vits-normal"  # MoGe model name for normal estimation (HuggingFace format)
  SURFACE_ROUGHNESS:
    value: 100.0  # Blinn-Phong surface roughness exponent (higher = sharper highlights)
  INTENSITY:
    value: 0.8  # Specular highlight intensity multiplier
  LIGHT_DISTANCE_RANGE:
    value: [0.0, 1]  # Range for light source distance sampling [min, max] (normalized)
  LIGHT_LEFT_RIGHT_ANGLE:
    value: [0, 360]  # Range for light source horizontal angle [min, max] in degrees
  LIGHT_ABOVE_BELOW_ANGLE:
    value: [0, 360]  # Range for light source vertical angle [min, max] in degrees
  DATASET_HIGHLIGHT_DILATION:
    value: 25  # Dilation kernel size (pixels) for dataset highlight masks
  DATASET_HIGHLIGHT_THRESHOLD:
    value: 0.9  # Brightness/luminance threshold (0-1) for detecting highlights in dataset images
  DATASET_HIGHLIGHT_USE_LUMINANCE:
    value: True  # If True, use perceptually-weighted luminance (0.299*R + 0.587*G + 0.114*B) for dataset highlights; if False, use simple mean brightness
  HIGHLIGHT_COLOR:
    value: [1.0, 1.0, 1.0]  # RGB color for synthetic highlights (normalized 0-1)
  CLAMP_RECONSTRUCTION:
    value: True  # Clamp reconstructed images to [0, 1] range if True

  ### OPTIMIZATION                                                                      
  EPOCHS:
    value: 20  # Maximum number of training epochs
  LEARNING_RATE:
    value: 1.0e-4  # Base learning rate for optimizer
  WEIGHT_DECAY:
    value: 0.0  # L2 regularization weight (0.0 = no weight decay)
  GRADIENT_ACCUMULATION_STEPS:
    value: 1  # Number of steps to accumulate gradients before optimizer step (1 = no accumulation)
  WARMUP:
    value: 100  # Number of warmup steps for learning rate schedule (linear warmup from 0 to LR)
  GRADIENT_CLIPPING_MAX_NORM:
    value: 8  # Maximum gradient norm for clipping (set to -1 to disable clipping)
  LR_SCHEDULER:
    value:
      ONPLATEAU:  # ReduceLROnPlateau scheduler (reduces LR when validation metric plateaus)
        PATIENCE: 5  # Number of epochs to wait before reducing LR
        FACTOR: 0.1  # Factor by which LR is reduced (new_lr = old_lr * factor)
      COSINE:  # CosineAnnealingLR scheduler (cosine annealing schedule)
        N_PERIODS: 1  # Number of cosine periods over training
      # STEPWISE:  # StepLR scheduler (reduces LR at fixed step intervals)
      #   N_STEPS: 5  # Number of times to reduce LR during training
      #   GAMMA: 0.25  # Factor by which LR is reduced at each step (new_lr = old_lr * gamma)
      # EXPONENTIAL:  # ExponentialLR scheduler (exponential decay)
      #   GAMMA: 0.5  # Multiplicative factor for exponential decay

  SWITCH_OPTIMIZER_EPOCH:
    value: null  # Epoch number to switch from bootstrap to refining optimizer (null = no switch)
  OPTIMIZER_BOOTSTRAP_NAME:
    value: "AdamW"  # Optimizer name for initial training phase ("Adam", "SGD", etc.)
  OPTIMIZER_REFINING_NAME:
    value: "AdamW"  # Optimizer name for refining phase (used after SWITCH_OPTIMIZER_EPOCH)
  EARLY_STOPPING_PATIENCE:
    value: 20  # Number of epochs without improvement before stopping training
  SAVE_INTERVAL:
    value: 1000  # Number of training steps between model checkpoints
  
  DATASET_HIGHLIGHT_SUPERVISION_THRESHOLD:
    value: 0.1  # Pixel highlights above this threshold (should be low) are excluded from supervision

  ### LOSS WEIGHTS (relative to the total loss, NOT NORMALIZED LATER)                                                                              
  SPECULAR_LOSS_WEIGHT:
    value: 0.0  # Weight for specular component reconstruction loss
  DIFFUSE_LOSS_WEIGHT:
    value: 0.0  # Weight for diffuse component reconstruction loss
  HIGHLIGHT_LOSS_WEIGHT:
    value: 1.0  # Weight for highlight mask regression loss
  TOKEN_INPAINT_LOSS_WEIGHT:
    value: 0.0  # Weight for token-space inpainting loss (L1 + cosine similarity in feature space)

  IMAGE_RECONSTRUCTION_LOSS_WEIGHT:
    value: 0.0  # Weight for full image reconstruction loss
  SATURATION_RING_LOSS_WEIGHT:
    value: 0.0  # Weight for saturation ring consistency loss (around highlight regions)
  RING_KERNEL_SIZE:
    value: 11  # Kernel size (odd number) for saturation ring dilation around highlights
  RING_VAR_WEIGHT:
    value: 0.5  # Weight for variance matching in saturation ring loss (vs mean matching)
  RING_TEXTURE_WEIGHT:
    value: 0.0  # Weight for texture consistency term in saturation ring loss
  HLREG_W_L1:
    value: 1.0  # Weight for L1 loss in highlight regression
  HLREG_USE_CHARB:
    value: True  # Use Charbonnier loss (smooth L1) instead of standard L1 if True
  HLREG_W_DICE:
    value: 0.2  # Weight for Dice loss in highlight regression (for mask overlap)
  HLREG_W_SSIM:
    value: 0.0  # Weight for SSIM loss in highlight regression
  HLREG_W_GRAD:
    value: 0.0  # Weight for gradient loss in highlight regression
  HLREG_W_TV:
    value: 0.0  # Weight for total variation loss in highlight regression
  HLREG_BALANCE_MODE:
    value: "auto"   # Class balancing mode for highlight regression: 'none' | 'auto' | 'pos_weight'
  HLREG_POS_WEIGHT:
    value: 1.0      # Positive class weight (used only if BALANCE_MODE == 'pos_weight')
  HLREG_FOCAL_GAMMA:
    value: 2.0      # Focal loss gamma parameter (0.0 = standard BCE, 1.0-2.0 helps with gradient vanishing)
  
  WEIGHT_CONTEXT_IDENTITY: 
    value: 0.0  # LEAVE TO 0.0: Weight for L1 loss on context (non-masked) regions (identity preservation) 
  WEIGHT_TV_IN_HOLE:
    value: 0.0  # LEAVE TO 0.0: Weight for total variation loss inside masked/hole regions
  RING_DILATE_KERNEL:
    value: 17  # Dilation kernel size (odd number) for creating ring mask around highlights
  WEIGHT_SEAM:
    value: 0.0  # Weight for gradient matching loss on saturation ring
  SEAM_USE_CHARB:
    value: True  # Use Charbonnier loss instead of L1 in seam loss (smooth L1 for boundary consistency)
  SEAM_WEIGHT_GRAD:
    value: 0.0  # Weight for gradient matching term inside seam loss (0.0 = disable gradient term)
  TOKEN_FEAT_ALPHA:
    value: 0.5  # Mixing factor for token feature loss: alpha * L1 + (1-alpha) * (1-cosine_sim)

  ### DIFFUSE HIGHLIGHT PENALTY
  WEIGHT_DIFFUSE_HIGHLIGHT_PENALTY:
    value: 0.0  # Weight for penalty loss on highlights in diffuse decoder output (0.0 = disabled)
  DIFFUSE_HL_THRESHOLD:
    value: 0.8  # Brightness/luminance threshold for detecting highlights in diffuse (0.0-1.0)
  DIFFUSE_HL_USE_CHARB:
    value: True  # Use Charbonnier loss instead of L1 for diffuse highlight penalty
  DIFFUSE_HL_PENALTY_MODE:
    value: "brightness"  # Penalty mode: "brightness" (penalize brightness/luminance above threshold) or "pixel" (penalize RGB values directly)
  DIFFUSE_HL_TARGET_BRIGHTNESS:
    value: null  # Target brightness/luminance for penalized pixels (null = use threshold value)
  DIFFUSE_HL_USE_LUMINANCE:
    value: True  # If True, use perceptually-weighted luminance (0.299*R + 0.587*G + 0.114*B); if False, use simple mean brightness

  ### LOGGING, RESULTS AND WANDB                                                                 
  LOG_INTERVAL:
    value: 1  # Number of training steps between console log outputs
  WANDB_LOG_INTERVAL:
    value: 1  # Number of training steps between WandB metric logs
  IMAGE_LOG_INTERVAL:
    value: 5  # Number of training steps between image logging to WandB
  NO_WANDB:
    value: False  # Disable WandB logging if True (useful for local debugging)
  MODEL_WATCHER_FREQ_WANDB:
    value: 50  # Frequency (in steps) for logging model parameter histograms to WandB
  WANDB_ENTITY:
    value: "unreflect-anything"  # WandB organization/entity name
  WANDB_PROJECT:
    value: "UnReflectAnything"  # WandB project name
  NOTES:
    value: ""  # Notes/description for this training run