AlbeRota commited on
Commit
10a2918
·
verified ·
1 Parent(s): 270f4e7

Upload weights, notebooks, sample images

Browse files
configs/decoder_pretrain.yaml CHANGED
@@ -18,7 +18,7 @@ parameters:
18
  REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
19
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
20
  READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
21
- # FROM_PRETRAINED: "weights/rgb_decoder.pth" # Path to pretrained decoder weights (optional) # <<<<<<<<< DECODER PRETRAINING: NO WEIGHTS HERE
22
  USE_BN: False # Use batch normalization in decoder
23
  DROPOUT: 0.1 # Dropout rate in decoder layers
24
  OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
@@ -29,7 +29,7 @@ parameters:
29
  TOKEN_INPAINTER: # <<<<<<<<<<<< DOESNT MATTER, MODEL CLASS IS NOT TOKEN INPAINTER
30
  TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
31
  TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
32
- FROM_PRETRAINED: "weights/token_inpainter.pth" # Path to pretrained token inpainter weights (optional)
33
  TOKEN_INPAINTER_LR: 1.0e-5 # Learning rate for token inpainter (can differ from base LR)
34
  DEPTH: 6 # Number of transformer blocks
35
  HEADS: 16 # Number of attention heads
@@ -140,7 +140,7 @@ parameters:
140
  HIGHLIGHT_RETURN_RECT_AS_RGB: False # Return highlight rectangle as RGB if True
141
  HIGHLIGHT_RETURN_RECT: True # Return highlight rectangle region if True
142
 
143
- FEW_IMAGES_OVERRIDE:
144
  value: False # If True, override all datasets' FEW_IMAGES to True (for quick debugging across all datasets)
145
 
146
  BATCH_SIZE: # Max batch size with img size 896 is 32
 
18
  REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
19
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
20
  READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
21
+ # FROM_PRETRAINED: "rgb_decoder.pth" # Path to pretrained decoder weights (optional) # <<<<<<<<< DECODER PRETRAINING: NO WEIGHTS HERE
22
  USE_BN: False # Use batch normalization in decoder
23
  DROPOUT: 0.1 # Dropout rate in decoder layers
24
  OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
 
29
  TOKEN_INPAINTER: # <<<<<<<<<<<< DOESNT MATTER, MODEL CLASS IS NOT TOKEN INPAINTER
30
  TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
31
  TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
32
+ FROM_PRETRAINED: "token_inpainter.pth" # Path to pretrained token inpainter weights (optional)
33
  TOKEN_INPAINTER_LR: 1.0e-5 # Learning rate for token inpainter (can differ from base LR)
34
  DEPTH: 6 # Number of transformer blocks
35
  HEADS: 16 # Number of attention heads
 
140
  HIGHLIGHT_RETURN_RECT_AS_RGB: False # Return highlight rectangle as RGB if True
141
  HIGHLIGHT_RETURN_RECT: True # Return highlight rectangle region if True
142
 
143
+ FEW_IMAGES_ALL_DATASETS:
144
  value: False # If True, override all datasets' FEW_IMAGES to True (for quick debugging across all datasets)
145
 
146
  BATCH_SIZE: # Max batch size with img size 896 is 32
configs/end2end.yaml ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### BASELINE: CONVERGES AFTER LONG
2
+
3
+ parameters:
4
+
5
+ ### MODEL ARCHITECTURE
6
+ MODEL:
7
+ value:
8
+ MODEL_CLASS: "UnReflect_Model_TokenInpainter" # Main model class name (must match class in models.py)
9
+ MODEL_MODULE: "models" # Module name to import model classes from (default: "models")
10
+ RGB_ENCODER:
11
+ ENCODER: "facebook/dinov3-vitl16-pretrain-lvd1689m" # DINOv3 encoder model name (HuggingFace format)
12
+ IMAGE_SIZE: 896 # Input image size (height and width in pixels)
13
+ RETURN_SELECTED_LAYERS: [3, 6, 9 , 12] # Transformer layer indices to extract features from (0-indexed)
14
+ RGB_ENCODER_LR: 0.0 # Learning rate for RGB encoder (0.0 = frozen, must be explicitly set)
15
+ DECODERS:
16
+ diffuse:
17
+ FEATURE_DIM: 1024 # Feature dimension for decoder (should match encoder output)
18
+ REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
19
+ REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
20
+ READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
21
+ FROM_PRETRAINED: "diffuse_decoder.pt" # Path to pretrained decoder weights (optional)
22
+ USE_BN: False # Use batch normalization in decoder
23
+ DROPOUT: 0.1 # Dropout rate in decoder layers
24
+ OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
25
+ OUTPUT_CHANNELS: 3 # Number of output channels (3 for RGB diffuse image)
26
+ DECODER_LR: 1.0e-5 # Custom learning rate for decoder (0.0 = frozen, 1.0 = same as base LR)
27
+ NUM_FUSION_BLOCKS_TRAINABLE: 1 # Number of fusion blocks to train (0-4, null = train all if DECODER_LR != 0)
28
+ TRAIN_RGB_HEAD: True # Whether to train RGB head (true/false, null = train if DECODER_LR != 0)
29
+ highlight:
30
+ FEATURE_DIM: 1024 # Feature dimension for highlight decoder
31
+ REASSEMBLE_OUT_CHANNELS: [96,192,384,768] # Output channels for each decoder stage
32
+ REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
33
+ READOUT_TYPE: "ignore" # Readout type for DPT decoder
34
+ # FROM_PRETRAINED: "highlight_decoder.pt" # Path to pretrained highlight decoder weights (optional)
35
+ USE_BN: False # Use batch normalization in decoder
36
+ DROPOUT: 0.1 # Dropout rate in decoder layers
37
+ OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
38
+ OUTPUT_CHANNELS: 1 # Number of output channels (1 for highlight mask)
39
+ DECODER_LR: 1.0e-5 # Custom learning rate for decoder (0.0 = frozen, 1.0 = same as base LR)
40
+ NUM_FUSION_BLOCKS_TRAINABLE: null # Number of fusion blocks to train (0-4, null = train all if DECODER_LR != 0)
41
+ TOKEN_INPAINTER:
42
+ TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
43
+ TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
44
+ # FROM_PRETRAINED: "token_inpainter.pt" # Path to pretrained token inpainter weights (optional)
45
+ TOKEN_INPAINTER_LR: 5.0e-4 # Learning rate for token inpainter (can differ from base LR)
46
+ DEPTH: 6 # Number of transformer blocks
47
+ HEADS: 16 # Number of attention heads
48
+ DROP: 0.05 # Dropout rate
49
+ USE_POSITIONAL_ENCODING: True # Enable 2D sinusoidal positional encodings
50
+ USE_FINAL_NORM: True # Enable final LayerNorm before output projection
51
+ USE_LOCAL_PRIOR: True # Blend local mean prior for masked seeds
52
+ LOCAL_PRIOR_WEIGHT: 0.85 # Weight for local prior blending (1.0 = only mask_token, 0.0 = only local mean)
53
+ LOCAL_PRIOR_KERNEL: 5 # Kernel size for local prior blending (> 1)
54
+ SEED_NOISE_STD: 0.02 # Standard deviation of noise added to masked seeds during training
55
+
56
+ INPAINT_MASK_DILATION:
57
+ value: 31 # Dilation kernel size (pixels) for inpaint mask - Must be odd
58
+ USE_TORCH_COMPILE: # Enable PyTorch 2.0 torch.compile for faster training (experimental)
59
+ value: False
60
+ DISTRIBUTE:
61
+ value: "ddp"
62
+
63
+ ### DATA
64
+ DATASETS:
65
+ value:
66
+ # Reserved key: key-value pairs here override the same keys for every dataset (per-dataset entries still override this).
67
+ SCRREAM:
68
+ VAL_SCENES: ["scene10", "scene04"] # List of validation scene names
69
+ RESIZE_MODE: "resize+crop" # Image resizing mode: "resize", "crop", "resize+crop", or "pad"
70
+ SAMPLE_EVERY_N: 6 # Load every Nth frame from each scene (1 = all frames, 4 = every 4th frame)
71
+
72
+ HOUSECAT6D:
73
+ VAL_SCENES: ["val_scene1","val_scene2"] # Validation scene names
74
+ RESIZE_MODE: "resize+crop" # Image resizing mode
75
+ SAMPLE_EVERY_N: 4 # Load every Nth frame
76
+
77
+ CROMO:
78
+ TRAIN_SCENES: ["kitchen"] # Training scene names (list or string)
79
+ # VAL_SCENES: "station" # Validation scene names (optional)
80
+ RESIZE_MODE: "resize" # Image resizing mode
81
+ SAMPLE_EVERY_N: 2 # Load every Nth frame
82
+
83
+ PSD:
84
+ TRAIN_SCENES: "PSD_Train" # Training scene name (string or list)
85
+ VAL_SCENES: "PSD_Val" # Validation scene name (string or list)
86
+ RESIZE_MODE: "resize+crop" # Image resizing mode
87
+ SAMPLE_EVERY_N: 1 # Load every Nth frame (1 = all frames)
88
+
89
+ SCARED:
90
+ VAL_SCENES: ["v22","v23","v24","v25","v26","v27","v28","v29","v30","v31","v32","v33","v34"] # Validation scene names
91
+ RESIZE_MODE: "resize+crop" # Image resizing mode
92
+ SAMPLE_EVERY_N: 8 # Load every Nth frame
93
+
94
+ STEREOMIS_TRACKING:
95
+ VAL_SCENES: ["P2_2"] # Validation scene names
96
+ RESIZE_MODE: "resize+crop" # Image resizing mode
97
+ SAMPLE_EVERY_N: 2 # Load every Nth frame
98
+
99
+ CHOLEC80:
100
+ TRAIN_SCENES: ["train"] # Training scene names
101
+ VAL_SCENES: ["test"] # Validation scene names
102
+ RESIZE_MODE: "resize+crop" # Image resizing mode
103
+ SAMPLE_EVERY_N: 40 # Load every Nth frame
104
+
105
+ SUNRGBD:
106
+ VAL_SCENES: ["realsense"] # Validation scene names
107
+ RESIZE_MODE: "resize+crop" # Image resizing mode
108
+ SAMPLE_EVERY_N: 4 # Load every Nth frame
109
+
110
+ SCANNET:
111
+ TRAIN_SCENES: ["train"]
112
+ VAL_SCENES: ["val"]
113
+ RESIZE_MODE: "resize+crop"
114
+ SAMPLE_EVERY_N: 5
115
+
116
+ OPENIMAGESV7:
117
+ TRAIN_SCENES: ["thescene"]
118
+ # VAL_SCENES: [""]
119
+ RESIZE_MODE: "resize+crop"
120
+ SAMPLE_EVERY_N: 5
121
+
122
+ ENDOSYNTH:
123
+ TRAIN_SCENES: ["scene"]
124
+ # VAL_SCENES: ["val"]
125
+ RESIZE_MODE: "resize+crop"
126
+ SAMPLE_EVERY_N: 1
127
+
128
+ ALL_DATASETS:
129
+ FEW_IMAGES: False # Override FEW_IMAGES for all datasets (for quick debugging set True)
130
+ TARGET_SIZE: [896,896] # Override target image size [height, width] for all datasets
131
+ LOAD_RGB_ONLY: True
132
+
133
+ BATCH_SIZE: # Max batch size with img size 896 is 32
134
+ value: 6 # Number of samples per batch (adjust based on GPU memory)
135
+ NUM_WORKERS:
136
+ value: 12 # Number of data loading worker processes (0 = main process only, "auto" = 90% of CPU affinity)
137
+ SHUFFLE:
138
+ value: True # Shuffle training data each epoch (False for validation/test)
139
+ PIN_MEMORY:
140
+ value: True # Pin memory in DataLoader for faster GPU transfer (recommended: True)
141
+ PREFETCH_FACTOR:
142
+ value: 2 # Number of batches to prefetch per worker (higher = more memory usage)
143
+
144
+ ### HIGHLIGHTS
145
+ MOGE_MODEL:
146
+ value: "Ruicheng/moge-2-vits-normal" # MoGe model name for normal estimation (HuggingFace format)
147
+ SURFACE_ROUGHNESS:
148
+ value: 8.0 # Blinn-Phong surface roughness exponent (higher = sharper highlights)
149
+ INTENSITY:
150
+ value: 2.0 # Specular highlight intensity multiplier
151
+ LIGHT_DISTANCE_RANGE:
152
+ value: [0.0, 1] # Range for light source distance sampling [min, max] (normalized)
153
+ LIGHT_LEFT_RIGHT_ANGLE:
154
+ value: [0, 360] # Range for light source horizontal angle [min, max] in degrees
155
+ LIGHT_ABOVE_BELOW_ANGLE:
156
+ value: [0, 360] # Range for light source vertical angle [min, max] in degrees
157
+ DATASET_HIGHLIGHT_DILATION:
158
+ value: 25 # Dilation kernel size (pixels) for dataset highlight masks
159
+ DATASET_HIGHLIGHT_THRESHOLD:
160
+ value: 0.85 # Brightness/luminance threshold (0-1) for detecting highlights in dataset images
161
+ DATASET_HIGHLIGHT_USE_LUMINANCE:
162
+ value: True # If True, use perceptually-weighted luminance (0.299*R + 0.587*G + 0.114*B) for dataset highlights; if False, use simple mean brightness
163
+ HIGHLIGHT_COLOR:
164
+ value: [1.0, 1.0, 1.0] # RGB color for synthetic highlights (normalized 0-1)
165
+ CLAMP_RECONSTRUCTION:
166
+ value: True # Clamp reconstructed images to [0, 1] range if True
167
+
168
+ ### OPTIMIZATION
169
+ LEARNING_RATE:
170
+ value: 1.0e-3 # Base learning rate for optimizer # DEPRECATED. SETTING LR FOR EACH MODULE ABOVE
171
+ WEIGHT_DECAY:
172
+ value: 0.0 # L2 regularization weight (0.0 = no weight decay)
173
+ EPOCHS:
174
+ value: 10 # Maximum number of training epochs
175
+ GRADIENT_ACCUMULATION_STEPS:
176
+ value: 1 # Number of steps to accumulate gradients before optimizer step (1 = no accumulation)
177
+ WARMUP:
178
+ value: 100 # Number of warmup steps for learning rate schedule (linear warmup from 0 to LR)
179
+ GRADIENT_CLIPPING_MAX_NORM:
180
+ value: 8 # Maximum gradient norm for clipping (set to -1 to disable clipping)
181
+ LR_SCHEDULER:
182
+ value:
183
+ ONPLATEAU: # ReduceLROnPlateau scheduler (reduces LR when validation metric plateaus)
184
+ PATIENCE: 5 # Number of epochs to wait before reducing LR
185
+ FACTOR: 0.1 # Factor by which LR is reduced (new_lr = old_lr * factor)
186
+ COSINE: # CosineAnnealingLR scheduler (cosine annealing schedule)
187
+ N_PERIODS: 0.5 # Number of cosine periods over training
188
+ # STEPWISE: # StepLR scheduler (reduces LR at fixed step intervals)
189
+ # N_STEPS: 4 # Number of times to reduce LR during training
190
+ # GAMMA: 0.5 # Factor by which LR is reduced at each step (new_lr = old_lr * gamma)
191
+ # EXPONENTIAL: # ExponentialLR scheduler (exponential decay)
192
+ # GAMMA: 0.5 # Multiplicative factor for exponential decay
193
+
194
+ OPTIMIZER_BOOTSTRAP_NAME:
195
+ value: "AdamW" # Optimizer name for initial training phase ("Adam", "SGD", etc.)
196
+ EARLY_STOPPING_PATIENCE:
197
+ value: 10 # Number of epochs without improvement before stopping training
198
+ SAVE_INTERVAL:
199
+ value: 1000 # Number of training steps between model checkpoints
200
+
201
+ DATASET_HIGHLIGHT_SUPERVISION_THRESHOLD:
202
+ value: 0.1 # Pixel highlights above this threshold (should be low) are excluded from supervision
203
+
204
+ ### LOSS WEIGHTS (relative to the total loss, NOT NORMALIZED LATER)
205
+ SPECULAR_LOSS_WEIGHT:
206
+ value: 0.0 # Weight for specular component reconstruction loss
207
+ DIFFUSE_LOSS_WEIGHT:
208
+ value: 1.0 # Weight for diffuse component reconstruction loss
209
+ HIGHLIGHT_LOSS_WEIGHT:
210
+ value: 1.0 # Weight for highlight mask regression loss
211
+ TOKEN_INPAINT_LOSS_WEIGHT:
212
+ value: 1.0 # Weight for token-space inpainting loss (L1 + cosine similarity in feature space)
213
+ IMAGE_RECONSTRUCTION_LOSS_WEIGHT:
214
+ value: 0.0 # Weight for full image reconstruction loss
215
+
216
+ HLREG_W_L1:
217
+ value: 1.0 # Weight for L1 loss in highlight regression
218
+ HLREG_USE_CHARB:
219
+ value: True # Use Charbonnier loss (smooth L1) instead of standard L1 if True
220
+ HLREG_W_DICE:
221
+ value: 0.2 # Weight for Dice loss in highlight regression (for mask overlap)
222
+ HLREG_W_SSIM:
223
+ value: 0.0 # Weight for SSIM loss in highlight regression
224
+ HLREG_W_GRAD:
225
+ value: 0.0 # Weight for gradient loss in highlight regression
226
+ HLREG_W_TV:
227
+ value: 0.0 # Weight for total variation loss in highlight regression
228
+ HLREG_BALANCE_MODE:
229
+ value: "auto" # Class balancing mode for highlight regression: 'none' | 'auto' | 'pos_weight'
230
+ HLREG_POS_WEIGHT:
231
+ value: 1.0 # Positive class weight (used only if BALANCE_MODE == 'pos_weight')
232
+ HLREG_FOCAL_GAMMA:
233
+ value: 2.0 # Focal loss gamma parameter (0.0 = standard BCE, 1.0-2.0 helps with gradient vanishing)
234
+
235
+ WEIGHT_SEAM:
236
+ value: 0.5 # Weight for gradient matching loss on saturation ring
237
+ RING_DILATE_KERNEL:
238
+ value: 17 # Dilation kernel size (odd number) for creating ring mask around highlights
239
+ SEAM_USE_CHARB:
240
+ value: True # Use Charbonnier loss instead of L1 in seam loss (smooth L1 for boundary consistency)
241
+ SEAM_WEIGHT_GRAD:
242
+ value: 0.0 # Weight for gradient matching term inside seam loss (0.0 = disable gradient term)
243
+ TOKEN_FEAT_ALPHA:
244
+ value: 0.5 # Mixing factor for token feature loss: alpha * L1 + (1-alpha) * (1-cosine_sim)
245
+
246
+ ### DIFFUSE HIGHLIGHT PENALTY
247
+ WEIGHT_DIFFUSE_HIGHLIGHT_PENALTY:
248
+ value: 0.75 # Weight for penalty loss on highlights in diffuse decoder output (0.0 = disabled)
249
+ DIFFUSE_HL_THRESHOLD:
250
+ value: 0.8 # Brightness/luminance threshold for detecting highlights in diffuse (0.0-1.0)
251
+ DIFFUSE_HL_USE_CHARB:
252
+ value: True # Use Charbonnier loss instead of L1 for diffuse highlight penalty
253
+ DIFFUSE_HL_PENALTY_MODE:
254
+ value: "brightness" # Penalty mode: "brightness" (penalize brightness/luminance above threshold) or "pixel" (penalize RGB values directly)
255
+ DIFFUSE_HL_TARGET_BRIGHTNESS:
256
+ value: null # Target brightness/luminance for penalized pixels (null = use threshold value)
257
+ DIFFUSE_HL_USE_LUMINANCE:
258
+ value: False # If True, use perceptually-weighted luminance (0.299*R + 0.587*G + 0.114*B); if False, use simple mean brightness
259
+
260
+ ### LOGGING, RESULTS AND WANDB
261
+ LOG_INTERVAL:
262
+ value: 1 # Number of training steps between console log outputs
263
+ WANDB_LOG_INTERVAL:
264
+ value: 1 # Number of training steps between WandB metric logs
265
+ IMAGE_LOG_INTERVAL:
266
+ value: 5 # Number of training steps between image logging to WandB
267
+ NO_WANDB:
268
+ value: False # Disable WandB logging if True (useful for local debugging)
269
+ MODEL_WATCHER_FREQ_WANDB:
270
+ value: 50 # Frequency (in steps) for logging model parameter histograms to WandB
271
+ WANDB_ENTITY:
272
+ value: "unreflect-anything" # WandB organization/entity name
273
+ WANDB_PROJECT:
274
+ value: "UnReflectAnything" # WandB project name
275
+ NOTES:
276
+ value: "LOCAL_PRIOR_WEIGHT 0.9" # Notes/description for this training run
configs/highlight_decoder_pretrain.yaml ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### BASELINE: CONVERGES AFTER LONG
2
+
3
+ parameters:
4
+
5
+ ### MODEL ARCHITECTURE
6
+ MODEL:
7
+ value:
8
+ MODEL_CLASS: "UnReflect_Model" # Main model class name (must match class in models.py) # <<<<<<<<< DECODER PRETRAINING: NOT USING TOKEN INPAINTER (DIRECT FROM DINO)
9
+ MODEL_MODULE: "models" # Module name to import model classes from (default: "models")
10
+ RGB_ENCODER:
11
+ ENCODER: "facebook/dinov3-vitl16-pretrain-lvd1689m" # DINOv3 encoder model name (HuggingFace format)
12
+ IMAGE_SIZE: 896 # Input image size (height and width in pixels)
13
+ RETURN_SELECTED_LAYERS: [3, 6, 9, 12] # Transformer layer indices to extract features from (0-indexed)
14
+ RGB_ENCODER_LR: 0.0 # Learning rate for RGB encoder (0.0 = frozen, must be explicitly set)
15
+ DECODERS:
16
+ highlight:
17
+ FEATURE_DIM: 1024 # Feature dimension for highlight decoder
18
+ REASSEMBLE_OUT_CHANNELS: [96,192,384,768] # Output channels for each decoder stage
19
+ REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
20
+ READOUT_TYPE: "ignore" # Readout type for DPT decoder
21
+ # FROM_PRETRAINED: "highlight_decoder.pt" # Path to pretrained highlight decoder weights (optional)
22
+ USE_BN: False # Use batch normalization in decoder
23
+ DROPOUT: 0.1 # Dropout rate in decoder layers
24
+ OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
25
+ OUTPUT_CHANNELS: 1 # Number of output channels (1 for highlight mask)
26
+ DECODER_LR: 1.0e-6 # Custom learning rate for decoder (0.0 = frozen, 1.0 = same as base LR)
27
+ NUM_FUSION_BLOCKS_TRAINABLE: null # Number of fusion blocks to train (0-4, null = train all if DECODER_LR != 0)
28
+
29
+ TOKEN_INPAINTER: # <<<<<<<<<<<< DOESN'T MATTER, MODEL CLASS IS NOT TOKEN INPAINTER
30
+ TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
31
+ TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
32
+ FROM_PRETRAINED: "token_inpainter.pth" # Path to pretrained token inpainter weights (optional)
33
+ TOKEN_INPAINTER_LR: 1.0e-5 # Learning rate for token inpainter (can differ from base LR)
34
+ DEPTH: 6 # Number of transformer blocks
35
+ HEADS: 16 # Number of attention heads
36
+ DROP: 0 # Dropout rate
37
+ USE_POSITIONAL_ENCODING: True # Enable 2D sinusoidal positional encodings
38
+ USE_FINAL_NORM: True # Enable final LayerNorm before output projection
39
+ USE_LOCAL_PRIOR: True # Blend local mean prior for masked seeds
40
+ LOCAL_PRIOR_WEIGHT: 0.5 # Weight for local prior blending (1.0 = only mask_token, 0.0 = only local mean)
41
+ LOCAL_PRIOR_KERNEL: 5 # Kernel size for local prior blending (> 1)
42
+ SEED_NOISE_STD: 0.02 # Standard deviation of noise added to masked seeds during training
43
+ INPAINT_MASK_DILATION:
44
+ value: 3 # Dilation kernel size (pixels) for inpaint mask - Must be odd
45
+ USE_TORCH_COMPILE: # Enable PyTorch 2.0 torch.compile for faster training (experimental)
46
+ value: False
47
+ DISTRIBUTE:
48
+ value: "ddp"
49
+
50
+ ### DATA
51
+ DATASETS:
52
+ value:
53
+ SHIQ:
54
+ VAL_SCENES: ["test"]
55
+ RESIZE_MODE: "resize+crop"
56
+ TARGET_SIZE: [896,896]
57
+ SAMPLE_EVERY_N: 2
58
+
59
+ SCARED:
60
+ VAL_SCENES: ["v22","v23","v24","v25","v26","v27","v28","v29","v30","v31","v32","v33","v34"] # Validation scene names
61
+ RESIZE_MODE: "resize+crop" # Image resizing mode
62
+ SAMPLE_EVERY_N: 4 # Load every Nth frame
63
+
64
+ ALL_DATASETS:
65
+ FEW_IMAGES: False
66
+ TARGET_SIZE: [896,896]
67
+ LOAD_RGB_ONLY: True
68
+ LOAD_HIGHLIGHT: True
69
+
70
+
71
+ BATCH_SIZE: # Max batch size with img size 896 is 32
72
+ value: 20 # Number of samples per batch (adjust based on GPU memory)
73
+ NUM_WORKERS:
74
+ value: 12 # Number of data loading worker processes (0 = main process only, "auto" = 90% of CPU affinity)
75
+ SHUFFLE:
76
+ value: True # Shuffle training data each epoch (False for validation/test)
77
+ PIN_MEMORY:
78
+ value: True # Pin memory in DataLoader for faster GPU transfer (recommended: True)
79
+ PREFETCH_FACTOR:
80
+ value: 2 # Number of batches to prefetch per worker (higher = more memory usage)
81
+
82
+ ### HIGHLIGHTS
83
+ MOGE_MODEL:
84
+ value: "Ruicheng/moge-2-vits-normal" # MoGe model name for normal estimation (HuggingFace format)
85
+ SURFACE_ROUGHNESS:
86
+ value: 100.0 # Blinn-Phong surface roughness exponent (higher = sharper highlights)
87
+ INTENSITY:
88
+ value: 0.8 # Specular highlight intensity multiplier
89
+ LIGHT_DISTANCE_RANGE:
90
+ value: [0.0, 1] # Range for light source distance sampling [min, max] (normalized)
91
+ LIGHT_LEFT_RIGHT_ANGLE:
92
+ value: [0, 360] # Range for light source horizontal angle [min, max] in degrees
93
+ LIGHT_ABOVE_BELOW_ANGLE:
94
+ value: [0, 360] # Range for light source vertical angle [min, max] in degrees
95
+ DATASET_HIGHLIGHT_DILATION:
96
+ value: 25 # Dilation kernel size (pixels) for dataset highlight masks
97
+ DATASET_HIGHLIGHT_THRESHOLD:
98
+ value: 0.9 # Brightness/luminance threshold (0-1) for detecting highlights in dataset images
99
+ DATASET_HIGHLIGHT_USE_LUMINANCE:
100
+ value: True # If True, use perceptually-weighted luminance (0.299*R + 0.587*G + 0.114*B) for dataset highlights; if False, use simple mean brightness
101
+ HIGHLIGHT_COLOR:
102
+ value: [1.0, 1.0, 1.0] # RGB color for synthetic highlights (normalized 0-1)
103
+ CLAMP_RECONSTRUCTION:
104
+ value: True # Clamp reconstructed images to [0, 1] range if True
105
+
106
+ ### OPTIMIZATION
107
+ EPOCHS:
108
+ value: 20 # Maximum number of training epochs
109
+ LEARNING_RATE:
110
+ value: 1.0e-4 # Base learning rate for optimizer
111
+ WEIGHT_DECAY:
112
+ value: 0.0 # L2 regularization weight (0.0 = no weight decay)
113
+ GRADIENT_ACCUMULATION_STEPS:
114
+ value: 1 # Number of steps to accumulate gradients before optimizer step (1 = no accumulation)
115
+ WARMUP:
116
+ value: 100 # Number of warmup steps for learning rate schedule (linear warmup from 0 to LR)
117
+ GRADIENT_CLIPPING_MAX_NORM:
118
+ value: 8 # Maximum gradient norm for clipping (set to -1 to disable clipping)
119
+ LR_SCHEDULER:
120
+ value:
121
+ ONPLATEAU: # ReduceLROnPlateau scheduler (reduces LR when validation metric plateaus)
122
+ PATIENCE: 5 # Number of epochs to wait before reducing LR
123
+ FACTOR: 0.1 # Factor by which LR is reduced (new_lr = old_lr * factor)
124
+ COSINE: # CosineAnnealingLR scheduler (cosine annealing schedule)
125
+ N_PERIODS: 1 # Number of cosine periods over training
126
+ # STEPWISE: # StepLR scheduler (reduces LR at fixed step intervals)
127
+ # N_STEPS: 5 # Number of times to reduce LR during training
128
+ # GAMMA: 0.25 # Factor by which LR is reduced at each step (new_lr = old_lr * gamma)
129
+ # EXPONENTIAL: # ExponentialLR scheduler (exponential decay)
130
+ # GAMMA: 0.5 # Multiplicative factor for exponential decay
131
+
132
+ SWITCH_OPTIMIZER_EPOCH:
133
+ value: null # Epoch number to switch from bootstrap to refining optimizer (null = no switch)
134
+ OPTIMIZER_BOOTSTRAP_NAME:
135
+ value: "AdamW" # Optimizer name for initial training phase ("Adam", "SGD", etc.)
136
+ OPTIMIZER_REFINING_NAME:
137
+ value: "AdamW" # Optimizer name for refining phase (used after SWITCH_OPTIMIZER_EPOCH)
138
+ EARLY_STOPPING_PATIENCE:
139
+ value: 20 # Number of epochs without improvement before stopping training
140
+ SAVE_INTERVAL:
141
+ value: 1000 # Number of training steps between model checkpoints
142
+
143
+ DATASET_HIGHLIGHT_SUPERVISION_THRESHOLD:
144
+ value: 0.1 # Pixel highlights above this threshold (should be low) are excluded from supervision
145
+
146
+ ### LOSS WEIGHTS (relative to the total loss, NOT NORMALIZED LATER)
147
+ SPECULAR_LOSS_WEIGHT:
148
+ value: 0.0 # Weight for specular component reconstruction loss
149
+ DIFFUSE_LOSS_WEIGHT:
150
+ value: 0.0 # Weight for diffuse component reconstruction loss
151
+ HIGHLIGHT_LOSS_WEIGHT:
152
+ value: 1.0 # Weight for highlight mask regression loss
153
+ TOKEN_INPAINT_LOSS_WEIGHT:
154
+ value: 0.0 # Weight for token-space inpainting loss (L1 + cosine similarity in feature space)
155
+
156
+ IMAGE_RECONSTRUCTION_LOSS_WEIGHT:
157
+ value: 0.0 # Weight for full image reconstruction loss
158
+ SATURATION_RING_LOSS_WEIGHT:
159
+ value: 0.0 # Weight for saturation ring consistency loss (around highlight regions)
160
+ RING_KERNEL_SIZE:
161
+ value: 11 # Kernel size (odd number) for saturation ring dilation around highlights
162
+ RING_VAR_WEIGHT:
163
+ value: 0.5 # Weight for variance matching in saturation ring loss (vs mean matching)
164
+ RING_TEXTURE_WEIGHT:
165
+ value: 0.0 # Weight for texture consistency term in saturation ring loss
166
+ HLREG_W_L1:
167
+ value: 1.0 # Weight for L1 loss in highlight regression
168
+ HLREG_USE_CHARB:
169
+ value: True # Use Charbonnier loss (smooth L1) instead of standard L1 if True
170
+ HLREG_W_DICE:
171
+ value: 0.2 # Weight for Dice loss in highlight regression (for mask overlap)
172
+ HLREG_W_SSIM:
173
+ value: 0.0 # Weight for SSIM loss in highlight regression
174
+ HLREG_W_GRAD:
175
+ value: 0.0 # Weight for gradient loss in highlight regression
176
+ HLREG_W_TV:
177
+ value: 0.0 # Weight for total variation loss in highlight regression
178
+ HLREG_BALANCE_MODE:
179
+ value: "auto" # Class balancing mode for highlight regression: 'none' | 'auto' | 'pos_weight'
180
+ HLREG_POS_WEIGHT:
181
+ value: 1.0 # Positive class weight (used only if BALANCE_MODE == 'pos_weight')
182
+ HLREG_FOCAL_GAMMA:
183
+ value: 2.0 # Focal loss gamma parameter (0.0 = standard BCE, 1.0-2.0 helps with gradient vanishing)
184
+
185
+ WEIGHT_CONTEXT_IDENTITY:
186
+ value: 0.0 # LEAVE TO 0.0: Weight for L1 loss on context (non-masked) regions (identity preservation)
187
+ WEIGHT_TV_IN_HOLE:
188
+ value: 0.0 # LEAVE TO 0.0: Weight for total variation loss inside masked/hole regions
189
+ RING_DILATE_KERNEL:
190
+ value: 17 # Dilation kernel size (odd number) for creating ring mask around highlights
191
+ WEIGHT_SEAM:
192
+ value: 0.0 # Weight for gradient matching loss on saturation ring
193
+ SEAM_USE_CHARB:
194
+ value: True # Use Charbonnier loss instead of L1 in seam loss (smooth L1 for boundary consistency)
195
+ SEAM_WEIGHT_GRAD:
196
+ value: 0.0 # Weight for gradient matching term inside seam loss (0.0 = disable gradient term)
197
+ TOKEN_FEAT_ALPHA:
198
+ value: 0.5 # Mixing factor for token feature loss: alpha * L1 + (1-alpha) * (1-cosine_sim)
199
+
200
+ ### DIFFUSE HIGHLIGHT PENALTY
201
+ WEIGHT_DIFFUSE_HIGHLIGHT_PENALTY:
202
+ value: 0.0 # Weight for penalty loss on highlights in diffuse decoder output (0.0 = disabled)
203
+ DIFFUSE_HL_THRESHOLD:
204
+ value: 0.8 # Brightness/luminance threshold for detecting highlights in diffuse (0.0-1.0)
205
+ DIFFUSE_HL_USE_CHARB:
206
+ value: True # Use Charbonnier loss instead of L1 for diffuse highlight penalty
207
+ DIFFUSE_HL_PENALTY_MODE:
208
+ value: "brightness" # Penalty mode: "brightness" (penalize brightness/luminance above threshold) or "pixel" (penalize RGB values directly)
209
+ DIFFUSE_HL_TARGET_BRIGHTNESS:
210
+ value: null # Target brightness/luminance for penalized pixels (null = use threshold value)
211
+ DIFFUSE_HL_USE_LUMINANCE:
212
+ value: True # If True, use perceptually-weighted luminance (0.299*R + 0.587*G + 0.114*B); if False, use simple mean brightness
213
+
214
+ ### LOGGING, RESULTS AND WANDB
215
+ LOG_INTERVAL:
216
+ value: 1 # Number of training steps between console log outputs
217
+ WANDB_LOG_INTERVAL:
218
+ value: 1 # Number of training steps between WandB metric logs
219
+ IMAGE_LOG_INTERVAL:
220
+ value: 5 # Number of training steps between image logging to WandB
221
+ NO_WANDB:
222
+ value: False # Disable WandB logging if True (useful for local debugging)
223
+ MODEL_WATCHER_FREQ_WANDB:
224
+ value: 50 # Frequency (in steps) for logging model parameter histograms to WandB
225
+ WANDB_ENTITY:
226
+ value: "unreflect-anything" # WandB organization/entity name
227
+ WANDB_PROJECT:
228
+ value: "UnReflectAnything" # WandB project name
229
+ NOTES:
230
+ value: "" # Notes/description for this training run
231
+
configs/pretrained_config.yaml CHANGED
@@ -18,7 +18,7 @@ parameters:
18
  REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
19
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
20
  READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
21
- # FROM_PRETRAINED: "weights/diffuse_decoder.pt" # Path to pretrained decoder weights (optional)
22
  USE_BN: False # Use batch normalization in decoder
23
  DROPOUT: 0.1 # Dropout rate in decoder layers
24
  OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
@@ -40,7 +40,7 @@ parameters:
40
  TOKEN_INPAINTER:
41
  TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
42
  TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
43
- # FROM_PRETRAINED: "weights/token_inpainter.pth" # Path to pretrained token inpainter weights (optional)
44
  TOKEN_INPAINTER_LR: 1.0e-4 # Learning rate for token inpainter (can differ from base LR)
45
  DEPTH: 6 # Number of transformer blocks
46
  HEADS: 16 # Number of attention heads
 
18
  REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
19
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
20
  READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
21
+ # FROM_PRETRAINED: "diffuse_decoder.pt" # Path to pretrained decoder weights (optional)
22
  USE_BN: False # Use batch normalization in decoder
23
  DROPOUT: 0.1 # Dropout rate in decoder layers
24
  OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
 
40
  TOKEN_INPAINTER:
41
  TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
42
  TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
43
+ # FROM_PRETRAINED: "token_inpainter.pth" # Path to pretrained token inpainter weights (optional)
44
  TOKEN_INPAINTER_LR: 1.0e-4 # Learning rate for token inpainter (can differ from base LR)
45
  DEPTH: 6 # Number of transformer blocks
46
  HEADS: 16 # Number of attention heads
configs/rebuttal/ablate_DWConv.yaml CHANGED
@@ -19,7 +19,7 @@ parameters:
19
  REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
20
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
21
  READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
22
- FROM_PRETRAINED: "weights/rgb_decoder.pth" # Path to pretrained decoder weights (optional)
23
  USE_BN: False # Use batch normalization in decoder
24
  DROPOUT: 0.1 # Dropout rate in decoder layers
25
  OUTPUT_IMAGE_SIZE: [448,448] # Output image resolution [height, width]
@@ -42,7 +42,7 @@ parameters:
42
  TOKEN_INPAINTER:
43
  TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
44
  TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
45
- FROM_PRETRAINED: "weights/token_inpainter.pth" # Path to pretrained token inpainter weights
46
  TOKEN_INPAINTER_LR: 1.0e-5 # Learning rate for token inpainter (can differ from base LR)
47
  DEPTH: 6 # Number of transformer blocks
48
  HEADS: 16 # Number of attention heads
 
19
  REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
20
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
21
  READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
22
+ FROM_PRETRAINED: "rgb_decoder.pth" # Path to pretrained decoder weights (optional)
23
  USE_BN: False # Use batch normalization in decoder
24
  DROPOUT: 0.1 # Dropout rate in decoder layers
25
  OUTPUT_IMAGE_SIZE: [448,448] # Output image resolution [height, width]
 
42
  TOKEN_INPAINTER:
43
  TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
44
  TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
45
+ FROM_PRETRAINED: "token_inpainter.pth" # Path to pretrained token inpainter weights
46
  TOKEN_INPAINTER_LR: 1.0e-5 # Learning rate for token inpainter (can differ from base LR)
47
  DEPTH: 6 # Number of transformer blocks
48
  HEADS: 16 # Number of attention heads
configs/rebuttal/ablate_Dice.yaml CHANGED
@@ -19,7 +19,7 @@ parameters:
19
  REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
20
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
21
  READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
22
- FROM_PRETRAINED: "weights/rgb_decoder.pth" # Path to pretrained decoder weights (optional)
23
  USE_BN: False # Use batch normalization in decoder
24
  DROPOUT: 0.1 # Dropout rate in decoder layers
25
  OUTPUT_IMAGE_SIZE: [448,448] # Output image resolution [height, width]
@@ -42,7 +42,7 @@ parameters:
42
  TOKEN_INPAINTER:
43
  TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
44
  TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
45
- FROM_PRETRAINED: "weights/token_inpainter.pth" # Path to pretrained token inpainter weights
46
  TOKEN_INPAINTER_LR: 1.0e-5 # Learning rate for token inpainter (can differ from base LR)
47
  DEPTH: 6 # Number of transformer blocks
48
  HEADS: 16 # Number of attention heads
 
19
  REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
20
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
21
  READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
22
+ FROM_PRETRAINED: "rgb_decoder.pth" # Path to pretrained decoder weights (optional)
23
  USE_BN: False # Use batch normalization in decoder
24
  DROPOUT: 0.1 # Dropout rate in decoder layers
25
  OUTPUT_IMAGE_SIZE: [448,448] # Output image resolution [height, width]
 
42
  TOKEN_INPAINTER:
43
  TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
44
  TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
45
+ FROM_PRETRAINED: "token_inpainter.pth" # Path to pretrained token inpainter weights
46
  TOKEN_INPAINTER_LR: 1.0e-5 # Learning rate for token inpainter (can differ from base LR)
47
  DEPTH: 6 # Number of transformer blocks
48
  HEADS: 16 # Number of attention heads
configs/rebuttal/ablate_L1.yaml CHANGED
@@ -19,7 +19,7 @@ parameters:
19
  REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
20
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
21
  READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
22
- FROM_PRETRAINED: "weights/rgb_decoder.pth" # Path to pretrained decoder weights (optional)
23
  USE_BN: False # Use batch normalization in decoder
24
  DROPOUT: 0.1 # Dropout rate in decoder layers
25
  OUTPUT_IMAGE_SIZE: [448,448] # Output image resolution [height, width]
@@ -42,7 +42,7 @@ parameters:
42
  TOKEN_INPAINTER:
43
  TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
44
  TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
45
- FROM_PRETRAINED: "weights/token_inpainter.pth" # Path to pretrained token inpainter weights
46
  TOKEN_INPAINTER_LR: 1.0e-5 # Learning rate for token inpainter (can differ from base LR)
47
  DEPTH: 6 # Number of transformer blocks
48
  HEADS: 16 # Number of attention heads
 
19
  REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
20
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
21
  READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
22
+ FROM_PRETRAINED: "rgb_decoder.pth" # Path to pretrained decoder weights (optional)
23
  USE_BN: False # Use batch normalization in decoder
24
  DROPOUT: 0.1 # Dropout rate in decoder layers
25
  OUTPUT_IMAGE_SIZE: [448,448] # Output image resolution [height, width]
 
42
  TOKEN_INPAINTER:
43
  TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
44
  TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
45
+ FROM_PRETRAINED: "token_inpainter.pth" # Path to pretrained token inpainter weights
46
  TOKEN_INPAINTER_LR: 1.0e-5 # Learning rate for token inpainter (can differ from base LR)
47
  DEPTH: 6 # Number of transformer blocks
48
  HEADS: 16 # Number of attention heads
configs/rebuttal/ablate_LMasktoken.yaml CHANGED
@@ -19,7 +19,7 @@ parameters:
19
  REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
20
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
21
  READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
22
- FROM_PRETRAINED: "weights/rgb_decoder.pth" # Path to pretrained decoder weights (optional)
23
  USE_BN: False # Use batch normalization in decoder
24
  DROPOUT: 0.1 # Dropout rate in decoder layers
25
  OUTPUT_IMAGE_SIZE: [448,448] # Output image resolution [height, width]
@@ -42,7 +42,7 @@ parameters:
42
  TOKEN_INPAINTER:
43
  TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
44
  TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
45
- FROM_PRETRAINED: "weights/token_inpainter.pth" # Path to pretrained token inpainter weights
46
  TOKEN_INPAINTER_LR: 1.0e-5 # Learning rate for token inpainter (can differ from base LR)
47
  DEPTH: 6 # Number of transformer blocks
48
  HEADS: 16 # Number of attention heads
 
19
  REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
20
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
21
  READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
22
+ FROM_PRETRAINED: "rgb_decoder.pth" # Path to pretrained decoder weights (optional)
23
  USE_BN: False # Use batch normalization in decoder
24
  DROPOUT: 0.1 # Dropout rate in decoder layers
25
  OUTPUT_IMAGE_SIZE: [448,448] # Output image resolution [height, width]
 
42
  TOKEN_INPAINTER:
43
  TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
44
  TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
45
+ FROM_PRETRAINED: "token_inpainter.pth" # Path to pretrained token inpainter weights
46
  TOKEN_INPAINTER_LR: 1.0e-5 # Learning rate for token inpainter (can differ from base LR)
47
  DEPTH: 6 # Number of transformer blocks
48
  HEADS: 16 # Number of attention heads
configs/rebuttal/ablate_PosEnc.yaml CHANGED
@@ -19,7 +19,7 @@ parameters:
19
  REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
20
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
21
  READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
22
- FROM_PRETRAINED: "weights/rgb_decoder.pth" # Path to pretrained decoder weights (optional)
23
  USE_BN: False # Use batch normalization in decoder
24
  DROPOUT: 0.1 # Dropout rate in decoder layers
25
  OUTPUT_IMAGE_SIZE: [448,448] # Output image resolution [height, width]
@@ -42,7 +42,7 @@ parameters:
42
  TOKEN_INPAINTER:
43
  TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
44
  TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
45
- FROM_PRETRAINED: "weights/token_inpainter.pth" # Path to pretrained token inpainter weights
46
  TOKEN_INPAINTER_LR: 1.0e-5 # Learning rate for token inpainter (can differ from base LR)
47
  DEPTH: 6 # Number of transformer blocks
48
  HEADS: 16 # Number of attention heads
 
19
  REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
20
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
21
  READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
22
+ FROM_PRETRAINED: "rgb_decoder.pth" # Path to pretrained decoder weights (optional)
23
  USE_BN: False # Use batch normalization in decoder
24
  DROPOUT: 0.1 # Dropout rate in decoder layers
25
  OUTPUT_IMAGE_SIZE: [448,448] # Output image resolution [height, width]
 
42
  TOKEN_INPAINTER:
43
  TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
44
  TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
45
+ FROM_PRETRAINED: "token_inpainter.pth" # Path to pretrained token inpainter weights
46
  TOKEN_INPAINTER_LR: 1.0e-5 # Learning rate for token inpainter (can differ from base LR)
47
  DEPTH: 6 # Number of transformer blocks
48
  HEADS: 16 # Number of attention heads
configs/rebuttal/ablate_RGB.yaml CHANGED
@@ -19,7 +19,7 @@ parameters:
19
  REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
20
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
21
  READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
22
- FROM_PRETRAINED: "weights/rgb_decoder.pth" # Path to pretrained decoder weights (optional)
23
  USE_BN: False # Use batch normalization in decoder
24
  DROPOUT: 0.1 # Dropout rate in decoder layers
25
  OUTPUT_IMAGE_SIZE: [448,448] # Output image resolution [height, width]
@@ -42,7 +42,7 @@ parameters:
42
  TOKEN_INPAINTER:
43
  TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
44
  TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
45
- FROM_PRETRAINED: "weights/token_inpainter.pth" # Path to pretrained token inpainter weights
46
  TOKEN_INPAINTER_LR: 1.0e-5 # Learning rate for token inpainter (can differ from base LR)
47
  DEPTH: 6 # Number of transformer blocks
48
  HEADS: 16 # Number of attention heads
 
19
  REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
20
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
21
  READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
22
+ FROM_PRETRAINED: "rgb_decoder.pth" # Path to pretrained decoder weights (optional)
23
  USE_BN: False # Use batch normalization in decoder
24
  DROPOUT: 0.1 # Dropout rate in decoder layers
25
  OUTPUT_IMAGE_SIZE: [448,448] # Output image resolution [height, width]
 
42
  TOKEN_INPAINTER:
43
  TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
44
  TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
45
+ FROM_PRETRAINED: "token_inpainter.pth" # Path to pretrained token inpainter weights
46
  TOKEN_INPAINTER_LR: 1.0e-5 # Learning rate for token inpainter (can differ from base LR)
47
  DEPTH: 6 # Number of transformer blocks
48
  HEADS: 16 # Number of attention heads
configs/rebuttal/ablate_Seam.yaml CHANGED
@@ -19,7 +19,7 @@ parameters:
19
  REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
20
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
21
  READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
22
- FROM_PRETRAINED: "weights/rgb_decoder.pth" # Path to pretrained decoder weights (optional)
23
  USE_BN: False # Use batch normalization in decoder
24
  DROPOUT: 0.1 # Dropout rate in decoder layers
25
  OUTPUT_IMAGE_SIZE: [448,448] # Output image resolution [height, width]
@@ -42,7 +42,7 @@ parameters:
42
  TOKEN_INPAINTER:
43
  TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
44
  TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
45
- FROM_PRETRAINED: "weights/token_inpainter.pth" # Path to pretrained token inpainter weights
46
  TOKEN_INPAINTER_LR: 1.0e-5 # Learning rate for token inpainter (can differ from base LR)
47
  DEPTH: 6 # Number of transformer blocks
48
  HEADS: 16 # Number of attention heads
 
19
  REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
20
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
21
  READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
22
+ FROM_PRETRAINED: "rgb_decoder.pth" # Path to pretrained decoder weights (optional)
23
  USE_BN: False # Use batch normalization in decoder
24
  DROPOUT: 0.1 # Dropout rate in decoder layers
25
  OUTPUT_IMAGE_SIZE: [448,448] # Output image resolution [height, width]
 
42
  TOKEN_INPAINTER:
43
  TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
44
  TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
45
+ FROM_PRETRAINED: "token_inpainter.pth" # Path to pretrained token inpainter weights
46
  TOKEN_INPAINTER_LR: 1.0e-5 # Learning rate for token inpainter (can differ from base LR)
47
  DEPTH: 6 # Number of transformer blocks
48
  HEADS: 16 # Number of attention heads
configs/rebuttal/ablate_SoftTHR.yaml CHANGED
@@ -19,7 +19,7 @@ parameters:
19
  REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
20
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
21
  READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
22
- FROM_PRETRAINED: "weights/rgb_decoder.pth" # Path to pretrained decoder weights (optional)
23
  USE_BN: False # Use batch normalization in decoder
24
  DROPOUT: 0.1 # Dropout rate in decoder layers
25
  OUTPUT_IMAGE_SIZE: [448,448] # Output image resolution [height, width]
@@ -42,7 +42,7 @@ parameters:
42
  TOKEN_INPAINTER:
43
  TOKEN_INPAINTER_CLASS: "TokenInpainter_Blended" # Token inpainter class name
44
  TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
45
- FROM_PRETRAINED: "weights/token_inpainter.pth" # Path to pretrained token inpainter weights
46
  TOKEN_INPAINTER_LR: 1.0e-5 # Learning rate for token inpainter (can differ from base LR)
47
  DEPTH: 6 # Number of transformer blocks
48
  HEADS: 16 # Number of attention heads
 
19
  REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
20
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
21
  READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
22
+ FROM_PRETRAINED: "rgb_decoder.pth" # Path to pretrained decoder weights (optional)
23
  USE_BN: False # Use batch normalization in decoder
24
  DROPOUT: 0.1 # Dropout rate in decoder layers
25
  OUTPUT_IMAGE_SIZE: [448,448] # Output image resolution [height, width]
 
42
  TOKEN_INPAINTER:
43
  TOKEN_INPAINTER_CLASS: "TokenInpainter_Blended" # Token inpainter class name
44
  TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
45
+ FROM_PRETRAINED: "token_inpainter.pth" # Path to pretrained token inpainter weights
46
  TOKEN_INPAINTER_LR: 1.0e-5 # Learning rate for token inpainter (can differ from base LR)
47
  DEPTH: 6 # Number of transformer blocks
48
  HEADS: 16 # Number of attention heads
configs/rebuttal/ablate_Spec.yaml CHANGED
@@ -19,7 +19,7 @@ parameters:
19
  REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
20
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
21
  READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
22
- FROM_PRETRAINED: "weights/rgb_decoder.pth" # Path to pretrained decoder weights (optional)
23
  USE_BN: False # Use batch normalization in decoder
24
  DROPOUT: 0.1 # Dropout rate in decoder layers
25
  OUTPUT_IMAGE_SIZE: [448,448] # Output image resolution [height, width]
@@ -42,7 +42,7 @@ parameters:
42
  TOKEN_INPAINTER:
43
  TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
44
  TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
45
- FROM_PRETRAINED: "weights/token_inpainter.pth" # Path to pretrained token inpainter weights
46
  TOKEN_INPAINTER_LR: 1.0e-5 # Learning rate for token inpainter (can differ from base LR)
47
  DEPTH: 6 # Number of transformer blocks
48
  HEADS: 16 # Number of attention heads
 
19
  REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
20
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
21
  READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
22
+ FROM_PRETRAINED: "rgb_decoder.pth" # Path to pretrained decoder weights (optional)
23
  USE_BN: False # Use batch normalization in decoder
24
  DROPOUT: 0.1 # Dropout rate in decoder layers
25
  OUTPUT_IMAGE_SIZE: [448,448] # Output image resolution [height, width]
 
42
  TOKEN_INPAINTER:
43
  TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
44
  TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
45
+ FROM_PRETRAINED: "token_inpainter.pth" # Path to pretrained token inpainter weights
46
  TOKEN_INPAINTER_LR: 1.0e-5 # Learning rate for token inpainter (can differ from base LR)
47
  DEPTH: 6 # Number of transformer blocks
48
  HEADS: 16 # Number of attention heads
configs/rebuttal/ablate_TV.yaml CHANGED
@@ -19,7 +19,7 @@ parameters:
19
  REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
20
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
21
  READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
22
- FROM_PRETRAINED: "weights/rgb_decoder.pth" # Path to pretrained decoder weights (optional)
23
  USE_BN: False # Use batch normalization in decoder
24
  DROPOUT: 0.1 # Dropout rate in decoder layers
25
  OUTPUT_IMAGE_SIZE: [448,448] # Output image resolution [height, width]
@@ -42,7 +42,7 @@ parameters:
42
  TOKEN_INPAINTER:
43
  TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
44
  TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
45
- FROM_PRETRAINED: "weights/token_inpainter.pth" # Path to pretrained token inpainter weights
46
  TOKEN_INPAINTER_LR: 1.0e-5 # Learning rate for token inpainter (can differ from base LR)
47
  DEPTH: 6 # Number of transformer blocks
48
  HEADS: 16 # Number of attention heads
 
19
  REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
20
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
21
  READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
22
+ FROM_PRETRAINED: "rgb_decoder.pth" # Path to pretrained decoder weights (optional)
23
  USE_BN: False # Use batch normalization in decoder
24
  DROPOUT: 0.1 # Dropout rate in decoder layers
25
  OUTPUT_IMAGE_SIZE: [448,448] # Output image resolution [height, width]
 
42
  TOKEN_INPAINTER:
43
  TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
44
  TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
45
+ FROM_PRETRAINED: "token_inpainter.pth" # Path to pretrained token inpainter weights
46
  TOKEN_INPAINTER_LR: 1.0e-5 # Learning rate for token inpainter (can differ from base LR)
47
  DEPTH: 6 # Number of transformer blocks
48
  HEADS: 16 # Number of attention heads
configs/{tokeninp_preatrain.yaml → sweeps/tokeninp_pretrain.yaml} RENAMED
@@ -1,3 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
1
  ### BASELINE: CONVERGES AFTER LONG
2
 
3
  parameters:
@@ -18,7 +29,7 @@ parameters:
18
  REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
19
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
20
  READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
21
- FROM_PRETRAINED: "weights/diffuse_decoder.pt" # Path to pretrained decoder weights (optional)
22
  USE_BN: False # Use batch normalization in decoder
23
  DROPOUT: 0.1 # Dropout rate in decoder layers
24
  OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
@@ -31,6 +42,7 @@ parameters:
31
  REASSEMBLE_OUT_CHANNELS: [96,192,384,768] # Output channels for each decoder stage
32
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
33
  READOUT_TYPE: "ignore" # Readout type for DPT decoder
 
34
  USE_BN: False # Use batch normalization in decoder
35
  DROPOUT: 0.1 # Dropout rate in decoder layers
36
  OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
@@ -40,7 +52,7 @@ parameters:
40
  TOKEN_INPAINTER:
41
  TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
42
  TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
43
- # FROM_PRETRAINED: "weights/token_inpainter.pth" # Path to pretrained token inpainter weights (optional)
44
  TOKEN_INPAINTER_LR: 1.0e-4 # Learning rate for token inpainter (can differ from base LR)
45
  DEPTH: 6 # Number of transformer blocks
46
  HEADS: 16 # Number of attention heads
@@ -73,7 +85,7 @@ parameters:
73
  VAL_SCENES: ["val_scene1","val_scene2"] # Validation scene names
74
  TARGET_SIZE: [896,896] # Target image size [height, width]
75
  RESIZE_MODE: "resize+crop" # Image resizing mode
76
- FEW_IMAGES: False # Load only first 10 images if True
77
  SAMPLE_EVERY_N: 2 # Load every Nth frame
78
  LOAD_RGB_ONLY: True # Ignore polarization data if True
79
 
@@ -82,7 +94,7 @@ parameters:
82
  # VAL_SCENES: "station" # Validation scene names (optional)
83
  TARGET_SIZE: [896,896] # Target image size [height, width]
84
  RESIZE_MODE: "resize" # Image resizing mode
85
- FEW_IMAGES: False # Load only first 10 images if True
86
  SAMPLE_EVERY_N: 2 # Load every Nth frame
87
  LOAD_RGB_ONLY: True # Ignore polarization data if True
88
 
@@ -91,7 +103,7 @@ parameters:
91
  VAL_SCENES: "PSD_Val" # Validation scene name (string or list)
92
  TARGET_SIZE: [896,896] # Target image size [height, width]
93
  RESIZE_MODE: "resize+crop" # Image resizing mode
94
- FEW_IMAGES: False # Load only first 10 images if True
95
  SAMPLE_EVERY_N: 1 # Load every Nth frame (1 = all frames)
96
  LOAD_RGB_ONLY: True # Ignore polarization data if True
97
 
@@ -101,7 +113,7 @@ parameters:
101
  RESIZE_MODE: "resize+crop" # Image resizing mode
102
  SAMPLE_EVERY_N: 8 # Load every Nth frame
103
  LOAD_RGB_ONLY: True # Ignore polarization data if True
104
- FEW_IMAGES: False # Load only first 10 images if True
105
  HIGHLIGHT_ENABLE: False # Enable highlight detection/processing in dataset
106
  HIGHLIGHT_BRIGHTNESS_THRESHOLD: 0.9 # Brightness threshold for highlight detection (0-1)
107
  HIGHLIGHT_RETURN_MASK: True # Return highlight mask in dataset output
@@ -129,7 +141,7 @@ parameters:
129
  RESIZE_MODE: "resize+crop" # Image resizing mode
130
  SAMPLE_EVERY_N: 10 # Load every Nth frame
131
  LOAD_RGB_ONLY: True # Ignore polarization data if True
132
- FEW_IMAGES: False # Load only first 10 images if True
133
  HIGHLIGHT_ENABLE: False # Enable highlight detection/processing
134
  HIGHLIGHT_BRIGHTNESS_THRESHOLD: 0.9 # Brightness threshold for highlight detection
135
  HIGHLIGHT_RETURN_MASK: True # Return highlight mask in dataset output
@@ -141,24 +153,17 @@ parameters:
141
  VAL_SCENES: ["realsense"] # Validation scene names
142
  TARGET_SIZE: [896,896] # Target image size [height, width]
143
  RESIZE_MODE: "resize+crop" # Image resizing mode
144
- SAMPLE_EVERY_N: 1 # Load every Nth frame
145
  LOAD_RGB_ONLY: True # Ignore polarization data if True
146
- FEW_IMAGES: False # Load only first 10 images if True
147
- HIGHLIGHT_ENABLE: False # Enable highlight detection/processing
148
- HIGHLIGHT_BRIGHTNESS_THRESHOLD: 0.9 # Brightness threshold for highlight detection
149
- HIGHLIGHT_RETURN_MASK: True # Return highlight mask in dataset output
150
- HIGHLIGHT_RECT_SIZE: [800, 800] # Size of highlight rectangle region
151
- HIGHLIGHT_RETURN_RECT_AS_RGB: False # Return highlight rectangle as RGB if True
152
- HIGHLIGHT_RETURN_RECT: True # Return highlight rectangle region if True
153
-
154
 
155
- FEW_IMAGES_OVERRIDE:
156
  value: False # If True, override all datasets' FEW_IMAGES to True (for quick debugging across all datasets)
157
 
158
  BATCH_SIZE: # Max batch size with img size 896 is 32
159
- value: 4 # Number of samples per batch (adjust based on GPU memory)
160
  NUM_WORKERS:
161
- value: 4 # Number of data loading worker processes (0 = main process only, "auto" = 90% of CPU affinity)
162
  SHUFFLE:
163
  value: True # Shuffle training data each epoch (False for validation/test)
164
  PIN_MEMORY:
 
1
+ ### WANDB SWEEP CONFIG ###
2
+ method: grid
3
+ metric:
4
+ goal: maximize
5
+ name: Validation/epoch/SSIM/diffuse
6
+
7
+ program: sweep_agent.py
8
+
9
+ project: UnReflectAnything
10
+ entity: unreflect-anything
11
+
12
  ### BASELINE: CONVERGES AFTER LONG
13
 
14
  parameters:
 
29
  REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
30
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
31
  READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
32
+ FROM_PRETRAINED: "diffuse_decoder.pt" # Path to pretrained decoder weights (optional)
33
  USE_BN: False # Use batch normalization in decoder
34
  DROPOUT: 0.1 # Dropout rate in decoder layers
35
  OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
 
42
  REASSEMBLE_OUT_CHANNELS: [96,192,384,768] # Output channels for each decoder stage
43
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
44
  READOUT_TYPE: "ignore" # Readout type for DPT decoder
45
+ FROM_PRETRAINED: "highlight_decoder.pt" # Path to pretrained decoder weights (optional)
46
  USE_BN: False # Use batch normalization in decoder
47
  DROPOUT: 0.1 # Dropout rate in decoder layers
48
  OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
 
52
  TOKEN_INPAINTER:
53
  TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
54
  TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
55
+ # FROM_PRETRAINED: "token_inpainter.pth" # Path to pretrained token inpainter weights (optional)
56
  TOKEN_INPAINTER_LR: 1.0e-4 # Learning rate for token inpainter (can differ from base LR)
57
  DEPTH: 6 # Number of transformer blocks
58
  HEADS: 16 # Number of attention heads
 
85
  VAL_SCENES: ["val_scene1","val_scene2"] # Validation scene names
86
  TARGET_SIZE: [896,896] # Target image size [height, width]
87
  RESIZE_MODE: "resize+crop" # Image resizing mode
88
+
89
  SAMPLE_EVERY_N: 2 # Load every Nth frame
90
  LOAD_RGB_ONLY: True # Ignore polarization data if True
91
 
 
94
  # VAL_SCENES: "station" # Validation scene names (optional)
95
  TARGET_SIZE: [896,896] # Target image size [height, width]
96
  RESIZE_MODE: "resize" # Image resizing mode
97
+
98
  SAMPLE_EVERY_N: 2 # Load every Nth frame
99
  LOAD_RGB_ONLY: True # Ignore polarization data if True
100
 
 
103
  VAL_SCENES: "PSD_Val" # Validation scene name (string or list)
104
  TARGET_SIZE: [896,896] # Target image size [height, width]
105
  RESIZE_MODE: "resize+crop" # Image resizing mode
106
+
107
  SAMPLE_EVERY_N: 1 # Load every Nth frame (1 = all frames)
108
  LOAD_RGB_ONLY: True # Ignore polarization data if True
109
 
 
113
  RESIZE_MODE: "resize+crop" # Image resizing mode
114
  SAMPLE_EVERY_N: 8 # Load every Nth frame
115
  LOAD_RGB_ONLY: True # Ignore polarization data if True
116
+
117
  HIGHLIGHT_ENABLE: False # Enable highlight detection/processing in dataset
118
  HIGHLIGHT_BRIGHTNESS_THRESHOLD: 0.9 # Brightness threshold for highlight detection (0-1)
119
  HIGHLIGHT_RETURN_MASK: True # Return highlight mask in dataset output
 
141
  RESIZE_MODE: "resize+crop" # Image resizing mode
142
  SAMPLE_EVERY_N: 10 # Load every Nth frame
143
  LOAD_RGB_ONLY: True # Ignore polarization data if True
144
+
145
  HIGHLIGHT_ENABLE: False # Enable highlight detection/processing
146
  HIGHLIGHT_BRIGHTNESS_THRESHOLD: 0.9 # Brightness threshold for highlight detection
147
  HIGHLIGHT_RETURN_MASK: True # Return highlight mask in dataset output
 
153
  VAL_SCENES: ["realsense"] # Validation scene names
154
  TARGET_SIZE: [896,896] # Target image size [height, width]
155
  RESIZE_MODE: "resize+crop" # Image resizing mode
156
+ SAMPLE_EVERY_N: 4 # Load every Nth frame
157
  LOAD_RGB_ONLY: True # Ignore polarization data if True
158
+
 
 
 
 
 
 
 
159
 
160
+ FEW_IMAGES_ALL_DATASETS:
161
  value: False # If True, override all datasets' FEW_IMAGES to True (for quick debugging across all datasets)
162
 
163
  BATCH_SIZE: # Max batch size with img size 896 is 32
164
+ value: 8 # Number of samples per batch (adjust based on GPU memory)
165
  NUM_WORKERS:
166
+ value: 12 # Number of data loading worker processes (0 = main process only, "auto" = 90% of CPU affinity)
167
  SHUFFLE:
168
  value: True # Shuffle training data each epoch (False for validation/test)
169
  PIN_MEMORY:
configs/{finetune_e2e.yaml → tokeninp_pretrain.yaml} RENAMED
@@ -18,12 +18,12 @@ parameters:
18
  REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
19
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
20
  READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
21
- FROM_PRETRAINED: "weights/diffuse_decoder.pt" # Path to pretrained decoder weights (optional)
22
  USE_BN: False # Use batch normalization in decoder
23
  DROPOUT: 0.1 # Dropout rate in decoder layers
24
  OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
25
  OUTPUT_CHANNELS: 3 # Number of output channels (3 for RGB diffuse image)
26
- DECODER_LR: 1.0e-5 # Custom learning rate for decoder (0.0 = frozen, 1.0 = same as base LR)
27
  NUM_FUSION_BLOCKS_TRAINABLE: 1 # Number of fusion blocks to train (0-4, null = train all if DECODER_LR != 0)
28
  TRAIN_RGB_HEAD: True # Whether to train RGB head (true/false, null = train if DECODER_LR != 0)
29
  highlight:
@@ -35,12 +35,12 @@ parameters:
35
  DROPOUT: 0.1 # Dropout rate in decoder layers
36
  OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
37
  OUTPUT_CHANNELS: 1 # Number of output channels (1 for highlight mask)
38
- DECODER_LR: 0.0 # Custom learning rate for decoder (0.0 = frozen, 1.0 = same as base LR)
39
  NUM_FUSION_BLOCKS_TRAINABLE: null # Number of fusion blocks to train (0-4, null = train all if DECODER_LR != 0)
40
  TOKEN_INPAINTER:
41
  TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
42
  TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
43
- FROM_PRETRAINED: "weights/token_inpainter.pth" # Path to pretrained token inpainter weights (optional)
44
  TOKEN_INPAINTER_LR: 1.0e-4 # Learning rate for token inpainter (can differ from base LR)
45
  DEPTH: 6 # Number of transformer blocks
46
  HEADS: 16 # Number of attention heads
@@ -73,7 +73,7 @@ parameters:
73
  VAL_SCENES: ["val_scene1","val_scene2"] # Validation scene names
74
  TARGET_SIZE: [896,896] # Target image size [height, width]
75
  RESIZE_MODE: "resize+crop" # Image resizing mode
76
- FEW_IMAGES: False # Load only first 10 images if True
77
  SAMPLE_EVERY_N: 2 # Load every Nth frame
78
  LOAD_RGB_ONLY: True # Ignore polarization data if True
79
 
@@ -82,7 +82,7 @@ parameters:
82
  # VAL_SCENES: "station" # Validation scene names (optional)
83
  TARGET_SIZE: [896,896] # Target image size [height, width]
84
  RESIZE_MODE: "resize" # Image resizing mode
85
- FEW_IMAGES: False # Load only first 10 images if True
86
  SAMPLE_EVERY_N: 2 # Load every Nth frame
87
  LOAD_RGB_ONLY: True # Ignore polarization data if True
88
 
@@ -91,7 +91,7 @@ parameters:
91
  VAL_SCENES: "PSD_Val" # Validation scene name (string or list)
92
  TARGET_SIZE: [896,896] # Target image size [height, width]
93
  RESIZE_MODE: "resize+crop" # Image resizing mode
94
- FEW_IMAGES: False # Load only first 10 images if True
95
  SAMPLE_EVERY_N: 1 # Load every Nth frame (1 = all frames)
96
  LOAD_RGB_ONLY: True # Ignore polarization data if True
97
 
@@ -101,7 +101,7 @@ parameters:
101
  RESIZE_MODE: "resize+crop" # Image resizing mode
102
  SAMPLE_EVERY_N: 8 # Load every Nth frame
103
  LOAD_RGB_ONLY: True # Ignore polarization data if True
104
- FEW_IMAGES: False # Load only first 10 images if True
105
  HIGHLIGHT_ENABLE: False # Enable highlight detection/processing in dataset
106
  HIGHLIGHT_BRIGHTNESS_THRESHOLD: 0.9 # Brightness threshold for highlight detection (0-1)
107
  HIGHLIGHT_RETURN_MASK: True # Return highlight mask in dataset output
@@ -129,7 +129,7 @@ parameters:
129
  RESIZE_MODE: "resize+crop" # Image resizing mode
130
  SAMPLE_EVERY_N: 10 # Load every Nth frame
131
  LOAD_RGB_ONLY: True # Ignore polarization data if True
132
- FEW_IMAGES: False # Load only first 10 images if True
133
  HIGHLIGHT_ENABLE: False # Enable highlight detection/processing
134
  HIGHLIGHT_BRIGHTNESS_THRESHOLD: 0.9 # Brightness threshold for highlight detection
135
  HIGHLIGHT_RETURN_MASK: True # Return highlight mask in dataset output
@@ -141,24 +141,17 @@ parameters:
141
  VAL_SCENES: ["realsense"] # Validation scene names
142
  TARGET_SIZE: [896,896] # Target image size [height, width]
143
  RESIZE_MODE: "resize+crop" # Image resizing mode
144
- SAMPLE_EVERY_N: 1 # Load every Nth frame
145
  LOAD_RGB_ONLY: True # Ignore polarization data if True
146
- FEW_IMAGES: False # Load only first 10 images if True
147
- HIGHLIGHT_ENABLE: False # Enable highlight detection/processing
148
- HIGHLIGHT_BRIGHTNESS_THRESHOLD: 0.9 # Brightness threshold for highlight detection
149
- HIGHLIGHT_RETURN_MASK: True # Return highlight mask in dataset output
150
- HIGHLIGHT_RECT_SIZE: [800, 800] # Size of highlight rectangle region
151
- HIGHLIGHT_RETURN_RECT_AS_RGB: False # Return highlight rectangle as RGB if True
152
- HIGHLIGHT_RETURN_RECT: True # Return highlight rectangle region if True
153
-
154
 
155
- FEW_IMAGES_OVERRIDE:
156
  value: False # If True, override all datasets' FEW_IMAGES to True (for quick debugging across all datasets)
157
 
158
  BATCH_SIZE: # Max batch size with img size 896 is 32
159
- value: 4 # Number of samples per batch (adjust based on GPU memory)
160
  NUM_WORKERS:
161
- value: 16 # Number of data loading worker processes (0 = main process only, "auto" = 90% of CPU affinity)
162
  SHUFFLE:
163
  value: True # Shuffle training data each epoch (False for validation/test)
164
  PIN_MEMORY:
 
18
  REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
19
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
20
  READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
21
+ FROM_PRETRAINED: "diffuse_decoder.pt" # Path to pretrained decoder weights (optional)
22
  USE_BN: False # Use batch normalization in decoder
23
  DROPOUT: 0.1 # Dropout rate in decoder layers
24
  OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
25
  OUTPUT_CHANNELS: 3 # Number of output channels (3 for RGB diffuse image)
26
+ DECODER_LR: 0.0 # Custom learning rate for decoder (0.0 = frozen, 1.0 = same as base LR)
27
  NUM_FUSION_BLOCKS_TRAINABLE: 1 # Number of fusion blocks to train (0-4, null = train all if DECODER_LR != 0)
28
  TRAIN_RGB_HEAD: True # Whether to train RGB head (true/false, null = train if DECODER_LR != 0)
29
  highlight:
 
35
  DROPOUT: 0.1 # Dropout rate in decoder layers
36
  OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
37
  OUTPUT_CHANNELS: 1 # Number of output channels (1 for highlight mask)
38
+ DECODER_LR: 5.0e-4 # Custom learning rate for decoder (0.0 = frozen, 1.0 = same as base LR)
39
  NUM_FUSION_BLOCKS_TRAINABLE: null # Number of fusion blocks to train (0-4, null = train all if DECODER_LR != 0)
40
  TOKEN_INPAINTER:
41
  TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
42
  TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
43
+ # FROM_PRETRAINED: "token_inpainter.pth" # Path to pretrained token inpainter weights (optional)
44
  TOKEN_INPAINTER_LR: 1.0e-4 # Learning rate for token inpainter (can differ from base LR)
45
  DEPTH: 6 # Number of transformer blocks
46
  HEADS: 16 # Number of attention heads
 
73
  VAL_SCENES: ["val_scene1","val_scene2"] # Validation scene names
74
  TARGET_SIZE: [896,896] # Target image size [height, width]
75
  RESIZE_MODE: "resize+crop" # Image resizing mode
76
+
77
  SAMPLE_EVERY_N: 2 # Load every Nth frame
78
  LOAD_RGB_ONLY: True # Ignore polarization data if True
79
 
 
82
  # VAL_SCENES: "station" # Validation scene names (optional)
83
  TARGET_SIZE: [896,896] # Target image size [height, width]
84
  RESIZE_MODE: "resize" # Image resizing mode
85
+
86
  SAMPLE_EVERY_N: 2 # Load every Nth frame
87
  LOAD_RGB_ONLY: True # Ignore polarization data if True
88
 
 
91
  VAL_SCENES: "PSD_Val" # Validation scene name (string or list)
92
  TARGET_SIZE: [896,896] # Target image size [height, width]
93
  RESIZE_MODE: "resize+crop" # Image resizing mode
94
+
95
  SAMPLE_EVERY_N: 1 # Load every Nth frame (1 = all frames)
96
  LOAD_RGB_ONLY: True # Ignore polarization data if True
97
 
 
101
  RESIZE_MODE: "resize+crop" # Image resizing mode
102
  SAMPLE_EVERY_N: 8 # Load every Nth frame
103
  LOAD_RGB_ONLY: True # Ignore polarization data if True
104
+
105
  HIGHLIGHT_ENABLE: False # Enable highlight detection/processing in dataset
106
  HIGHLIGHT_BRIGHTNESS_THRESHOLD: 0.9 # Brightness threshold for highlight detection (0-1)
107
  HIGHLIGHT_RETURN_MASK: True # Return highlight mask in dataset output
 
129
  RESIZE_MODE: "resize+crop" # Image resizing mode
130
  SAMPLE_EVERY_N: 10 # Load every Nth frame
131
  LOAD_RGB_ONLY: True # Ignore polarization data if True
132
+
133
  HIGHLIGHT_ENABLE: False # Enable highlight detection/processing
134
  HIGHLIGHT_BRIGHTNESS_THRESHOLD: 0.9 # Brightness threshold for highlight detection
135
  HIGHLIGHT_RETURN_MASK: True # Return highlight mask in dataset output
 
141
  VAL_SCENES: ["realsense"] # Validation scene names
142
  TARGET_SIZE: [896,896] # Target image size [height, width]
143
  RESIZE_MODE: "resize+crop" # Image resizing mode
144
+ SAMPLE_EVERY_N: 4 # Load every Nth frame
145
  LOAD_RGB_ONLY: True # Ignore polarization data if True
146
+
 
 
 
 
 
 
 
147
 
148
+ FEW_IMAGES_ALL_DATASETS:
149
  value: False # If True, override all datasets' FEW_IMAGES to True (for quick debugging across all datasets)
150
 
151
  BATCH_SIZE: # Max batch size with img size 896 is 32
152
+ value: 8 # Number of samples per batch (adjust based on GPU memory)
153
  NUM_WORKERS:
154
+ value: 12 # Number of data loading worker processes (0 = main process only, "auto" = 90% of CPU affinity)
155
  SHUFFLE:
156
  value: True # Shuffle training data each epoch (False for validation/test)
157
  PIN_MEMORY: