File size: 17,666 Bytes
679c502 10a2918 679c502 32f1686 679c502 10a2918 679c502 10a2918 679c502 32f1686 679c502 32f1686 679c502 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 |
### BASELINE: CONVERGES AFTER LONG TRAINING
parameters:
### MODEL ARCHITECTURE
MODEL:
  # Model definition: class to instantiate plus RGB encoder, decoder(s) and token inpainter settings.
  value:
    # Main model class name (must match class in models.py).
    # <<< DECODER PRETRAINING: NOT USING TOKEN INPAINTER (DIRECT FROM DINO)
    MODEL_CLASS: "UnReflect_Model"
    MODEL_MODULE: "models"  # Module name to import model classes from (default: "models")
    RGB_ENCODER:
      ENCODER: "facebook/dinov3-vitl16-pretrain-lvd1689m"  # DINOv3 encoder model name (HuggingFace format)
      IMAGE_SIZE: 896  # Input image size (height and width in pixels)
      RETURN_SELECTED_LAYERS: [3, 6, 9, 12]  # Transformer layer indices to extract features from (0-indexed)
      RGB_ENCODER_LR: 0.0  # Learning rate for RGB encoder (0.0 = frozen, must be explicitly set)
    DECODERS:
      diffuse:
        FEATURE_DIM: 1024  # Feature dimension for decoder (should match encoder output)
        REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048]  # Output channels for each decoder stage (DPT-style reassembly)
        REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5]  # Spatial upsampling factors for each stage
        READOUT_TYPE: "ignore"  # Readout type for DPT decoder ("ignore", "project", etc.)
        # <<< DECODER PRETRAINING: NO WEIGHTS HERE
        # FROM_PRETRAINED: "rgb_decoder.pth"  # Path to pretrained decoder weights (optional)
        USE_BN: False  # Use batch normalization in decoder
        DROPOUT: 0.1  # Dropout rate in decoder layers
        OUTPUT_IMAGE_SIZE: [896,896]  # Output image resolution [height, width]
        OUTPUT_CHANNELS: 3  # Number of output channels (3 for RGB diffuse image)
        # NOTE(review): the comment below describes 1.0 as "same as base LR" (which reads like a
        # multiplier), yet the configured value looks like an absolute rate - confirm how it is interpreted.
        DECODER_LR: 5.0e-5  # Custom learning rate for decoder (0.0 = frozen, 1.0 = same as base LR)
        NUM_FUSION_BLOCKS_TRAINABLE: null  # Number of fusion blocks to train (0-4, null = train all if DECODER_LR != 0)
        TRAIN_RGB_HEAD: True  # Whether to train RGB head (true/false, null = train if DECODER_LR != 0)
    # <<< DOESN'T MATTER, MODEL CLASS IS NOT TOKEN INPAINTER
    TOKEN_INPAINTER:
      TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior"  # Token inpainter class name
      TOKEN_INPAINTER_MODULE: "token_inpainters"  # Module name to import token inpainter from
      FROM_PRETRAINED: "token_inpainter.pth"  # Path to pretrained token inpainter weights (optional)
      TOKEN_INPAINTER_LR: 1.0e-5  # Learning rate for token inpainter (can differ from base LR)
      DEPTH: 6  # Number of transformer blocks
      HEADS: 16  # Number of attention heads
      DROP: 0  # Dropout rate
      USE_POSITIONAL_ENCODING: True  # Enable 2D sinusoidal positional encodings
      USE_FINAL_NORM: True  # Enable final LayerNorm before output projection
      USE_LOCAL_PRIOR: True  # Blend local mean prior for masked seeds
      LOCAL_PRIOR_WEIGHT: 0.5  # Weight for local prior blending (1.0 = only mask_token, 0.0 = only local mean)
      LOCAL_PRIOR_KERNEL: 5  # Kernel size for local prior blending (> 1)
      SEED_NOISE_STD: 0.02  # Standard deviation of noise added to masked seeds during training
INPAINT_MASK_DILATION:
  value: 3  # Dilation kernel size (pixels) for inpaint mask - must be odd
USE_TORCH_COMPILE:  # Enable PyTorch 2.0 torch.compile for faster training (experimental)
  value: False
DISTRIBUTE:
  value: "ddp"
### DATA
DATASETS:
  # One sub-mapping per dataset; each carries its own scene split, resizing and sampling options.
  value:
    SCRREAM:
      # NOTE(review): "scene044_full_00" is the only entry with a three-digit scene id -
      # confirm it is not a typo for "scene04_full_00".
      VAL_SCENES: ["scene10_full_00","scene11_full_00","scene044_full_00","scene04_reduced_00","scene04_reduced_01","scene04_reduced_02"]  # List of validation scene names
      TARGET_SIZE: [896,896]  # Target image size [height, width] in pixels
      RESIZE_MODE: "resize+crop"  # Image resizing mode: "resize", "crop", "resize+crop", or "pad"
      FEW_IMAGES: False  # If True, load only first 10 images per scene (for quick debugging)
      SAMPLE_EVERY_N: 2  # Load every Nth frame from each scene (1 = all frames, 4 = every 4th frame)
      LOAD_RGB_ONLY: True  # If True, ignore polarization data and load only RGB images
    HOUSECAT6D:
      VAL_SCENES: ["val_scene1","val_scene2"]  # Validation scene names
      TARGET_SIZE: [896,896]  # Target image size [height, width]
      RESIZE_MODE: "resize+crop"  # Image resizing mode
      FEW_IMAGES: False  # Load only first 10 images if True
      SAMPLE_EVERY_N: 2  # Load every Nth frame
      LOAD_RGB_ONLY: True  # Ignore polarization data if True
    CROMO:
      TRAIN_SCENES: ["kitchen"]  # Training scene names (list or string)
      # VAL_SCENES: "station"  # Validation scene names (optional)
      TARGET_SIZE: [896,896]  # Target image size [height, width]
      RESIZE_MODE: "resize"  # Image resizing mode
      FEW_IMAGES: False  # Load only first 10 images if True
      SAMPLE_EVERY_N: 2  # Load every Nth frame
      LOAD_RGB_ONLY: True  # Ignore polarization data if True
    PSD:
      TRAIN_SCENES: "PSD_Train"  # Training scene name (string or list)
      VAL_SCENES: "PSD_Val"  # Validation scene name (string or list)
      TARGET_SIZE: [896,896]  # Target image size [height, width]
      RESIZE_MODE: "resize+crop"  # Image resizing mode
      FEW_IMAGES: False  # Load only first 10 images if True
      SAMPLE_EVERY_N: 1  # Load every Nth frame (1 = all frames)
      LOAD_RGB_ONLY: True  # Ignore polarization data if True
    SCARED:
      VAL_SCENES: ["v22","v23","v24","v25","v26","v27","v28","v29","v30","v31","v32","v33","v34"]  # Validation scene names
      TARGET_SIZE: [896,896]  # Target image size [height, width]
      RESIZE_MODE: "resize+crop"  # Image resizing mode
      SAMPLE_EVERY_N: 8  # Load every Nth frame
      LOAD_RGB_ONLY: True  # Ignore polarization data if True
      FEW_IMAGES: False  # Load only first 10 images if True
      HIGHLIGHT_ENABLE: False  # Enable highlight detection/processing in dataset
      HIGHLIGHT_BRIGHTNESS_THRESHOLD: 0.9  # Brightness threshold for highlight detection (0-1)
      HIGHLIGHT_RETURN_MASK: True  # Return highlight mask in dataset output
      HIGHLIGHT_RECT_SIZE: [1000, 1000]  # Size of highlight rectangle region [height, width]
      HIGHLIGHT_RETURN_RECT_AS_RGB: False  # Return highlight rectangle as RGB if True
      HIGHLIGHT_RETURN_RECT: True  # Return highlight rectangle region if True
    STEREOMIS_TRACKING:
      VAL_SCENES: ["P2_2"]  # Validation scene names
      TARGET_SIZE: [896,896]  # Target image size [height, width]
      RESIZE_MODE: "resize+crop"  # Image resizing mode
      SAMPLE_EVERY_N: 4  # Load every Nth frame
      LOAD_RGB_ONLY: True  # Ignore polarization data if True
      FEW_IMAGES: False  # Load only first 10 images if True
      HIGHLIGHT_ENABLE: False  # Enable highlight detection/processing
      HIGHLIGHT_BRIGHTNESS_THRESHOLD: 0.9  # Brightness threshold for highlight detection
      HIGHLIGHT_RETURN_MASK: True  # Return highlight mask in dataset output
      HIGHLIGHT_RECT_SIZE: [800, 800]  # Size of highlight rectangle region
      HIGHLIGHT_RETURN_RECT_AS_RGB: False  # Return highlight rectangle as RGB if True
      HIGHLIGHT_RETURN_RECT: True  # Return highlight rectangle region if True
    CHOLEC80:
      VAL_SCENES: ["val"]  # Validation scene names
      TARGET_SIZE: [896,896]  # Target image size [height, width]
      RESIZE_MODE: "resize+crop"  # Image resizing mode
      SAMPLE_EVERY_N: 10  # Load every Nth frame
      LOAD_RGB_ONLY: True  # Ignore polarization data if True
      FEW_IMAGES: False  # Load only first 10 images if True
      HIGHLIGHT_ENABLE: False  # Enable highlight detection/processing
      HIGHLIGHT_BRIGHTNESS_THRESHOLD: 0.9  # Brightness threshold for highlight detection
      HIGHLIGHT_RETURN_MASK: True  # Return highlight mask in dataset output
      HIGHLIGHT_RECT_SIZE: [800, 800]  # Size of highlight rectangle region
      HIGHLIGHT_RETURN_RECT_AS_RGB: False  # Return highlight rectangle as RGB if True
      HIGHLIGHT_RETURN_RECT: True  # Return highlight rectangle region if True
    SUNRGBD:
      VAL_SCENES: ["realsense"]  # Validation scene names
      TARGET_SIZE: [896,896]  # Target image size [height, width]
      RESIZE_MODE: "resize+crop"  # Image resizing mode
      SAMPLE_EVERY_N: 1  # Load every Nth frame
      LOAD_RGB_ONLY: True  # Ignore polarization data if True
      FEW_IMAGES: False  # Load only first 10 images if True
      HIGHLIGHT_ENABLE: False  # Enable highlight detection/processing
      HIGHLIGHT_BRIGHTNESS_THRESHOLD: 0.9  # Brightness threshold for highlight detection
      HIGHLIGHT_RETURN_MASK: True  # Return highlight mask in dataset output
      HIGHLIGHT_RECT_SIZE: [800, 800]  # Size of highlight rectangle region
      HIGHLIGHT_RETURN_RECT_AS_RGB: False  # Return highlight rectangle as RGB if True
      HIGHLIGHT_RETURN_RECT: True  # Return highlight rectangle region if True
FEW_IMAGES_ALL_DATASETS:
  value: False  # If True, override all datasets' FEW_IMAGES to True (for quick debugging across all datasets)
BATCH_SIZE:  # Max batch size with img size 896 is 32
  value: 4  # Number of samples per batch (adjust based on GPU memory)
NUM_WORKERS:
  value: 12  # Number of data loading worker processes (0 = main process only, "auto" = 90% of CPU affinity)
SHUFFLE:
  value: True  # Shuffle training data each epoch (False for validation/test)
PIN_MEMORY:
  value: True  # Pin memory in DataLoader for faster GPU transfer (recommended: True)
PREFETCH_FACTOR:
  value: 2  # Number of batches to prefetch per worker (higher = more memory usage)
### HIGHLIGHTS
MOGE_MODEL:
  value: "Ruicheng/moge-2-vits-normal"  # MoGe model name for normal estimation (HuggingFace format)
SURFACE_ROUGHNESS:
  value: 8.0  # Blinn-Phong surface roughness exponent (higher = sharper highlights)
INTENSITY:
  value: 0.0  # Specular highlight intensity multiplier
LIGHT_DISTANCE_RANGE:
  value: [0.0, 1]  # Range for light source distance sampling [min, max] (normalized)
LIGHT_LEFT_RIGHT_ANGLE:
  value: [0, 360]  # Range for light source horizontal angle [min, max] in degrees
LIGHT_ABOVE_BELOW_ANGLE:
  value: [0, 360]  # Range for light source vertical angle [min, max] in degrees
DATASET_HIGHLIGHT_DILATION:
  value: 25  # Dilation kernel size (pixels) for dataset highlight masks
DATASET_HIGHLIGHT_THRESHOLD:
  value: 0.9  # Brightness/luminance threshold (0-1) for detecting highlights in dataset images
DATASET_HIGHLIGHT_USE_LUMINANCE:
  # If True, use perceptually-weighted luminance (0.299*R + 0.587*G + 0.114*B) for dataset highlights;
  # if False, use simple mean brightness.
  value: True
HIGHLIGHT_COLOR:
  value: [1.0, 1.0, 1.0]  # RGB color for synthetic highlights (normalized 0-1)
CLAMP_RECONSTRUCTION:
  value: True  # Clamp reconstructed images to [0, 1] range if True
### OPTIMIZATION
EPOCHS:
  value: 20  # Maximum number of training epochs
LEARNING_RATE:
  value: 1.0e-4  # Base learning rate for optimizer
WEIGHT_DECAY:
  value: 0.0  # L2 regularization weight (0.0 = no weight decay)
GRADIENT_ACCUMULATION_STEPS:
  value: 1  # Number of steps to accumulate gradients before optimizer step (1 = no accumulation)
WARMUP:
  value: 100  # Number of warmup steps for learning rate schedule (linear warmup from 0 to LR)
GRADIENT_CLIPPING_MAX_NORM:
  value: 8  # Maximum gradient norm for clipping (set to -1 to disable clipping)
LR_SCHEDULER:
  # NOTE(review): both ONPLATEAU and COSINE are enabled here; which one applies (or whether
  # they are combined) is not visible in this file - confirm against the training code.
  value:
    ONPLATEAU:  # ReduceLROnPlateau scheduler (reduces LR when validation metric plateaus)
      PATIENCE: 5  # Number of epochs to wait before reducing LR
      FACTOR: 0.1  # Factor by which LR is reduced (new_lr = old_lr * factor)
    COSINE:  # CosineAnnealingLR scheduler (cosine annealing schedule)
      N_PERIODS: 1  # Number of cosine periods over training
    # STEPWISE:  # StepLR scheduler (reduces LR at fixed step intervals)
    #   N_STEPS: 5  # Number of times to reduce LR during training
    #   GAMMA: 0.25  # Factor by which LR is reduced at each step (new_lr = old_lr * gamma)
    # EXPONENTIAL:  # ExponentialLR scheduler (exponential decay)
    #   GAMMA: 0.5  # Multiplicative factor for exponential decay
SWITCH_OPTIMIZER_EPOCH:
  value: null  # Epoch number to switch from bootstrap to refining optimizer (null = no switch)
OPTIMIZER_BOOTSTRAP_NAME:
  value: "AdamW"  # Optimizer name for initial training phase ("Adam", "SGD", etc.)
OPTIMIZER_REFINING_NAME:
  value: "AdamW"  # Optimizer name for refining phase (used after SWITCH_OPTIMIZER_EPOCH)
EARLY_STOPPING_PATIENCE:
  # NOTE(review): equal to EPOCHS (20), so the patience presumably cannot run out before
  # training ends - confirm that effectively disabling early stopping is intended.
  value: 20  # Number of epochs without improvement before stopping training
SAVE_INTERVAL:
  value: 1000  # Number of training steps between model checkpoints
DATASET_HIGHLIGHT_SUPERVISION_THRESHOLD:
  value: 0.1  # Pixel highlights above this threshold (should be low) are excluded from supervision
### LOSS WEIGHTS (relative to the total loss, NOT NORMALIZED LATER)
SPECULAR_LOSS_WEIGHT:
  value: 0.0  # Weight for specular component reconstruction loss
DIFFUSE_LOSS_WEIGHT:
  value: 1.0  # Weight for diffuse component reconstruction loss
HIGHLIGHT_LOSS_WEIGHT:
  value: 0.0  # Weight for highlight mask regression loss
TOKEN_INPAINT_LOSS_WEIGHT:
  value: 0.0  # Weight for token-space inpainting loss (L1 + cosine similarity in feature space)
IMAGE_RECONSTRUCTION_LOSS_WEIGHT:
  value: 0.0  # Weight for full image reconstruction loss
SATURATION_RING_LOSS_WEIGHT:
  value: 0.0  # Weight for saturation ring consistency loss (around highlight regions)
RING_KERNEL_SIZE:
  value: 11  # Kernel size (odd number) for saturation ring dilation around highlights
RING_VAR_WEIGHT:
  value: 0.5  # Weight for variance matching in saturation ring loss (vs mean matching)
RING_TEXTURE_WEIGHT:
  value: 0.0  # Weight for texture consistency term in saturation ring loss
HLREG_W_L1:
  value: 1.0  # Weight for L1 loss in highlight regression
HLREG_USE_CHARB:
  value: True  # Use Charbonnier loss (smooth L1) instead of standard L1 if True
HLREG_W_DICE:
  value: 0.2  # Weight for Dice loss in highlight regression (for mask overlap)
HLREG_W_SSIM:
  value: 0.0  # Weight for SSIM loss in highlight regression
HLREG_W_GRAD:
  value: 0.0  # Weight for gradient loss in highlight regression
HLREG_W_TV:
  value: 0.0  # Weight for total variation loss in highlight regression
HLREG_BALANCE_MODE:
  value: "auto"  # Class balancing mode for highlight regression: 'none' | 'auto' | 'pos_weight'
HLREG_POS_WEIGHT:
  value: 1.0  # Positive class weight (used only if BALANCE_MODE == 'pos_weight')
HLREG_FOCAL_GAMMA:
  value: 2.0  # Focal loss gamma parameter (0.0 = standard BCE, 1.0-2.0 helps with gradient vanishing)
WEIGHT_CONTEXT_IDENTITY:
  value: 0.0  # LEAVE AT 0.0: Weight for L1 loss on context (non-masked) regions (identity preservation)
WEIGHT_TV_IN_HOLE:
  value: 0.0  # LEAVE AT 0.0: Weight for total variation loss inside masked/hole regions
RING_DILATE_KERNEL:
  value: 17  # Dilation kernel size (odd number) for creating ring mask around highlights
WEIGHT_SEAM:
  value: 0.0  # Weight for gradient matching loss on saturation ring
SEAM_USE_CHARB:
  value: True  # Use Charbonnier loss instead of L1 in seam loss (smooth L1 for boundary consistency)
SEAM_WEIGHT_GRAD:
  value: 0.0  # Weight for gradient matching term inside seam loss (0.0 = disable gradient term)
TOKEN_FEAT_ALPHA:
  value: 0.5  # Mixing factor for token feature loss: alpha * L1 + (1-alpha) * (1-cosine_sim)
### DIFFUSE HIGHLIGHT PENALTY
WEIGHT_DIFFUSE_HIGHLIGHT_PENALTY:
  value: 0.0  # Weight for penalty loss on highlights in diffuse decoder output (0.0 = disabled)
DIFFUSE_HL_THRESHOLD:
  value: 0.8  # Brightness/luminance threshold for detecting highlights in diffuse (0.0-1.0)
DIFFUSE_HL_USE_CHARB:
  value: True  # Use Charbonnier loss instead of L1 for diffuse highlight penalty
DIFFUSE_HL_PENALTY_MODE:
  # Penalty mode: "brightness" (penalize brightness/luminance above threshold)
  # or "pixel" (penalize RGB values directly).
  value: "brightness"
DIFFUSE_HL_TARGET_BRIGHTNESS:
  value: null  # Target brightness/luminance for penalized pixels (null = use threshold value)
DIFFUSE_HL_USE_LUMINANCE:
  # If True, use perceptually-weighted luminance (0.299*R + 0.587*G + 0.114*B);
  # if False, use simple mean brightness.
  value: True
### LOGGING, RESULTS AND WANDB
LOG_INTERVAL:
  value: 1  # Number of training steps between console log outputs
WANDB_LOG_INTERVAL:
  value: 1  # Number of training steps between WandB metric logs
IMAGE_LOG_INTERVAL:
  value: 5  # Number of training steps between image logging to WandB
NO_WANDB:
  value: False  # Disable WandB logging if True (useful for local debugging)
MODEL_WATCHER_FREQ_WANDB:
  value: 50  # Frequency (in steps) for logging model parameter histograms to WandB
WANDB_ENTITY:
  value: "unreflect-anything"  # WandB organization/entity name
WANDB_PROJECT:
  value: "UnReflectAnything"  # WandB project name
NOTES:
  value: ""  # Notes/description for this training run
|