---
### BASELINE: CONVERGES AFTER LONG TRAINING
parameters:
### MODEL ARCHITECTURE
MODEL:
value:
MODEL_CLASS: "UnReflect_Model" # Main model class name (must match class in models.py) # <<<<<<<<< DECODER PRETRAINING: NOT USING TOKEN INPAINTER (DIRECT FROM DINO)
MODEL_MODULE: "models" # Module name to import model classes from (default: "models")
RGB_ENCODER:
ENCODER: "facebook/dinov3-vitl16-pretrain-lvd1689m" # DINOv3 encoder model name (HuggingFace format)
IMAGE_SIZE: 896 # Input image size (height and width in pixels)
RETURN_SELECTED_LAYERS: [3, 6, 9, 12] # Transformer layer indices to extract features from (0-indexed)
RGB_ENCODER_LR: 0.0 # Learning rate for RGB encoder (0.0 = frozen, must be explicitly set)
DECODERS:
highlight:
FEATURE_DIM: 1024 # Feature dimension for highlight decoder
REASSEMBLE_OUT_CHANNELS: [96,192,384,768] # Output channels for each decoder stage
REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
READOUT_TYPE: "ignore" # Readout type for DPT decoder
# FROM_PRETRAINED: "highlight_decoder.pt" # Path to pretrained token inpainter weights (optional)
USE_BN: False # Use batch normalization in decoder
DROPOUT: 0.1 # Dropout rate in decoder layers
OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
OUTPUT_CHANNELS: 1 # Number of output channels (1 for highlight mask)
DECODER_LR: 1.0e-6 # Custom learning rate for decoder (0.0 = frozen, 1.0 = same as base LR)
NUM_FUSION_BLOCKS_TRAINABLE: null # Number of fusion blocks to train (0-4, null = train all if DECODER_LR != 0)
TOKEN_INPAINTER: # <<<<<<<<<<<< DOESNT MATTER, MODEL CLASS IS NOT TOKEN INPAINTER
TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
FROM_PRETRAINED: "token_inpainter.pth" # Path to pretrained token inpainter weights (optional)
TOKEN_INPAINTER_LR: 1.0e-5 # Learning rate for token inpainter (can differ from base LR)
DEPTH: 6 # Number of transformer blocks
HEADS: 16 # Number of attention heads
DROP: 0 # Dropout rate
USE_POSITIONAL_ENCODING: True # Enable 2D sinusoidal positional encodings
USE_FINAL_NORM: True # Enable final LayerNorm before output projection
USE_LOCAL_PRIOR: True # Blend local mean prior for masked seeds
LOCAL_PRIOR_WEIGHT: 0.5 # Weight for local prior blending (1.0 = only mask_token, 0.0 = only local mean)
LOCAL_PRIOR_KERNEL: 5 # Kernel size for local prior blending (> 1)
SEED_NOISE_STD: 0.02 # Standard deviation of noise added to masked seeds during training
INPAINT_MASK_DILATION:
value: 3 # Dilation kernel size (pixels) for inpaint mask - Must be odd
USE_TORCH_COMPILE: # Enable PyTorch 2.0 torch.compile for faster training (experimental)
value: False
DISTRIBUTE:
value: "ddp"
### DATA
DATASETS:
value:
SHIQ:
VAL_SCENES: ["test"]
RESIZE_MODE: "resize+crop"
TARGET_SIZE: [896,896]
SAMPLE_EVERY_N: 2
SCARED:
VAL_SCENES: ["v22","v23","v24","v25","v26","v27","v28","v29","v30","v31","v32","v33","v34"] # Validation scene names
RESIZE_MODE: "resize+crop" # Image resizing mode
SAMPLE_EVERY_N: 4 # Load every Nth frame
ALL_DATASETS:
FEW_IMAGES: False
TARGET_SIZE: [896,896]
LOAD_RGB_ONLY: True
LOAD_HIGHLIGHT: True
BATCH_SIZE: # Max batch size with img size 896 is 32
value: 20 # Number of samples per batch (adjust based on GPU memory)
NUM_WORKERS:
value: 12 # Number of data loading worker processes (0 = main process only, "auto" = 90% of CPU affinity)
SHUFFLE:
value: True # Shuffle training data each epoch (False for validation/test)
PIN_MEMORY:
value: True # Pin memory in DataLoader for faster GPU transfer (recommended: True)
PREFETCH_FACTOR:
value: 2 # Number of batches to prefetch per worker (higher = more memory usage)
### HIGHLIGHTS
MOGE_MODEL:
value: "Ruicheng/moge-2-vits-normal" # MoGe model name for normal estimation (HuggingFace format)
SURFACE_ROUGHNESS:
value: 100.0 # Blinn-Phong surface roughness exponent (higher = sharper highlights)
INTENSITY:
value: 0.8 # Specular highlight intensity multiplier
LIGHT_DISTANCE_RANGE:
    value: [0.0, 1.0] # Range for light source distance sampling [min, max] (normalized)
LIGHT_LEFT_RIGHT_ANGLE:
value: [0, 360] # Range for light source horizontal angle [min, max] in degrees
LIGHT_ABOVE_BELOW_ANGLE:
value: [0, 360] # Range for light source vertical angle [min, max] in degrees
DATASET_HIGHLIGHT_DILATION:
    value: 25 # Dilation kernel size (pixels) for dataset highlight masks
DATASET_HIGHLIGHT_THRESHOLD:
value: 0.9 # Brightness/luminance threshold (0-1) for detecting highlights in dataset images
DATASET_HIGHLIGHT_USE_LUMINANCE:
value: True # If True, use perceptually-weighted luminance (0.299*R + 0.587*G + 0.114*B) for dataset highlights; if False, use simple mean brightness
HIGHLIGHT_COLOR:
value: [1.0, 1.0, 1.0] # RGB color for synthetic highlights (normalized 0-1)
CLAMP_RECONSTRUCTION:
value: True # Clamp reconstructed images to [0, 1] range if True
### OPTIMIZATION
EPOCHS:
    value: 20 # Maximum number of training epochs
LEARNING_RATE:
value: 1.0e-4 # Base learning rate for optimizer
WEIGHT_DECAY:
value: 0.0 # L2 regularization weight (0.0 = no weight decay)
GRADIENT_ACCUMULATION_STEPS:
value: 1 # Number of steps to accumulate gradients before optimizer step (1 = no accumulation)
WARMUP:
value: 100 # Number of warmup steps for learning rate schedule (linear warmup from 0 to LR)
GRADIENT_CLIPPING_MAX_NORM:
value: 8 # Maximum gradient norm for clipping (set to -1 to disable clipping)
LR_SCHEDULER:
value:
ONPLATEAU: # ReduceLROnPlateau scheduler (reduces LR when validation metric plateaus)
PATIENCE: 5 # Number of epochs to wait before reducing LR
FACTOR: 0.1 # Factor by which LR is reduced (new_lr = old_lr * factor)
COSINE: # CosineAnnealingLR scheduler (cosine annealing schedule)
N_PERIODS: 1 # Number of cosine periods over training
# STEPWISE: # StepLR scheduler (reduces LR at fixed step intervals)
# N_STEPS: 5 # Number of times to reduce LR during training
# GAMMA: 0.25 # Factor by which LR is reduced at each step (new_lr = old_lr * gamma)
# EXPONENTIAL: # ExponentialLR scheduler (exponential decay)
# GAMMA: 0.5 # Multiplicative factor for exponential decay
SWITCH_OPTIMIZER_EPOCH:
value: null # Epoch number to switch from bootstrap to refining optimizer (null = no switch)
OPTIMIZER_BOOTSTRAP_NAME:
value: "AdamW" # Optimizer name for initial training phase ("Adam", "SGD", etc.)
OPTIMIZER_REFINING_NAME:
value: "AdamW" # Optimizer name for refining phase (used after SWITCH_OPTIMIZER_EPOCH)
EARLY_STOPPING_PATIENCE:
value: 20 # Number of epochs without improvement before stopping training
SAVE_INTERVAL:
value: 1000 # Number of training steps between model checkpoints
DATASET_HIGHLIGHT_SUPERVISION_THRESHOLD:
value: 0.1 # Pixel highlights above this threshold (should be low) are excluded from supervision
### LOSS WEIGHTS (relative to the total loss, NOT NORMALIZED LATER)
SPECULAR_LOSS_WEIGHT:
value: 0.0 # Weight for specular component reconstruction loss
DIFFUSE_LOSS_WEIGHT:
value: 0.0 # Weight for diffuse component reconstruction loss
HIGHLIGHT_LOSS_WEIGHT:
value: 1.0 # Weight for highlight mask regression loss
TOKEN_INPAINT_LOSS_WEIGHT:
value: 0.0 # Weight for token-space inpainting loss (L1 + cosine similarity in feature space)
IMAGE_RECONSTRUCTION_LOSS_WEIGHT:
value: 0.0 # Weight for full image reconstruction loss
SATURATION_RING_LOSS_WEIGHT:
value: 0.0 # Weight for saturation ring consistency loss (around highlight regions)
RING_KERNEL_SIZE:
value: 11 # Kernel size (odd number) for saturation ring dilation around highlights
RING_VAR_WEIGHT:
value: 0.5 # Weight for variance matching in saturation ring loss (vs mean matching)
RING_TEXTURE_WEIGHT:
value: 0.0 # Weight for texture consistency term in saturation ring loss
HLREG_W_L1:
value: 1.0 # Weight for L1 loss in highlight regression
HLREG_USE_CHARB:
value: True # Use Charbonnier loss (smooth L1) instead of standard L1 if True
HLREG_W_DICE:
value: 0.2 # Weight for Dice loss in highlight regression (for mask overlap)
HLREG_W_SSIM:
value: 0.0 # Weight for SSIM loss in highlight regression
HLREG_W_GRAD:
value: 0.0 # Weight for gradient loss in highlight regression
HLREG_W_TV:
value: 0.0 # Weight for total variation loss in highlight regression
HLREG_BALANCE_MODE:
value: "auto" # Class balancing mode for highlight regression: 'none' | 'auto' | 'pos_weight'
HLREG_POS_WEIGHT:
value: 1.0 # Positive class weight (used only if BALANCE_MODE == 'pos_weight')
HLREG_FOCAL_GAMMA:
value: 2.0 # Focal loss gamma parameter (0.0 = standard BCE, 1.0-2.0 helps with gradient vanishing)
WEIGHT_CONTEXT_IDENTITY:
value: 0.0 # LEAVE TO 0.0: Weight for L1 loss on context (non-masked) regions (identity preservation)
WEIGHT_TV_IN_HOLE:
value: 0.0 # LEAVE TO 0.0: Weight for total variation loss inside masked/hole regions
RING_DILATE_KERNEL:
value: 17 # Dilation kernel size (odd number) for creating ring mask around highlights
WEIGHT_SEAM:
value: 0.0 # Weight for gradient matching loss on saturation ring
SEAM_USE_CHARB:
value: True # Use Charbonnier loss instead of L1 in seam loss (smooth L1 for boundary consistency)
SEAM_WEIGHT_GRAD:
value: 0.0 # Weight for gradient matching term inside seam loss (0.0 = disable gradient term)
TOKEN_FEAT_ALPHA:
value: 0.5 # Mixing factor for token feature loss: alpha * L1 + (1-alpha) * (1-cosine_sim)
### DIFFUSE HIGHLIGHT PENALTY
WEIGHT_DIFFUSE_HIGHLIGHT_PENALTY:
value: 0.0 # Weight for penalty loss on highlights in diffuse decoder output (0.0 = disabled)
DIFFUSE_HL_THRESHOLD:
value: 0.8 # Brightness/luminance threshold for detecting highlights in diffuse (0.0-1.0)
DIFFUSE_HL_USE_CHARB:
value: True # Use Charbonnier loss instead of L1 for diffuse highlight penalty
DIFFUSE_HL_PENALTY_MODE:
value: "brightness" # Penalty mode: "brightness" (penalize brightness/luminance above threshold) or "pixel" (penalize RGB values directly)
DIFFUSE_HL_TARGET_BRIGHTNESS:
value: null # Target brightness/luminance for penalized pixels (null = use threshold value)
DIFFUSE_HL_USE_LUMINANCE:
value: True # If True, use perceptually-weighted luminance (0.299*R + 0.587*G + 0.114*B); if False, use simple mean brightness
### LOGGING, RESULTS AND WANDB
LOG_INTERVAL:
value: 1 # Number of training steps between console log outputs
WANDB_LOG_INTERVAL:
value: 1 # Number of training steps between WandB metric logs
IMAGE_LOG_INTERVAL:
value: 5 # Number of training steps between image logging to WandB
NO_WANDB:
value: False # Disable WandB logging if True (useful for local debugging)
MODEL_WATCHER_FREQ_WANDB:
value: 50 # Frequency (in steps) for logging model parameter histograms to WandB
WANDB_ENTITY:
value: "unreflect-anything" # WandB organization/entity name
WANDB_PROJECT:
value: "UnReflectAnything" # WandB project name
NOTES:
value: "" # Notes/description for this training run