---
### BASELINE: CONVERGES AFTER LONG
parameters:
  ### MODEL ARCHITECTURE
  MODEL:
    value:
      MODEL_CLASS: "UnReflect_Model_TokenInpainter"  # Main model class name (must match class in models.py)
      MODEL_MODULE: "models"  # Module name to import model classes from (default: "models")
      RGB_ENCODER:
        ENCODER: "facebook/dinov3-vitl16-pretrain-lvd1689m"  # DINOv3 encoder model name (HuggingFace format)
        IMAGE_SIZE: 896  # Input image size (height and width in pixels)
        RETURN_SELECTED_LAYERS: [3, 6, 9, 12]  # Transformer layer indices to extract features from (0-indexed)
        RGB_ENCODER_LR: 0.0  # Learning rate for RGB encoder (0.0 = frozen, must be explicitly set)
      DECODERS:
        diffuse:
          FEATURE_DIM: 1024  # Feature dimension for decoder (should match encoder output)
          REASSEMBLE_OUT_CHANNELS: [768, 1024, 1536, 2048]  # Output channels for each decoder stage (DPT-style reassembly)
          REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5]  # Spatial upsampling factors for each stage
          READOUT_TYPE: "ignore"  # Readout type for DPT decoder ("ignore", "project", etc.)
          # FROM_PRETRAINED: "diffuse_decoder.pt"  # Path to pretrained decoder weights (optional)
          USE_BN: false  # Use batch normalization in decoder
          DROPOUT: 0.1  # Dropout rate in decoder layers
          OUTPUT_IMAGE_SIZE: [896, 896]  # Output image resolution [height, width]
          OUTPUT_CHANNELS: 3  # Number of output channels (3 for RGB diffuse image)
          DECODER_LR: 0.0  # Custom learning rate for decoder (0.0 = frozen, 1.0 = same as base LR)
          NUM_FUSION_BLOCKS_TRAINABLE: 1  # Number of fusion blocks to train (0-4, null = train all if DECODER_LR != 0)
          TRAIN_RGB_HEAD: true  # Whether to train RGB head (true/false, null = train if DECODER_LR != 0)
        highlight:
          FEATURE_DIM: 1024  # Feature dimension for highlight decoder
          REASSEMBLE_OUT_CHANNELS: [96, 192, 384, 768]  # Output channels for each decoder stage
          REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5]  # Spatial upsampling factors for each stage
          READOUT_TYPE: "ignore"  # Readout type for DPT decoder
          # FROM_PRETRAINED: "highlight_decoder.pt"  # Path to pretrained decoder weights (optional)
          USE_BN: false  # Use batch normalization in decoder
          DROPOUT: 0.1  # Dropout rate in decoder layers
          OUTPUT_IMAGE_SIZE: [896, 896]  # Output image resolution [height, width]
          OUTPUT_CHANNELS: 1  # Number of output channels (1 for highlight mask)
          DECODER_LR: 5.0e-4  # Custom learning rate for decoder (0.0 = frozen, 1.0 = same as base LR)
          NUM_FUSION_BLOCKS_TRAINABLE: null  # Number of fusion blocks to train (0-4, null = train all if DECODER_LR != 0)
          # NOTE(review): unlike `diffuse`, no TRAIN_RGB_HEAD key here — presumably falls back to
          # the documented null behavior (train if DECODER_LR != 0); confirm against the consumer.
      TOKEN_INPAINTER:
        TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior"  # Token inpainter class name
        TOKEN_INPAINTER_MODULE: "token_inpainters"  # Module name to import token inpainter from
        # FROM_PRETRAINED: "token_inpainter.pth"  # Path to pretrained token inpainter weights (optional)
        TOKEN_INPAINTER_LR: 1.0e-4  # Learning rate for token inpainter (can differ from base LR)
        DEPTH: 6  # Number of transformer blocks
        HEADS: 16  # Number of attention heads
        DROP: 0.05  # Dropout rate
        USE_POSITIONAL_ENCODING: true  # Enable 2D sinusoidal positional encodings
        USE_FINAL_NORM: true  # Enable final LayerNorm before output projection
        USE_LOCAL_PRIOR: true  # Blend local mean prior for masked seeds
        LOCAL_PRIOR_WEIGHT: 0.5  # Weight for local prior blending (1.0 = only mask_token, 0.0 = only local mean)
        LOCAL_PRIOR_KERNEL: 5  # Kernel size for local prior blending (> 1)
        SEED_NOISE_STD: 0.02  # Standard deviation of noise added to masked seeds during training