# UnReflectAnything — configs/pretrained_config.yaml
# Source: "Upload weights, notebooks, sample images" (commit 955ea7d, verified)
# Author: AlbeRota
---
### BASELINE: CONVERGES AFTER LONG
parameters:
  ### MODEL ARCHITECTURE
  MODEL:
    value:
      MODEL_CLASS: "UnReflect_Model_TokenInpainter" # Main model class name (must match class in models.py)
      MODEL_MODULE: "models" # Module name to import model classes from (default: "models")
      RGB_ENCODER:
        ENCODER: "facebook/dinov3-vitl16-pretrain-lvd1689m" # DINOv3 encoder model name (HuggingFace format)
        IMAGE_SIZE: 896 # Input image size (height and width in pixels)
        RETURN_SELECTED_LAYERS: [3, 6, 9, 12] # Transformer layer indices to extract features from (0-indexed)
        RGB_ENCODER_LR: 0.0 # Learning rate for RGB encoder (0.0 = frozen, must be explicitly set)
      DECODERS:
        diffuse:
          FEATURE_DIM: 1024 # Feature dimension for decoder (should match encoder output)
          REASSEMBLE_OUT_CHANNELS: [768, 1024, 1536, 2048] # Output channels for each decoder stage (DPT-style reassembly)
          REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
          READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
          # FROM_PRETRAINED: "diffuse_decoder.pt" # Path to pretrained decoder weights (optional)
          USE_BN: false # Use batch normalization in decoder
          DROPOUT: 0.1 # Dropout rate in decoder layers
          OUTPUT_IMAGE_SIZE: [896, 896] # Output image resolution [height, width]
          OUTPUT_CHANNELS: 3 # Number of output channels (3 for RGB diffuse image)
          DECODER_LR: 0.0 # Custom learning rate for decoder (0.0 = frozen, 1.0 = same as base LR)
          NUM_FUSION_BLOCKS_TRAINABLE: 1 # Number of fusion blocks to train (0-4, null = train all if DECODER_LR != 0)
          TRAIN_RGB_HEAD: true # Whether to train RGB head (true/false, null = train if DECODER_LR != 0)
        highlight:
          FEATURE_DIM: 1024 # Feature dimension for highlight decoder
          REASSEMBLE_OUT_CHANNELS: [96, 192, 384, 768] # Output channels for each decoder stage
          REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
          READOUT_TYPE: "ignore" # Readout type for DPT decoder
          # FROM_PRETRAINED: "highlight_decoder.pt" # Path to pretrained decoder weights (optional)
          USE_BN: false # Use batch normalization in decoder
          DROPOUT: 0.1 # Dropout rate in decoder layers
          OUTPUT_IMAGE_SIZE: [896, 896] # Output image resolution [height, width]
          OUTPUT_CHANNELS: 1 # Number of output channels (1 for highlight mask)
          DECODER_LR: 5.0e-4 # Custom learning rate for decoder (0.0 = frozen, 1.0 = same as base LR)
          NUM_FUSION_BLOCKS_TRAINABLE: null # Number of fusion blocks to train (0-4, null = train all if DECODER_LR != 0)
      TOKEN_INPAINTER:
        TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
        TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
        # FROM_PRETRAINED: "token_inpainter.pth" # Path to pretrained token inpainter weights (optional)
        TOKEN_INPAINTER_LR: 1.0e-4 # Learning rate for token inpainter (can differ from base LR)
        DEPTH: 6 # Number of transformer blocks
        HEADS: 16 # Number of attention heads
        DROP: 0.05 # Dropout rate
        USE_POSITIONAL_ENCODING: true # Enable 2D sinusoidal positional encodings
        USE_FINAL_NORM: true # Enable final LayerNorm before output projection
        USE_LOCAL_PRIOR: true # Blend local mean prior for masked seeds
        LOCAL_PRIOR_WEIGHT: 0.5 # Weight for local prior blending (1.0 = only mask_token, 0.0 = only local mean)
        LOCAL_PRIOR_KERNEL: 5 # Kernel size for local prior blending (> 1)
        SEED_NOISE_STD: 0.02 # Standard deviation of noise added to masked seeds during training