---
### BASELINE: CONVERGES AFTER LONG
parameters:
  ### MODEL ARCHITECTURE
  MODEL:
    value:
      MODEL_CLASS: "UnReflect_Model_TokenInpainter"  # Main model class name (must match class in models.py)
      MODEL_MODULE: "models"  # Module name to import model classes from (default: "models")
      RGB_ENCODER:
        ENCODER: "facebook/dinov3-vitl16-pretrain-lvd1689m"  # DINOv3 encoder model name (HuggingFace format)
        IMAGE_SIZE: 896  # Input image size (height and width in pixels)
        RETURN_SELECTED_LAYERS: [3, 6, 9, 12]  # Transformer layer indices to extract features from (0-indexed)
        RGB_ENCODER_LR: 0.0  # Learning rate for RGB encoder (0.0 = frozen, must be explicitly set)
      DECODERS:
        diffuse:
          FEATURE_DIM: 1024  # Feature dimension for decoder (should match encoder output)
          REASSEMBLE_OUT_CHANNELS: [768, 1024, 1536, 2048]  # Output channels for each decoder stage (DPT-style reassembly)
          REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5]  # Spatial upsampling factors for each stage
          READOUT_TYPE: "ignore"  # Readout type for DPT decoder ("ignore", "project", etc.)
          # FROM_PRETRAINED: "diffuse_decoder.pt"  # Path to pretrained decoder weights (optional)
          USE_BN: false  # Use batch normalization in decoder
          DROPOUT: 0.1  # Dropout rate in decoder layers
          OUTPUT_IMAGE_SIZE: [896, 896]  # Output image resolution [height, width]
          OUTPUT_CHANNELS: 3  # Number of output channels (3 for RGB diffuse image)
          DECODER_LR: 0.0  # Custom learning rate for decoder (0.0 = frozen, 1.0 = same as base LR)
          NUM_FUSION_BLOCKS_TRAINABLE: 1  # Number of fusion blocks to train (0-4, null = train all if DECODER_LR != 0)
          TRAIN_RGB_HEAD: true  # Whether to train RGB head (true/false, null = train if DECODER_LR != 0)
        highlight:
          FEATURE_DIM: 1024  # Feature dimension for highlight decoder
          REASSEMBLE_OUT_CHANNELS: [96, 192, 384, 768]  # Output channels for each decoder stage
          REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5]  # Spatial upsampling factors for each stage
          READOUT_TYPE: "ignore"  # Readout type for DPT decoder
          # FROM_PRETRAINED: "highlight_decoder.pt"  # Path to pretrained decoder weights (optional)
          USE_BN: false  # Use batch normalization in decoder
          DROPOUT: 0.1  # Dropout rate in decoder layers
          OUTPUT_IMAGE_SIZE: [896, 896]  # Output image resolution [height, width]
          OUTPUT_CHANNELS: 1  # Number of output channels (1 for highlight mask)
          DECODER_LR: 5.0e-4  # Custom learning rate for decoder (0.0 = frozen, 1.0 = same as base LR)
          NUM_FUSION_BLOCKS_TRAINABLE: null  # Number of fusion blocks to train (0-4, null = train all if DECODER_LR != 0)
          # NOTE(review): unlike `diffuse`, no TRAIN_RGB_HEAD key here — presumably falls back to
          # the documented null behavior (train if DECODER_LR != 0); confirm against the consumer.
      TOKEN_INPAINTER:
        TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior"  # Token inpainter class name
        TOKEN_INPAINTER_MODULE: "token_inpainters"  # Module name to import token inpainter from
        # FROM_PRETRAINED: "token_inpainter.pth"  # Path to pretrained token inpainter weights (optional)
        TOKEN_INPAINTER_LR: 1.0e-4  # Learning rate for token inpainter (can differ from base LR)
        DEPTH: 6  # Number of transformer blocks
        HEADS: 16  # Number of attention heads
        DROP: 0.05  # Dropout rate
        USE_POSITIONAL_ENCODING: true  # Enable 2D sinusoidal positional encodings
        USE_FINAL_NORM: true  # Enable final LayerNorm before output projection
        USE_LOCAL_PRIOR: true  # Blend local mean prior for masked seeds
        LOCAL_PRIOR_WEIGHT: 0.5  # Weight for local prior blending (1.0 = only mask_token, 0.0 = only local mean)
        LOCAL_PRIOR_KERNEL: 5  # Kernel size for local prior blending (> 1)
        SEED_NOISE_STD: 0.02  # Standard deviation of noise added to masked seeds during training