896 half-pretrained!
Browse files
configs/pretrained_config.yaml
CHANGED
|
@@ -10,25 +10,23 @@ parameters:
|
|
| 10 |
RGB_ENCODER:
|
| 11 |
ENCODER: "facebook/dinov3-vitl16-pretrain-lvd1689m" # DINOv3 encoder model name (HuggingFace format)
|
| 12 |
IMAGE_SIZE: 896 # Input image size (height and width in pixels)
|
| 13 |
-
RETURN_SELECTED_LAYERS: [3, 6, 9, 12] # Transformer layer indices to extract features from (0-indexed)
|
| 14 |
RGB_ENCODER_LR: 0.0 # Learning rate for RGB encoder (0.0 = frozen, must be explicitly set)
|
| 15 |
DECODERS:
|
| 16 |
diffuse:
|
| 17 |
-
USE_FILM: False # Enable FiLM (Feature-wise Linear Modulation) conditioning in decoder
|
| 18 |
FEATURE_DIM: 1024 # Feature dimension for decoder (should match encoder output)
|
| 19 |
REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
|
| 20 |
REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
|
| 21 |
READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
|
| 22 |
-
|
| 23 |
USE_BN: False # Use batch normalization in decoder
|
| 24 |
DROPOUT: 0.1 # Dropout rate in decoder layers
|
| 25 |
OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
|
| 26 |
OUTPUT_CHANNELS: 3 # Number of output channels (3 for RGB diffuse image)
|
| 27 |
-
DECODER_LR:
|
| 28 |
NUM_FUSION_BLOCKS_TRAINABLE: 1 # Number of fusion blocks to train (0-4, null = train all if DECODER_LR != 0)
|
| 29 |
TRAIN_RGB_HEAD: True # Whether to train RGB head (true/false, null = train if DECODER_LR != 0)
|
| 30 |
highlight:
|
| 31 |
-
USE_FILM: False # Enable FiLM conditioning in highlight decoder
|
| 32 |
FEATURE_DIM: 1024 # Feature dimension for highlight decoder
|
| 33 |
REASSEMBLE_OUT_CHANNELS: [96,192,384,768] # Output channels for each decoder stage
|
| 34 |
REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
|
|
@@ -42,11 +40,11 @@ parameters:
|
|
| 42 |
TOKEN_INPAINTER:
|
| 43 |
TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
|
| 44 |
TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
|
| 45 |
-
# FROM_PRETRAINED: "weights/token_inpainter.pth" # Path to pretrained token inpainter weights
|
| 46 |
-
TOKEN_INPAINTER_LR: 1.0e-4
|
| 47 |
DEPTH: 6 # Number of transformer blocks
|
| 48 |
HEADS: 16 # Number of attention heads
|
| 49 |
-
DROP: 0 # Dropout rate
|
| 50 |
USE_POSITIONAL_ENCODING: True # Enable 2D sinusoidal positional encodings
|
| 51 |
USE_FINAL_NORM: True # Enable final LayerNorm before output projection
|
| 52 |
USE_LOCAL_PRIOR: True # Blend local mean prior for masked seeds
|
|
@@ -54,6 +52,4 @@ parameters:
|
|
| 54 |
LOCAL_PRIOR_KERNEL: 5 # Kernel size for local prior blending (> 1)
|
| 55 |
SEED_NOISE_STD: 0.02 # Standard deviation of noise added to masked seeds during training
|
| 56 |
INPAINT_MASK_DILATION:
|
| 57 |
-
value:
|
| 58 |
-
USE_TORCH_COMPILE: # Enable PyTorch 2.0 torch.compile for faster training (experimental)
|
| 59 |
-
value: False
|
|
|
|
| 10 |
RGB_ENCODER:
|
| 11 |
ENCODER: "facebook/dinov3-vitl16-pretrain-lvd1689m" # DINOv3 encoder model name (HuggingFace format)
|
| 12 |
IMAGE_SIZE: 896 # Input image size (height and width in pixels)
|
| 13 |
+
RETURN_SELECTED_LAYERS: [3, 6, 9, 12] # Transformer layer indices to extract features from (0-indexed)
|
| 14 |
RGB_ENCODER_LR: 0.0 # Learning rate for RGB encoder (0.0 = frozen, must be explicitly set)
|
| 15 |
DECODERS:
|
| 16 |
diffuse:
|
|
|
|
| 17 |
FEATURE_DIM: 1024 # Feature dimension for decoder (should match encoder output)
|
| 18 |
REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
|
| 19 |
REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
|
| 20 |
READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
|
| 21 |
+
FROM_PRETRAINED: "weights/decoder_896.pth" # Path to pretrained decoder weights (optional)
|
| 22 |
USE_BN: False # Use batch normalization in decoder
|
| 23 |
DROPOUT: 0.1 # Dropout rate in decoder layers
|
| 24 |
OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
|
| 25 |
OUTPUT_CHANNELS: 3 # Number of output channels (3 for RGB diffuse image)
|
| 26 |
+
DECODER_LR: 0.0 # Custom learning rate for decoder (0.0 = frozen, 1.0 = same as base LR)
|
| 27 |
NUM_FUSION_BLOCKS_TRAINABLE: 1 # Number of fusion blocks to train (0-4, null = train all if DECODER_LR != 0)
|
| 28 |
TRAIN_RGB_HEAD: True # Whether to train RGB head (true/false, null = train if DECODER_LR != 0)
|
| 29 |
highlight:
|
|
|
|
| 30 |
FEATURE_DIM: 1024 # Feature dimension for highlight decoder
|
| 31 |
REASSEMBLE_OUT_CHANNELS: [96,192,384,768] # Output channels for each decoder stage
|
| 32 |
REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
|
|
|
|
| 40 |
TOKEN_INPAINTER:
|
| 41 |
TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
|
| 42 |
TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
|
| 43 |
+
# FROM_PRETRAINED: "weights/token_inpainter.pth" # Path to pretrained token inpainter weights (optional)
|
| 44 |
+
TOKEN_INPAINTER_LR: 1.0e-4 # Learning rate for token inpainter (can differ from base LR)
|
| 45 |
DEPTH: 6 # Number of transformer blocks
|
| 46 |
HEADS: 16 # Number of attention heads
|
| 47 |
+
DROP: 0.05 # Dropout rate
|
| 48 |
USE_POSITIONAL_ENCODING: True # Enable 2D sinusoidal positional encodings
|
| 49 |
USE_FINAL_NORM: True # Enable final LayerNorm before output projection
|
| 50 |
USE_LOCAL_PRIOR: True # Blend local mean prior for masked seeds
|
|
|
|
| 52 |
LOCAL_PRIOR_KERNEL: 5 # Kernel size for local prior blending (> 1)
|
| 53 |
SEED_NOISE_STD: 0.02 # Standard deviation of noise added to masked seeds during training
|
| 54 |
INPAINT_MASK_DILATION:
|
| 55 |
+
value: 3 # Dilation kernel size (pixels) for inpaint mask - Must be odd
|
|
|
|
|
|