Upload weights, notebooks, sample images
Browse files
configs/pretrained_config.yaml
CHANGED
|
@@ -9,7 +9,7 @@ parameters:
|
|
| 9 |
MODEL_MODULE: "models" # Module name to import model classes from (default: "models")
|
| 10 |
RGB_ENCODER:
|
| 11 |
ENCODER: "facebook/dinov3-vitl16-pretrain-lvd1689m" # DINOv3 encoder model name (HuggingFace format)
|
| 12 |
-
IMAGE_SIZE:
|
| 13 |
RETURN_SELECTED_LAYERS: [3, 6, 9, 12] # Transformer layer indices to extract features from (0-indexed)
|
| 14 |
RGB_ENCODER_LR: 0.0 # Learning rate for RGB encoder (0.0 = frozen, must be explicitly set)
|
| 15 |
DECODERS:
|
|
@@ -19,10 +19,10 @@ parameters:
|
|
| 19 |
REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
|
| 20 |
REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial resampling factors for each stage (>1 upsamples, <1 downsamples)
|
| 21 |
READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
|
| 22 |
-
FROM_PRETRAINED: "weights/rgb_decoder.pth" # Path to pretrained decoder weights (optional)
|
| 23 |
USE_BN: False # Use batch normalization in decoder
|
| 24 |
DROPOUT: 0.1 # Dropout rate in decoder layers
|
| 25 |
-
OUTPUT_IMAGE_SIZE: [
|
| 26 |
OUTPUT_CHANNELS: 3 # Number of output channels (3 for RGB diffuse image)
|
| 27 |
DECODER_LR: 1.0e-5 # Custom learning rate for decoder (0.0 = frozen, 1.0 = same as base LR)
|
| 28 |
NUM_FUSION_BLOCKS_TRAINABLE: 1 # Number of fusion blocks to train (0-4, null = train all if DECODER_LR != 0)
|
|
@@ -35,14 +35,14 @@ parameters:
|
|
| 35 |
READOUT_TYPE: "ignore" # Readout type for DPT decoder
|
| 36 |
USE_BN: False # Use batch normalization in decoder
|
| 37 |
DROPOUT: 0.1 # Dropout rate in decoder layers
|
| 38 |
-
OUTPUT_IMAGE_SIZE: [
|
| 39 |
OUTPUT_CHANNELS: 1 # Number of output channels (1 for highlight mask)
|
| 40 |
DECODER_LR: 5.0e-4 # Custom learning rate for decoder (0.0 = frozen, 1.0 = same as base LR)
|
| 41 |
NUM_FUSION_BLOCKS_TRAINABLE: null # Number of fusion blocks to train (0-4, null = train all if DECODER_LR != 0)
|
| 42 |
TOKEN_INPAINTER:
|
| 43 |
TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
|
| 44 |
TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
|
| 45 |
-
FROM_PRETRAINED: "weights/token_inpainter.pth" # Path to pretrained token inpainter weights
|
| 46 |
TOKEN_INPAINTER_LR: 1.0e-5 # Learning rate for token inpainter (can differ from base LR)
|
| 47 |
DEPTH: 6 # Number of transformer blocks
|
| 48 |
HEADS: 16 # Number of attention heads
|
|
|
|
| 9 |
MODEL_MODULE: "models" # Module name to import model classes from (default: "models")
|
| 10 |
RGB_ENCODER:
|
| 11 |
ENCODER: "facebook/dinov3-vitl16-pretrain-lvd1689m" # DINOv3 encoder model name (HuggingFace format)
|
| 12 |
+
IMAGE_SIZE: 896 # Input image size (height and width in pixels)
|
| 13 |
RETURN_SELECTED_LAYERS: [3, 6, 9, 12] # Transformer layer indices to extract features from (0-indexed)
|
| 14 |
RGB_ENCODER_LR: 0.0 # Learning rate for RGB encoder (0.0 = frozen, must be explicitly set)
|
| 15 |
DECODERS:
|
|
|
|
| 19 |
REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
|
| 20 |
REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial resampling factors for each stage (>1 upsamples, <1 downsamples)
|
| 21 |
READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
|
| 22 |
+
# FROM_PRETRAINED: "weights/rgb_decoder.pth" # Path to pretrained decoder weights (optional)
|
| 23 |
USE_BN: False # Use batch normalization in decoder
|
| 24 |
DROPOUT: 0.1 # Dropout rate in decoder layers
|
| 25 |
+
OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
|
| 26 |
OUTPUT_CHANNELS: 3 # Number of output channels (3 for RGB diffuse image)
|
| 27 |
DECODER_LR: 1.0e-5 # Custom learning rate for decoder (0.0 = frozen, 1.0 = same as base LR)
|
| 28 |
NUM_FUSION_BLOCKS_TRAINABLE: 1 # Number of fusion blocks to train (0-4, null = train all if DECODER_LR != 0)
|
|
|
|
| 35 |
READOUT_TYPE: "ignore" # Readout type for DPT decoder
|
| 36 |
USE_BN: False # Use batch normalization in decoder
|
| 37 |
DROPOUT: 0.1 # Dropout rate in decoder layers
|
| 38 |
+
OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
|
| 39 |
OUTPUT_CHANNELS: 1 # Number of output channels (1 for highlight mask)
|
| 40 |
DECODER_LR: 5.0e-4 # Custom learning rate for decoder (0.0 = frozen, 1.0 = same as base LR)
|
| 41 |
NUM_FUSION_BLOCKS_TRAINABLE: null # Number of fusion blocks to train (0-4, null = train all if DECODER_LR != 0)
|
| 42 |
TOKEN_INPAINTER:
|
| 43 |
TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
|
| 44 |
TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
|
| 45 |
+
# FROM_PRETRAINED: "weights/token_inpainter.pth" # Path to pretrained token inpainter weights
|
| 46 |
TOKEN_INPAINTER_LR: 1.0e-5 # Learning rate for token inpainter (can differ from base LR)
|
| 47 |
DEPTH: 6 # Number of transformer blocks
|
| 48 |
HEADS: 16 # Number of attention heads
|