AlbeRota committed on
Commit
2baadcd
·
verified ·
1 Parent(s): 0475949

896 half-pretrained!

Browse files
Files changed (1) hide show
  1. configs/pretrained_config.yaml +7 -11
configs/pretrained_config.yaml CHANGED
@@ -10,25 +10,23 @@ parameters:
10
  RGB_ENCODER:
11
  ENCODER: "facebook/dinov3-vitl16-pretrain-lvd1689m" # DINOv3 encoder model name (HuggingFace format)
12
  IMAGE_SIZE: 896 # Input image size (height and width in pixels)
13
- RETURN_SELECTED_LAYERS: [3, 6, 9, 12] # Transformer layer indices to extract features from (0-indexed)
14
  RGB_ENCODER_LR: 0.0 # Learning rate for RGB encoder (0.0 = frozen, must be explicitly set)
15
  DECODERS:
16
  diffuse:
17
- USE_FILM: False # Enable FiLM (Feature-wise Linear Modulation) conditioning in decoder
18
  FEATURE_DIM: 1024 # Feature dimension for decoder (should match encoder output)
19
  REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
20
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
21
  READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
22
- # FROM_PRETRAINED: "weights/rgb_decoder.pth" # Path to pretrained decoder weights (optional)
23
  USE_BN: False # Use batch normalization in decoder
24
  DROPOUT: 0.1 # Dropout rate in decoder layers
25
  OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
26
  OUTPUT_CHANNELS: 3 # Number of output channels (3 for RGB diffuse image)
27
- DECODER_LR: 1.0e-5 # Custom learning rate for decoder (0.0 = frozen, 1.0 = same as base LR)
28
  NUM_FUSION_BLOCKS_TRAINABLE: 1 # Number of fusion blocks to train (0-4, null = train all if DECODER_LR != 0)
29
  TRAIN_RGB_HEAD: True # Whether to train RGB head (true/false, null = train if DECODER_LR != 0)
30
  highlight:
31
- USE_FILM: False # Enable FiLM conditioning in highlight decoder
32
  FEATURE_DIM: 1024 # Feature dimension for highlight decoder
33
  REASSEMBLE_OUT_CHANNELS: [96,192,384,768] # Output channels for each decoder stage
34
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
@@ -42,11 +40,11 @@ parameters:
42
  TOKEN_INPAINTER:
43
  TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
44
  TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
45
- # FROM_PRETRAINED: "weights/token_inpainter.pth" # Path to pretrained token inpainter weights
46
- TOKEN_INPAINTER_LR: 1.0e-5 # Learning rate for token inpainter (can differ from base LR)
47
  DEPTH: 6 # Number of transformer blocks
48
  HEADS: 16 # Number of attention heads
49
- DROP: 0 # Dropout rate
50
  USE_POSITIONAL_ENCODING: True # Enable 2D sinusoidal positional encodings
51
  USE_FINAL_NORM: True # Enable final LayerNorm before output projection
52
  USE_LOCAL_PRIOR: True # Blend local mean prior for masked seeds
@@ -54,6 +52,4 @@ parameters:
54
  LOCAL_PRIOR_KERNEL: 5 # Kernel size for local prior blending (> 1)
55
  SEED_NOISE_STD: 0.02 # Standard deviation of noise added to masked seeds during training
56
  INPAINT_MASK_DILATION:
57
- value: 1 # Dilation kernel size (pixels) for inpaint mask - Must be odd
58
- USE_TORCH_COMPILE: # Enable PyTorch 2.0 torch.compile for faster training (experimental)
59
- value: False
 
10
  RGB_ENCODER:
11
  ENCODER: "facebook/dinov3-vitl16-pretrain-lvd1689m" # DINOv3 encoder model name (HuggingFace format)
12
  IMAGE_SIZE: 896 # Input image size (height and width in pixels)
13
+ RETURN_SELECTED_LAYERS: [3, 6, 9, 12] # Transformer layer indices to extract features from (0-indexed)
14
  RGB_ENCODER_LR: 0.0 # Learning rate for RGB encoder (0.0 = frozen, must be explicitly set)
15
  DECODERS:
16
  diffuse:
 
17
  FEATURE_DIM: 1024 # Feature dimension for decoder (should match encoder output)
18
  REASSEMBLE_OUT_CHANNELS: [768,1024,1536,2048] # Output channels for each decoder stage (DPT-style reassembly)
19
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
20
  READOUT_TYPE: "ignore" # Readout type for DPT decoder ("ignore", "project", etc.)
21
+ FROM_PRETRAINED: "weights/decoder_896.pth" # Path to pretrained decoder weights (optional)
22
  USE_BN: False # Use batch normalization in decoder
23
  DROPOUT: 0.1 # Dropout rate in decoder layers
24
  OUTPUT_IMAGE_SIZE: [896,896] # Output image resolution [height, width]
25
  OUTPUT_CHANNELS: 3 # Number of output channels (3 for RGB diffuse image)
26
+ DECODER_LR: 0.0 # Custom learning rate for decoder (0.0 = frozen, 1.0 = same as base LR)
27
  NUM_FUSION_BLOCKS_TRAINABLE: 1 # Number of fusion blocks to train (0-4, null = train all if DECODER_LR != 0)
28
  TRAIN_RGB_HEAD: True # Whether to train RGB head (true/false, null = train if DECODER_LR != 0)
29
  highlight:
 
30
  FEATURE_DIM: 1024 # Feature dimension for highlight decoder
31
  REASSEMBLE_OUT_CHANNELS: [96,192,384,768] # Output channels for each decoder stage
32
  REASSEMBLE_FACTORS: [4.0, 2.0, 1.0, 0.5] # Spatial upsampling factors for each stage
 
40
  TOKEN_INPAINTER:
41
  TOKEN_INPAINTER_CLASS: "TokenInpainter_Prior" # Token inpainter class name
42
  TOKEN_INPAINTER_MODULE: "token_inpainters" # Module name to import token inpainter from
43
+ # FROM_PRETRAINED: "weights/token_inpainter.pth" # Path to pretrained token inpainter weights (optional)
44
+ TOKEN_INPAINTER_LR: 1.0e-4 # Learning rate for token inpainter (can differ from base LR)
45
  DEPTH: 6 # Number of transformer blocks
46
  HEADS: 16 # Number of attention heads
47
+ DROP: 0.05 # Dropout rate
48
  USE_POSITIONAL_ENCODING: True # Enable 2D sinusoidal positional encodings
49
  USE_FINAL_NORM: True # Enable final LayerNorm before output projection
50
  USE_LOCAL_PRIOR: True # Blend local mean prior for masked seeds
 
52
  LOCAL_PRIOR_KERNEL: 5 # Kernel size for local prior blending (> 1)
53
  SEED_NOISE_STD: 0.02 # Standard deviation of noise added to masked seeds during training
54
  INPAINT_MASK_DILATION:
55
+ value: 3 # Dilation kernel size (pixels) for inpaint mask - Must be odd