"""Contains params for backbone. For licensing see accompanying LICENSE file. Copyright (C) 2025 Apple Inc. All Rights Reserved. """ import dataclasses from typing import Literal import sharp.utils.math as math_utils from sharp.models.blocks import NormLayerName, UpsamplingMode from sharp.models.presets import ViTPreset from sharp.utils.color_space import ColorSpace DimsDecoder = tuple[int, int, int, int, int] DPTImageEncoderType = Literal["skip_conv", "skip_conv_kernel2"] ColorInitOption = Literal[ "none", # Initialize as gray. "first_layer", # Initialize the first layer with input image, other layers with gray. "all_layers", # Initialize all layers with input image. ] DepthInitOption = Literal[ # Initialize the layer of gaussian on surface using min pooling of input depth. "surface_min", # Initialize the layer of gaussian on surface using max pooling of input depth "surface_max", # Initialize the layer of gaussian on plane using base_depth depth. "base_depth", # Initialize the layer of gaussian on plane based on base_depth and index of layer. "linear_disparity", ] @dataclasses.dataclass class AlignmentParams: """Parameters for depth alignment.""" kernel_size: int = 16 stride: int = 1 frozen: bool = False # The following parameters are only used for LearnedAlignment. # Number of steps in the UNet for LearnedAlignment. steps: int = 4 # Activation type for LearnedAlignment. activation_type: math_utils.ActivationType = "exp" # Whether to use depth decoder features for LearnedAlignment. depth_decoder_features: bool = False # Base width of the UNet for LearnedAlignment. base_width: int = 16 @dataclasses.dataclass class DeltaFactor: """Factors to multiply deltas with before activation. These factors effectively selectively reduce the learning rate. """ xy: float = 0.001 z: float = 0.001 color: float = 0.1 # We recommend 0.1 for linearRGB and 1.0 for sRGB. opacity: float = 1.0 scale: float = 1.0 quaternion: float = 1.0 @dataclasses.dataclass class InitializerParams: """Parameters for initializer.""" # Common parameters. # Multiply scale of Gaussians by this factor. scale_factor: float = 1.0 # Factor to convert inverse depth to disparity. disparity_factor: float = 1.0 # Stride of the initializer. stride: int = 2 # Parameters that only affect MultiLayerInitializer. # How many layers of Gaussians to predict (only available for MultiLayerInitializer). num_layers: int = 2 # Which option to use for depth initialization. first_layer_depth_option: DepthInitOption = "surface_min" rest_layer_depth_option: DepthInitOption = "surface_min" # Which option to use for color initialization. color_option: ColorInitOption = "all_layers" # Which depth value to use for depth layers. base_depth: float = 10.0 # Deactivate gradient for feature inputs. feature_input_stop_grad: bool = False # Whether to normalize depth to [DepthTransformParam.depth_min, # DepthTransformParam.depth_max). normalize_depth: bool = True # Output only the inpainted layer. In this case, num_layers = 1. output_inpainted_layer_only: bool = False # Whether to set the uninpainted region to zero opacities. set_uninpainted_opacity_to_zero: bool = False # Whether to concatenate the inpainting mask to the feature input. concat_inpainting_mask: bool = False @dataclasses.dataclass class MonodepthParams: """Parameters for monodepth network.""" patch_encoder_preset: ViTPreset = "dinov2l16_384" image_encoder_preset: ViTPreset = "dinov2l16_384" checkpoint_uri: str | None = None unfreeze_patch_encoder: bool = False unfreeze_image_encoder: bool = False unfreeze_decoder: bool = False unfreeze_head: bool = False unfreeze_norm_layers: bool = False grad_checkpointing: bool = False use_patch_overlap: bool = True dims_decoder: DimsDecoder = (256, 256, 256, 256, 256) @dataclasses.dataclass class MonodepthAdaptorParams: """Parameters for monodepth network feature adaptor.""" encoder_features: bool = True decoder_features: bool = False @dataclasses.dataclass class GaussianDecoderParams: """Parameters for backbone with default values.""" dim_in: int = 5 dim_out: int = 32 # Which normalization to use in backbone. norm_type: NormLayerName = "group_norm" # How many groups to use for group normalization. norm_num_groups: int = 8 # Stride of backbone. stride: int = 2 patch_encoder_preset: ViTPreset = "dinov2l16_384" image_encoder_preset: ViTPreset = "dinov2l16_384" # Dimensionality of feature maps for DPT decoder. dims_decoder: DimsDecoder = (128, 128, 128, 128, 128) # Whether to use depth as input. use_depth_input: bool = True # Whether to enable gradient checkpointing for the backbone grad_checkpointing: bool = False # What mode to use for upsampling in decoder. upsampling_mode: UpsamplingMode = "transposed_conv" # The type of image encoder. image_encoder_type: DPTImageEncoderType = "skip_conv_kernel2" @dataclasses.dataclass class PredictorParams: """Parameters for predictors with default values.""" # Parameters for submodules. initializer: InitializerParams = dataclasses.field(default_factory=InitializerParams) monodepth: MonodepthParams = dataclasses.field(default_factory=MonodepthParams) monodepth_adaptor: MonodepthAdaptorParams = dataclasses.field( default_factory=MonodepthAdaptorParams ) gaussian_decoder: GaussianDecoderParams = dataclasses.field( default_factory=GaussianDecoderParams ) # How to align depth map (only relevant for RGBGaussianPredictor). depth_alignment: AlignmentParams = dataclasses.field(default_factory=AlignmentParams) # Selectively reduce learning rate for different properties. delta_factor: DeltaFactor = dataclasses.field(default_factory=DeltaFactor) # The maximum scale of Gaussians relative to initial scale. max_scale: float = 10.0 # The minimum scale of Gaussians relative to initial scale. min_scale: float = 0.0 # Which normalization to use in prediction head. norm_type: NormLayerName = "group_norm" # How many groups to use for group normalization. norm_num_groups: int = 8 # Whether to use predicted mean to sample triplane features. use_predicted_mean: bool = False # Which activation function to use for colors / opacities. color_activation_type: math_utils.ActivationType = "sigmoid" opacity_activation_type: math_utils.ActivationType = "sigmoid" # Colorspace of the renderer ("linearRGB" or "sRGB"). color_space: ColorSpace = "linearRGB" # A small value to avoid ill-conditioned splats low_pass_filter_eps: float = 1e-2 # How many layer of depth does monodepth model predict. num_monodepth_layers: int = 2 # Whether to sort the monodepth output (for two layer monodepth). sorting_monodepth: bool = False # Whether to account the z offsets for estimating base scale. base_scale_on_predicted_mean: bool = True