# Experiment identifier and dataset locations.
name: fax # only used for demonstration data api
# NOTE(review): presumably the training split, judging by the path — confirm against the data loader.
root_dir: '/data/s2/semantic-opv2v/train'
# NOTE(review): the key says "validate" but the path points at a `test` directory — confirm this is intended.
validate_dir: '/data/s2/semantic-opv2v/test'


# Training-loop settings.
# The anchors defined here (&batch_size, &epoches, &max_cav) allow other
# sections of this file to alias the same values instead of repeating them.
train_params:
  batch_size: &batch_size 1
  # aliased by lr_scheduler.epoches
  epoches: &epoches 71
  # NOTE(review): eval_freq / save_freq are presumably counted in epochs — confirm against the trainer.
  eval_freq: 5
  save_freq: 5
  # aliased by model.args.max_cav and model.args.fax_fusion.agent_size
  max_cav: &max_cav 5
  visible: true


# Fusion dataset selection.
# Listed as supported: LateFusionDataset, EarlyFusionDataset, IntermediateFusionDataset.
fusion:
  core_method: CamIntermediateFusionDataset
  args: []


# No augmentation entries are configured.
data_augment: []
# Additional file-name suffixes, one per entry.
add_data_extension:
  - 'bev_dynamic.png'
  - 'bev_static.png'
  - 'bev_lane.png'
  - 'bev_visibility.png'
  - 'bev_visibility_corp.png'

# preprocess-related
preprocess:
  # options listed: BasePreprocessor, VoxelPreprocessor, BevPreprocessor
  # NOTE(review): the selected RgbPreprocessor is not in that list — the list may be stale.
  core_method: 'RgbPreprocessor'
  args:
    bgr2rgb: true
    # Resize dimensions; anchored so that model.args.encoder and
    # model.args.fax.cross_view can alias the same width/height.
    resize_x: &image_width 512
    resize_y: &image_height 512
    # Three-element normalization constants.
    # NOTE(review): presumably per-channel in RGB order given bgr2rgb — confirm.
    mean: [0.485, 0.456, 0.406]
    std: [0.229, 0.224, 0.225]
  # object evaluation range; aliased by postprocess.anchor_args.cav_lidar_range
  # NOTE(review): presumably [x_min, y_min, z_min, x_max, y_max, z_max] in meters — confirm.
  cav_lidar_range: &cav_lidar [-50, -50, -3, 50, 50, 1]


# anchor box related
postprocess:
  # VoxelPostprocessor, BevPostprocessor supported
  core_method: CameraBevPostprocessor
  anchor_args:
    cav_lidar_range: *cav_lidar
  # hwl or lwh
  order: hwl
  # Maximum number of objects in a single frame. Use this number to make sure
  # different frames have the same dimension in the same batch.
  max_num: 100
  nms_thresh: 0.15

# Model definition. Booleans are written canonically (lowercase true/false)
# to match the rest of this file.
model:
  core_method: corpbevt
  args:
    # one of: dynamic, static or both; aliased by loss.args.target
    target: &target 'dynamic'
    max_cav: *max_cav
    encoder:
      num_layers: 34
      pretrained: true
      image_width: *image_width
      image_height: *image_height
      id_pick: [1, 2, 3]

    # compression rate (alternatives left in the original comment: 0.2, 2, 0, 64)
    compression: 8

    decoder:
      input_dim: 128
      num_layer: 3
      num_ch_dec: &decoder_block [32, 64, 128]

    fax:
      # b, d, h w from resnet -> b 256 h w
      # NOTE(review): the note above mentions 256 while dim is 128 — confirm which is current.
      dim: [128, 128, 128]
      middle: [2, 2, 2] # middle conv
      bev_embedding:
        sigma: 1.0
        bev_height: 256
        bev_width: 256
        h_meters: 100
        w_meters: 100
        offset: 0.0
        upsample_scales: [2, 4, 8]

      # cross-view attention
      cross_view:
        image_height: *image_height
        image_width: *image_width
        no_image_features: false
        skip: true
        heads: [4, 4, 4]
        dim_head: [32, 32, 32]
        qkv_bias: true

      cross_view_swap:
        rel_pos_emb: false
        q_win_size: [[16, 16], [16, 16], [32, 32]]
        feat_win_size: [[8, 8], [8, 8], [16, 16]]
        bev_embedding_flag: [true, false, false]

      self_attn:
        dim_head: 32
        dropout: 0.1
        window_size: 32

    sttf: &sttf
      # m/pixel; equals h_meters / bev_height above (100 / 256)
      resolution: 0.390625
      downsample_rate: 8
      use_roi_mask: true

    fax_fusion:
      input_dim: 128
      mlp_dim: 256
      agent_size: *max_cav
      window_size: 8
      dim_head: 32
      drop_out: 0.1
      depth: 3
      mask: true

    seg_head_dim: 32
    output_class: 2

# Loss configuration.
loss:
  core_method: vanilla_seg_loss
  args:
    # same value as model.args.target (YAML alias)
    target: *target
    # NOTE(review): the d_* / s_* prefixes presumably refer to the dynamic and
    # static targets respectively — confirm against the loss implementation.
    d_weights: 75.0
    s_weights: 15.0
    d_coe: 2.0
    s_coe: 0.0

# Optimizer settings.
# Exponent-notation floats are written with an explicit decimal point: YAML 1.1
# loaders such as PyYAML's default resolver read `2e-4` as the string '2e-4',
# whereas `2.0e-4` is resolved as a float by both YAML 1.1 and 1.2 loaders.
optimizer:
  core_method: AdamW
  lr: 2.0e-4
  args:
    eps: 1.0e-10
    weight_decay: 1.0e-2

# Learning-rate schedule.
# Exponent-notation floats carry an explicit decimal point so that YAML 1.1
# loaders (e.g. PyYAML's default resolver) resolve them as floats, not strings.
lr_scheduler:
  # step, multistep, Exponential and cosineannealwarm are supported
  core_method: cosineannealwarm
  # same value as train_params.epoches (YAML alias)
  epoches: *epoches
  warmup_lr: 2.0e-5
  warmup_epoches: 10
  lr_min: 5.0e-6