# Run identity: project/experiment labels, seed value, and output location.
project:
  name: hyper3-clip
  # The experiment string ends in "_s31", which matches `seed: 31` below, and
  # it is also the last path component of `output_dir`. Keep all three in sync
  # when cloning this config for another seed or variant.
  experiment: hyper3_vitb_clip_uncha_hier_beta_argent_mp5_paper_scratch_8x500k_s31
seed: 31
# NOTE(review): absolute, user-specific path -- confirm it exists and is
# writable on the machine that runs this config.
output_dir: /sc/projects/sci-aisc/matin.mahmood/runs/hyper3_vitb_hierbeta_argent_mp5_paper_scratch_500k_v1/hyper3_vitb_clip_uncha_hier_beta_argent_mp5_paper_scratch_8x500k_s31
# Model and loss hyperparameters. How each key is interpreted is decided by the
# consuming training code, which is not visible from this file; the grouping
# comments below are based on shared key prefixes only.
model:
  objective: uncha
  # vision_* keys: image-encoder settings. `vision_pretrained: false` here and
  # `text_pretrained: false` below agree with the "scratch" tag in
  # project.experiment (presumably both encoders start from random
  # initialisation -- confirm).
  vision_backbone: vit_base_patch16_224
  vision_pretrained: false
  vision_global_pool: token
  vision_use_sincos2d_pos: true
  vision_timm_norm_layer: layer_norm
  # text_* keys: text-encoder settings.
  # NOTE(review): text_model_name is a "patch32" CLIP identifier while the
  # vision backbone above is patch16; presumably only the text side of that
  # identifier is used -- confirm.
  text_model_name: openai/clip-vit-base-patch32
  text_pretrained: false
  text_pooling: auto
  # Embedding size and curvature settings. `log_curv` appears in
  # optimizer.no_decay_params (presumably the parameter behind these two
  # curvature keys -- confirm).
  embed_dim: 512
  curv_init: 1.0
  learn_curv: true
  # Entailment weight and aperture scale factors.
  entail_weight: 0.2
  inter_aperture_scale: 0.7
  intra_aperture_scale: 1.2
  # uncha_* keys: presumably options for the `uncha` objective selected above
  # (inferred from the shared prefix -- TODO confirm).
  uncha_piecewise_factor: 0.1
  uncha_calibration_alpha: 10.0
  uncha_stop_grad_calibration: true
  uncha_entailment_geometry: lorentz
  uncha_aggregate_weight: 0.0
  # The entailment loss is set to `hier_beta_argent`; the uncha_argent_* keys
  # that follow presumably configure it -- confirm.
  uncha_entailment_loss: hier_beta_argent
  uncha_argent_beta: 1.0
  uncha_argent_norm_weight: 0.1
  uncha_argent_aux_weight: 0.5
  uncha_argent_aggregation: uncha
  uncha_part_weight_power: 0.0
  # NOTE(review): the contrastive loss is `ce`, so the two uncha_sigmoid_*
  # keys below are presumably unused in this run -- confirm.
  uncha_contrastive_loss: ce
  uncha_sigmoid_bias_init: -10.0
  uncha_sigmoid_negative_weight: 1.0
  # NOTE(review): part-quality mode is `none`, so topk/temperature below are
  # presumably unused in this run -- confirm.
  uncha_part_quality_mode: none
  uncha_part_quality_topk: 5
  uncha_part_quality_temperature: 4.0
  uncha_entailment_warmup_steps: 0
  uncha_global_local_mode: repeat
  # beta_clip_* keys. Both weights below are 0.0 while data.beta_clip.enabled
  # is true.
  # NOTE(review): confirm this combination is intended (the data-side
  # beta_clip inputs may be produced without contributing to the loss).
  beta_clip_global_weight: 0.0
  beta_clip_weight: 0.0
  beta_clip_beta: 0.5
  beta_clip_similarity: dot
  beta_clip_num_heads: 8
  beta_clip_mlp_ratio: 4.0
  beta_clip_drop_cls_token: true
  fuse_beta_query_encoder_forwards: true
  group_beta_query_pooling: true
  beta_clip_variant: ce
  phyclip_product_metric: l1
# Optimisation schedule, logging/checkpoint cadence, and resume settings.
training:
  # 500000 matches the "500k" tag in project.experiment.
  total_steps: 500000
  # NOTE(review): 768 = 8 x 96; the "8x" tag in project.experiment presumably
  # refers to the number of devices -- confirm the per-device batch size.
  global_batch_size: 768
  grad_accum_steps: 1
  # NOTE(review): data.num_workers is also set (to the same value, 8); which
  # of the two the loader reads is decided by the consumer -- confirm.
  num_workers: 8
  lr: 0.0005
  weight_decay: 0.2
  # Two-element list; presumably the optimizer's (beta1, beta2) -- confirm.
  betas:
  - 0.9
  - 0.98
  warmup_steps: 4000
  # The two intervals below are presumably counted in training steps --
  # confirm.
  log_interval: 20
  ckpt_interval: 10000
  amp: true
  max_grad_norm: 1.0
  # Resume is switched on with no explicit checkpoint path (resume_from is
  # null). resume_from_env holds the name RESUME_FROM_CHECKPOINT, presumably
  # an environment variable the trainer reads a checkpoint path from --
  # confirm the lookup order in the training code.
  resume: true
  resume_from: null
  resume_from_env: RESUME_FROM_CHECKPOINT
  find_unused_parameters: true
# Optimizer parameter grouping.
optimizer:
  # Names presumably excluded from training.weight_decay (0.2).
  # NOTE(review): whether entries are matched exactly or as substrings of
  # parameter names is decided by the optimizer setup code -- confirm, since
  # e.g. "logit_scale" is a substring of the three *_logit_scale entries.
  no_decay_params:
  - logit_scale
  - global_logit_scale
  - local_logit_scale
  - global_local_logit_scale
  - visual_alpha
  - textual_alpha
  - log_curv
  - global_logit_bias
  - local_logit_bias
  - global_local_logit_bias
# Dataset source, preprocessing, and loader settings.
data:
  type: processed_grit
  part_sampling: all
  # 5 lines up with the "mp5" tag in project.experiment (presumably
  # "max parts 5" -- confirm).
  max_parts: 5
  train_transform: tight_crop_color_jitter_gray
  # Each entry is a glob pattern. Because the value starts with "/", it is a
  # plain YAML string and the "*" is passed through literally for the consumer
  # to expand; a pattern that *starts* with "*" would need quoting.
  tarfiles:
  - /sc/projects/sci-aisc/matin.mahmood/datasets/hycoclip/train/GRIT/processed/*.tar
  shuffle_buffer: 4000
  # 224 matches the "_224" suffix of model.vision_backbone.
  image_size: 224
  max_text_length: 77
  # NOTE(review): training.num_workers is also set (to the same value, 8);
  # confirm which key the loader actually reads.
  num_workers: 8
  image_normalization: imagenet
  # beta_clip data options. This section is enabled although
  # model.beta_clip_weight and model.beta_clip_global_weight are both 0.0.
  # NOTE(review): confirm that combination is intended.
  beta_clip:
    enabled: true
    max_sentences: 5
    max_phrases: 30
    max_queries_per_image: 6
    use_part_texts: true