{
  "datasets": [
    {
      "name": "alpaca",
      "repo_id": "tatsu-lab/alpaca",
      "format": "alpaca_single_turn",
      "source_split": "train",
      "validation_ratio": 0.02,
      "split_seed": 17,
      "use_base_chat_template": true
    }
  ],
  "model": {
    "base_model_name": "google/t5gemma-l-l-prefixlm-it",
    "dtype": "bfloat16",
    "attn_implementation": "sdpa",
    "magicnorm_eps": 1e-06,
    "z_slots": 256,
    "num_time_tokens": 0,
    "use_explicit_time_features": false,
    "gate_attention_heads": 4,
    "max_observation_tokens": 1024,
    "max_decoder_tokens": 1024,
    "thought_loop_proposal_mode": "observation_hidden_compression",
    "preserve_observation_encoder_manifold": true,
    "observation_encoder_use_state_context": true,
    "latent_attention_mask_mode": "full",
    "initial_update_gate_bias": 6.0
  },
  "training": {
    "seed": 17,
    "num_workers": 4,
    "gradient_checkpointing": true,
    "mixed_precision": "bf16",
    "max_grad_norm": 1.0,
    "weight_decay": 0.01,
    "backbone_learning_rate": 1e-05,
    "new_module_learning_rate": 0.0003,
    "adam_beta1": 0.9,
    "adam_beta2": 0.95,
    "adam_epsilon": 1e-08,
    "fused_adamw": true,
    "freeze_gate_head": true,
    "log_every_steps": 1,
    "eval_every_steps": 100,
    "checkpoint_every_steps": 1000,
    "max_train_examples": null,
    "max_validation_examples": null,
    "latent_bridge_mode": "thought_loop",
    "decoder_training_mode": "teacher_forced",
    "freeze_decoder_first_fraction": 1.0,
    "probe_extra_new_tokens": 16,
    "identity_control_target_slots": 256,
    "identity_control_use_chat_template": false
  },
  "phases": {
    "phase0a": {
      "enabled": false,
      "micro_batch_size": 32,
      "eval_batch_size": 32,
      "gradient_accumulation_steps": 2,
      "num_train_epochs": 1,
      "warmup_ratio": 0.03,
      "shuffle_train": true,
      "instruction_prefix": "Recover the original text from the corrupted text through the latent bottleneck.",
      "corruption": {
        "span_mask_fraction": 0.22,
        "min_span_words": 1,
        "max_span_words": 8,
        "max_mask_spans": 6,
        "word_dropout_prob": 0.12,
        "placeholder_text": "[blank]"
      }
    },
    "phase0b": {
      "enabled": true,
      "micro_batch_size": 32,
      "eval_batch_size": 32,
      "gradient_accumulation_steps": 4,
      "num_train_epochs": 1,
      "warmup_ratio": 0.03,
      "shuffle_train": true,
      "instruction_prefix": "You are a helpful assistant. Reply to the user to the best of your abilities."
    }
  },
  "cache": {
    "preprocessed_root": "cache/preprocessed_pre_sft"
  },
  "paths": {
    "run_root": "runs_pre_sft",
    "export_root": "exports"
  },
  "inference": {
    "format": "alpaca_single_turn"
  },
  "wandb": {
    "enabled": true,
    "project": "samantha-pre-sft",
    "run_name": "t5gemma2-thoughtloop-pre-sft-alpaca"
  },
  "hub": {
    "model_repo_id": "BRlkl/test",
    "private": false
  }
}