{
  "datasets": [
    {
      "name": "alpaca",
      "repo_id": "tatsu-lab/alpaca",
      "format": "alpaca_single_turn",
      "source_split": "train",
      "validation_ratio": 0.02,
      "split_seed": 17,
      "use_base_chat_template": true
    }
  ],
  "model": {
    "base_model_name": "google/t5gemma-l-l-prefixlm-it",
    "dtype": "bfloat16",
    "attn_implementation": "sdpa",
    "magicnorm_eps": 1e-06,
    "z_slots": 256,
    "num_time_tokens": 0,
    "use_explicit_time_features": false,
    "gate_attention_heads": 4,
    "max_observation_tokens": 1024,
    "max_decoder_tokens": 1024,
    "thought_loop_proposal_mode": "observation_hidden_compression",
    "preserve_observation_encoder_manifold": true,
    "observation_encoder_use_state_context": true,
    "latent_attention_mask_mode": "full",
    "initial_update_gate_bias": 6.0
  },
  "training": {
    "seed": 17,
    "num_workers": 4,
    "gradient_checkpointing": true,
    "mixed_precision": "bf16",
    "max_grad_norm": 1.0,
    "weight_decay": 0.01,
    "backbone_learning_rate": 1e-05,
    "new_module_learning_rate": 0.0003,
    "adam_beta1": 0.9,
    "adam_beta2": 0.95,
    "adam_epsilon": 1e-08,
    "fused_adamw": true,
    "freeze_gate_head": true,
    "log_every_steps": 1,
    "eval_every_steps": 100,
    "checkpoint_every_steps": 1000,
    "max_train_examples": null,
    "max_validation_examples": null,
    "latent_bridge_mode": "thought_loop",
    "decoder_training_mode": "teacher_forced",
    "freeze_decoder_first_fraction": 1.0,
    "probe_extra_new_tokens": 16,
    "identity_control_target_slots": 256,
    "identity_control_use_chat_template": false
  },
  "phases": {
    "phase0a": {
      "enabled": false,
      "micro_batch_size": 32,
      "eval_batch_size": 32,
      "gradient_accumulation_steps": 2,
      "num_train_epochs": 1,
      "warmup_ratio": 0.03,
      "shuffle_train": true,
      "instruction_prefix": "Recover the original text from the corrupted text through the latent bottleneck.",
      "corruption": {
        "span_mask_fraction": 0.22,
        "min_span_words": 1,
        "max_span_words": 8,
        "max_mask_spans": 6,
        "word_dropout_prob": 0.12,
        "placeholder_text": "[blank]"
      }
    },
    "phase0b": {
      "enabled": true,
      "micro_batch_size": 32,
      "eval_batch_size": 32,
      "gradient_accumulation_steps": 4,
      "num_train_epochs": 1,
      "warmup_ratio": 0.03,
      "shuffle_train": true,
      "instruction_prefix": "You are a helpful assistant. Reply to the user with the best of your capabilities."
    }
  },
  "cache": {
    "preprocessed_root": "cache/preprocessed_pre_sft"
  },
  "paths": {
    "run_root": "runs_pre_sft",
    "export_root": "exports"
  },
  "inference": {
    "format": "alpaca_single_turn"
  },
  "wandb": {
    "enabled": true,
    "project": "samantha-pre-sft",
    "run_name": "t5gemma2-thoughtloop-pre-sft-alpaca"
  },
  "hub": {
    "model_repo_id": "BRlkl/test",
    "private": false
  }
}