{
  "datasets": [
    {
      "name": "alpaca",
      "repo_id": "tatsu-lab/alpaca",
      "format": "alpaca_single_turn",
      "source_split": "train",
      "validation_ratio": 0.02,
      "split_seed": 17,
      "use_base_chat_template": true
    }
  ],
  "model": {
    "base_model_name": "google/t5gemma-l-l-prefixlm-it",
    "dtype": "bfloat16",
    "attn_implementation": "sdpa",
    "magicnorm_eps": 1e-06,
    "z_slots": 256,
    "num_time_tokens": 0,
    "use_explicit_time_features": false,
    "gate_attention_heads": 4,
    "max_observation_tokens": 1024,
    "max_decoder_tokens": 1024,
    "thought_loop_proposal_mode": "observation_hidden_compression",
    "preserve_observation_encoder_manifold": true,
    "observation_encoder_use_state_context": true,
    "latent_attention_mask_mode": "full",
    "initial_update_gate_bias": 6.0
  },
  "training": {
    "seed": 17,
    "num_workers": 4,
    "gradient_checkpointing": true,
    "mixed_precision": "bf16",
    "max_grad_norm": 1.0,
    "weight_decay": 0.01,
    "backbone_learning_rate": 1e-05,
    "new_module_learning_rate": 0.0003,
    "adam_beta1": 0.9,
    "adam_beta2": 0.95,
    "adam_epsilon": 1e-08,
    "fused_adamw": true,
    "freeze_gate_head": true,
    "log_every_steps": 1,
    "eval_every_steps": 100,
    "checkpoint_every_steps": 1000,
    "max_train_examples": null,
    "max_validation_examples": null,
    "latent_bridge_mode": "thought_loop",
    "decoder_training_mode": "teacher_forced",
    "freeze_decoder_first_fraction": 1.0,
    "probe_extra_new_tokens": 16,
    "identity_control_target_slots": 256,
    "identity_control_use_chat_template": false
  },
  "phases": {
    "phase0a": {
      "enabled": false,
      "micro_batch_size": 32,
      "eval_batch_size": 32,
      "gradient_accumulation_steps": 2,
      "num_train_epochs": 1,
      "warmup_ratio": 0.03,
      "shuffle_train": true,
      "instruction_prefix": "Recover the original text from the corrupted text through the latent bottleneck.",
      "corruption": {
        "span_mask_fraction": 0.22,
        "min_span_words": 1,
        "max_span_words": 8,
        "max_mask_spans": 6,
        "word_dropout_prob": 0.12,
        "placeholder_text": "[blank]"
      }
    },
    "phase0b": {
      "enabled": true,
      "micro_batch_size": 32,
      "eval_batch_size": 32,
      "gradient_accumulation_steps": 4,
      "num_train_epochs": 1,
      "warmup_ratio": 0.03,
      "shuffle_train": true,
      "instruction_prefix": "You are a helpful assistant. Reply to the user with the best of your capabilities."
    }
  },
  "cache": {
    "preprocessed_root": "cache/preprocessed_pre_sft"
  },
  "paths": {
    "run_root": "runs_pre_sft",
    "export_root": "exports"
  },
  "inference": {
    "format": "alpaca_single_turn"
  },
  "wandb": {
    "enabled": true,
    "project": "samantha-pre-sft",
    "run_name": "t5gemma2-thoughtloop-pre-sft-alpaca"
  },
  "hub": {
    "model_repo_id": "BRlkl/test",
    "private": false
  }
}