{
"datasets": [
{
"name": "alpaca",
"repo_id": "tatsu-lab/alpaca",
"format": "alpaca_single_turn",
"source_split": "train",
"validation_ratio": 0.02,
"split_seed": 17,
"use_base_chat_template": true
}
],
"model": {
"base_model_name": "google/t5gemma-l-l-prefixlm-it",
"dtype": "bfloat16",
"attn_implementation": "sdpa",
"magicnorm_eps": 1e-06,
"z_slots": 256,
"num_time_tokens": 0,
"use_explicit_time_features": false,
"gate_attention_heads": 4,
"max_observation_tokens": 1024,
"max_decoder_tokens": 1024,
"thought_loop_proposal_mode": "observation_hidden_compression",
"preserve_observation_encoder_manifold": true,
"observation_encoder_use_state_context": true,
"latent_attention_mask_mode": "full",
"initial_update_gate_bias": 6.0
},
"training": {
"seed": 17,
"num_workers": 4,
"gradient_checkpointing": true,
"mixed_precision": "bf16",
"max_grad_norm": 1.0,
"weight_decay": 0.01,
"backbone_learning_rate": 1e-05,
"new_module_learning_rate": 0.0003,
"adam_beta1": 0.9,
"adam_beta2": 0.95,
"adam_epsilon": 1e-08,
"fused_adamw": true,
"freeze_gate_head": true,
"log_every_steps": 1,
"eval_every_steps": 100,
"checkpoint_every_steps": 1000,
"max_train_examples": null,
"max_validation_examples": null,
"latent_bridge_mode": "thought_loop",
"decoder_training_mode": "teacher_forced",
"freeze_decoder_first_fraction": 1.0,
"probe_extra_new_tokens": 16,
"identity_control_target_slots": 256,
"identity_control_use_chat_template": false
},
"phases": {
"phase0a": {
"enabled": false,
"micro_batch_size": 32,
"eval_batch_size": 32,
"gradient_accumulation_steps": 2,
"num_train_epochs": 1,
"warmup_ratio": 0.03,
"shuffle_train": true,
"instruction_prefix": "Recover the original text from the corrupted text through the latent bottleneck.",
"corruption": {
"span_mask_fraction": 0.22,
"min_span_words": 1,
"max_span_words": 8,
"max_mask_spans": 6,
"word_dropout_prob": 0.12,
"placeholder_text": "[blank]"
}
},
"phase0b": {
"enabled": true,
"micro_batch_size": 32,
"eval_batch_size": 32,
"gradient_accumulation_steps": 4,
"num_train_epochs": 1,
"warmup_ratio": 0.03,
"shuffle_train": true,
"instruction_prefix": "You are a helpful assistant. Reply to the user with the best of your capabilities."
}
},
"cache": {
"preprocessed_root": "cache/preprocessed_pre_sft"
},
"paths": {
"run_root": "runs_pre_sft",
"export_root": "exports"
},
"inference": {
"format": "alpaca_single_turn"
},
"wandb": {
"enabled": true,
"project": "samantha-pre-sft",
"run_name": "t5gemma2-thoughtloop-pre-sft-alpaca"
},
"hub": {
"model_repo_id": "BRlkl/test",
"private": false
}
}