{ "datasets": [ { "name": "chatalpaca_multiturn_enriched", "repo_id": "BRlkl/chatalpaca-multiturn-enriched", "source_split": "train", "format": "messages_all_turns", "validation_ratio": 0.02, "split_seed": 17, "min_turns": 2, "max_turns": 6, "max_message_chars": 6000, "use_base_chat_template": true } ], "model": { "base_model_name": "google/t5gemma-l-l-prefixlm-it", "initial_model_path": null, "dtype": "bfloat16", "attn_implementation": "sdpa", "disable_cudnn_sdp": true, "disable_mha_fastpath": true, "magicnorm_eps": 1e-06, "z_slots": 1024, "num_time_tokens": 0, "use_explicit_time_features": false, "gate_attention_heads": 4, "max_observation_tokens": 1024, "max_decoder_tokens": 1024, "thought_loop_proposal_mode": "observation_hidden_compression", "preserve_observation_encoder_manifold": true, "observation_encoder_use_state_context": true, "latent_attention_mask_mode": "full", "hard_state_replace": true }, "training": { "seed": 17, "num_workers": 2, "gradient_checkpointing": true, "mixed_precision": "bf16", "max_grad_norm": 1.0, "weight_decay": 0.01, "backbone_learning_rate": 5e-06, "new_module_learning_rate": 0.0001, "adam_beta1": 0.9, "adam_beta2": 0.95, "adam_epsilon": 1e-08, "fused_adamw": true, "freeze_gate_head": true, "assistant_feedback_mode": "teacher_forced", "log_every_steps": 1, "eval_every_steps": 90, "checkpoint_every_steps": 500, "eval_max_batches": 16, "validation_behavior_max_batches": 4, "max_train_examples": null, "max_validation_examples": null, "response_loss_weight": 0.5, "current_user_reconstruction_loss_weight": 0.25, "probe_loss_weight": 0.25, "probe_question_text": "What is everything we have talked about so far? \nGive exact conversation transcript verbatim in following format: [User 1]: X [Assistant 1]: Y [User 2]: A etc", "feedback_generation_max_new_tokens": 1024, "feedback_generation_extra_new_tokens": 16, "validation_response_max_new_tokens": 1024, "validation_response_extra_new_tokens": 16, "validation_probe_max_new_tokens": 1024, "validation_probe_extra_new_tokens": 16, "wandb_train_metric_keys": [ "train/loss_total", "train/loss_response", "train/loss_current_user_reconstruction", "train/loss_probe", "train/response_first_token_exact_match", "train/current_user_reconstruction_first_token_exact_match", "train/probe_first_token_exact_match" ], "wandb_validation_metric_keys": [ "validation/loss_total", "validation/loss_response", "validation/loss_current_user_reconstruction", "validation/loss_probe", "validation/goal_loss", "validation/response_similarity", "validation/response_reconstruction_similarity", "validation/probe_transcript_similarity" ], "checkpoint_selection_metric": "validation/goal_loss", "checkpoint_selection_mode": "min", "validation_response_exact_miss_penalty": 1.0, "validation_reconstruction_similarity_miss_penalty": 1.0, "validation_probe_exact_miss_penalty": 1.0, "validation_probe_similarity_miss_penalty": 2.0 }, "phase": { "micro_batch_size": 10, "eval_batch_size": 10, "gradient_accumulation_steps": 4, "num_train_epochs": 2, "warmup_ratio": 0.03, "shuffle_train": true }, "cache": { "preprocessed_root": "cache/preprocessed_pre_sft_multiturn_simple_transcript" }, "paths": { "run_root": "runs_pre_sft_multiturn_simple_transcript", "export_root": "exports_multiturn_simple_transcript" }, "inference": { "format": "predictive_state_multiturn", "use_base_chat_template": true }, "wandb": { "enabled": true, "project": "samantha-pre-sft", "run_name": "t5gemma2-thoughtloop-pre-sft-simple-transcript" }, "hub": { "model_repo_id": "BRlkl/test_multiturn_simple_transcript1024", "private": false } }