BRlkl's picture
Upload folder using huggingface_hub
c5524c2 verified
Raw
History Blame Contribute Delete
4.1 kB
{
"datasets": [
{
"name": "chatalpaca_multiturn_enriched",
"repo_id": "BRlkl/chatalpaca-multiturn-enriched",
"source_split": "train",
"format": "messages_all_turns",
"validation_ratio": 0.02,
"split_seed": 17,
"min_turns": 2,
"max_turns": 6,
"max_message_chars": 6000,
"use_base_chat_template": true
}
],
"model": {
"base_model_name": "google/t5gemma-l-l-prefixlm-it",
"initial_model_path": "BRlkl/test",
"dtype": "bfloat16",
"attn_implementation": "sdpa",
"disable_cudnn_sdp": true,
"disable_mha_fastpath": true,
"magicnorm_eps": 1e-06,
"z_slots": 256,
"num_time_tokens": 0,
"use_explicit_time_features": false,
"gate_attention_heads": 4,
"max_observation_tokens": 1024,
"max_decoder_tokens": 1024,
"thought_loop_proposal_mode": "observation_hidden_compression",
"preserve_observation_encoder_manifold": true,
"observation_encoder_use_state_context": true,
"latent_attention_mask_mode": "full",
"hard_state_replace": true
},
"training": {
"seed": 17,
"num_workers": 2,
"gradient_checkpointing": true,
"mixed_precision": "bf16",
"max_grad_norm": 1.0,
"weight_decay": 0.01,
"backbone_learning_rate": 5e-06,
"new_module_learning_rate": 0.0001,
"adam_beta1": 0.9,
"adam_beta2": 0.95,
"adam_epsilon": 1e-08,
"fused_adamw": true,
"freeze_gate_head": true,
"assistant_feedback_mode": "teacher_forced",
"log_every_steps": 1,
"eval_every_steps": 90,
"checkpoint_every_steps": 500,
"eval_max_batches": 16,
"validation_behavior_max_batches": 4,
"max_train_examples": null,
"max_validation_examples": null,
"response_loss_weight": 0.5,
"current_user_reconstruction_loss_weight": 0.25,
"probe_loss_weight": 0.25,
"probe_question_text": "What is everything we have talked about so far? Give exact conversation transcript verbatim in following format: [User 1]: X [Assistant 1]: Y [User 2]: A etc",
"feedback_generation_max_new_tokens": 1024,
"feedback_generation_extra_new_tokens": 16,
"validation_response_max_new_tokens": 1024,
"validation_response_extra_new_tokens": 16,
"validation_probe_max_new_tokens": 1024,
"validation_probe_extra_new_tokens": 16,
"wandb_train_metric_keys": [
"train/loss_total",
"train/loss_response",
"train/loss_current_user_reconstruction",
"train/loss_probe",
"train/response_first_token_exact_match",
"train/current_user_reconstruction_first_token_exact_match",
"train/probe_first_token_exact_match"
],
"wandb_validation_metric_keys": [
"validation/loss_total",
"validation/loss_response",
"validation/loss_current_user_reconstruction",
"validation/loss_probe",
"validation/goal_loss",
"validation/response_similarity",
"validation/response_reconstruction_similarity",
"validation/probe_transcript_similarity"
],
"checkpoint_selection_metric": "validation/goal_loss",
"checkpoint_selection_mode": "min",
"validation_response_exact_miss_penalty": 1.0,
"validation_reconstruction_similarity_miss_penalty": 1.0,
"validation_probe_exact_miss_penalty": 1.0,
"validation_probe_similarity_miss_penalty": 2.0
},
"phase": {
"micro_batch_size": 10,
"eval_batch_size": 10,
"gradient_accumulation_steps": 4,
"num_train_epochs": 2,
"warmup_ratio": 0.03,
"shuffle_train": true
},
"cache": {
"preprocessed_root": "cache/preprocessed_pre_sft_multiturn_simple_transcript"
},
"paths": {
"run_root": "runs_pre_sft_multiturn_simple_transcript",
"export_root": "exports_multiturn_simple_transcript"
},
"inference": {
"format": "predictive_state_multiturn",
"use_base_chat_template": true
},
"wandb": {
"enabled": true,
"project": "samantha-pre-sft",
"run_name": "t5gemma2-thoughtloop-pre-sft-simple-transcript"
},
"hub": {
"model_repo_id": "BRlkl/test_multiturn_simple_transcript256",
"private": false
}
}