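# NOTE(review): the lines below appear to be web-page residue from a
# Hugging Face file viewer, not part of the configuration. They are kept
# as comments so the file parses as YAML.
#   BRlkl's picture
#   Upload folder using huggingface_hub
#   48b5767 verified
#   Raw
#   History Blame Contribute Delete
#   3.49 kB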
# Dataset sources: a list with one mapping per dataset.
# NOTE(review): indentation was lost in the source; the nesting below is
# reconstructed so every key belongs to the list entry. Left flat, `format`
# and `use_base_chat_template` would sit at the top level and collide with
# the same-named keys of the `inference` section (duplicate keys, last one
# wins silently). Confirm against the loader's schema.
datasets:
  - name: chatalpaca_multiturn_enriched
    repo_id: BRlkl/chatalpaca-multiturn-enriched
    source_split: train
    format: messages_all_turns
    validation_ratio: 0.02
    # Same value as `seed` in the `training` section.
    split_seed: 17
    min_turns: 2
    max_turns: 6
    max_message_chars: 6000
    use_base_chat_template: true
# Model settings: which checkpoints to start from, numeric precision,
# attention backend switches, and the sizes/modes of the custom modules.
# NOTE(review): nesting reconstructed from a source with no indentation;
# the exact meaning of each option is defined by the training code, not
# by this file.
model:
  base_model_name: google/t5gemma-l-l-prefixlm-it
  initial_model_path: BRlkl/test_1024
  dtype: bfloat16

  # Attention backend selection and related switches.
  attn_implementation: sdpa
  disable_cudnn_sdp: true
  disable_mha_fastpath: true

  magicnorm_eps: 1.0e-6

  # Size settings; z_slots, max_observation_tokens and max_decoder_tokens
  # are all set to 1024 here.
  z_slots: 1024
  num_time_tokens: 0
  use_explicit_time_features: false
  gate_attention_heads: 4
  max_observation_tokens: 1024
  max_decoder_tokens: 1024

  # Mode selectors and feature flags.
  thought_loop_proposal_mode: observation_hidden_compression
  preserve_observation_encoder_manifold: true
  observation_encoder_use_state_context: true
  latent_attention_mask_mode: full
  hard_state_replace: true
# Training-run settings: optimizer hyperparameters, logging/eval/checkpoint
# cadence, loss weights, generation limits, logged metric names, and
# checkpoint selection.
# NOTE(review): nesting reconstructed from a source with no indentation.
training:
  seed: 17
  num_workers: 2
  gradient_checkpointing: true
  mixed_precision: bf16

  # Optimizer. Two learning rates are configured: one for the backbone and
  # a larger one for newly added modules.
  max_grad_norm: 1.0
  weight_decay: 0.01
  backbone_learning_rate: 5.0e-6
  new_module_learning_rate: 1.0e-4
  adam_beta1: 0.9
  adam_beta2: 0.95
  adam_epsilon: 1.0e-8
  fused_adamw: true

  freeze_gate_head: true
  assistant_feedback_mode: teacher_forced

  # Cadence, expressed in steps / batches.
  log_every_steps: 1
  eval_every_steps: 100
  checkpoint_every_steps: 500
  eval_max_batches: 16
  validation_behavior_max_batches: 4

  # Explicit null instead of a bare empty value (same parsed value).
  # presumably null means "no cap" -- verify against the trainer.
  max_train_examples: null
  max_validation_examples: null

  # Loss weights; the three values sum to 0.99.
  response_loss_weight: 0.33
  current_user_reconstruction_loss_weight: 0.33
  probe_loss_weight: 0.33

  # Quoted because the text contains ": " and "[".
  probe_question_text: "What is everything we have talked about so far? Give exact conversation transcript verbatim in following format: [User 1]: X [Assistant 1]: Y [User 2]: A etc"

  # Generation limits: a max_new_tokens value plus an extra_new_tokens
  # value for each of feedback, validation response and validation probe.
  feedback_generation_max_new_tokens: 1024
  feedback_generation_extra_new_tokens: 16
  validation_response_max_new_tokens: 1024
  validation_response_extra_new_tokens: 16
  validation_probe_max_new_tokens: 1024
  validation_probe_extra_new_tokens: 16

  # Names of the training metrics to report.
  # The `current_user_reconstruction` entry previously read
  # `current-user_reconstruction` (hyphen); it now uses the underscore
  # spelling shared by the loss key and the loss-weight key above.
  # NOTE(review): confirm this matches the key the trainer emits.
  wandb_train_metric_keys:
    - train/loss_total
    - train/loss_response
    - train/loss_current_user_reconstruction
    - train/loss_probe
    - train/response_first_token_exact_match
    - train/current_user_reconstruction_first_token_exact_match
    - train/probe_first_token_exact_match

  # Names of the validation metrics to report.
  wandb_validation_metric_keys:
    - validation/loss_total
    - validation/loss_response
    - validation/loss_current_user_reconstruction
    - validation/loss_probe
    - validation/goal_loss
    - validation/response_similarity
    - validation/response_reconstruction_similarity
    - validation/probe_transcript_similarity

  # Checkpoint selection: the metric named here is also listed in
  # wandb_validation_metric_keys; mode `min` selects on the lowest value.
  checkpoint_selection_metric: validation/goal_loss
  checkpoint_selection_mode: min

  # Miss penalties; the probe-similarity penalty is double the others.
  validation_response_exact_miss_penalty: 1.0
  validation_reconstruction_similarity_miss_penalty: 1.0
  validation_probe_exact_miss_penalty: 1.0
  validation_probe_similarity_miss_penalty: 2.0
# Batch sizes and schedule for the training phase.
# NOTE(review): the source had no indentation, so it is unclear whether
# `phase` is a top-level section or belongs under `training`. It is kept
# at the top level, as it appears in the source -- confirm against the
# config schema.
phase:
  micro_batch_size: 10
  eval_batch_size: 10
  # With micro_batch_size 10 this gives 10 * 4 = 40 examples per
  # optimizer step -- presumably per device; TODO confirm.
  gradient_accumulation_steps: 4
  num_train_epochs: 5
  warmup_ratio: 0.03
  shuffle_train: true
# Cache location for preprocessed data (relative path).
# NOTE(review): nesting reconstructed from a source with no indentation.
cache:
  preprocessed_root: cache/preprocessed_pre_sft_multiturn_simple_transcript
# Output directories (relative paths): one root for runs, one for exports.
# NOTE(review): nesting reconstructed from a source with no indentation.
paths:
  run_root: runs_pre_sft_multiturn_simple_transcript
  export_root: exports_multiturn_simple_transcript
# Inference-time settings.
# NOTE(review): indentation was lost in the source; these keys are nested
# under `inference` so that `format` and `use_base_chat_template` do not
# become top-level duplicates of the same-named dataset keys (duplicate
# keys are resolved silently, last one wins). Confirm against the schema.
inference:
  format: predictive_state_multiturn
  use_base_chat_template: true
# Experiment-tracking settings: on/off switch, project name, run name.
# NOTE(review): nesting reconstructed from a source with no indentation.
wandb:
  enabled: true
  project: samantha-pre-sft
  run_name: t5gemma2-thoughtloop-pre-sft-simple-transcript
# Hub upload target: destination repository id and visibility flag.
# NOTE(review): nesting reconstructed from a source with no indentation.
hub:
  model_repo_id: BRlkl/test_multiturn_simple_transcript1024_2
  # false: the repository is not marked private.
  private: false