# ==========================================================================
# Training args for the released anchor SFT run.
# PHASE-Tree hyper-LoRA SFT on Qwen2.5-7B-Instruct.
# Variant: warm-start, lr=5e-6 (anchor SFT run).
# Effective batch size = batch_size * grad_accum_steps = 4 * 2 = 8.
# 40000 optimizer steps; a checkpoint is saved every val_freq=5000 steps.
# ==========================================================================

run_name: anchor
save_dir: phase_tree_models/sft/hyper_lora

# ── Model ────────────────────────────────────────────────────────────────
model_dir: Qwen/Qwen2.5-7B-Instruct
emb_model: Qwen/Qwen3-Embedding-4B
# Warm-start weights for the hypermod (this is the "warm-start" variant).
init_hypermod_from: phase_tree_models/phase_tree_pretrained/hypermod.pt
# presumably allows missing/unexpected keys when loading — TODO confirm
init_hypermod_strict: false

# ── Task / data ──────────────────────────────────────────────────────────
training_task: sft
exp_setup: hyper_lora
sft_mode: completion
encoder_type: linear

train_ds_names:
  - RAIDEN_train
  - CharacterEval_train
  - HPD_train
  - SimsConv_train
  - ChatHaruhi_train
  - Friends_train
  - StarTrek_TNG_train
  - TheOffice_train

# One random-split and one OOD-split test set per training dataset.
eval_ds_info:
  - RAIDEN_random_test
  - RAIDEN_ood_test
  - CharacterEval_random_test
  - CharacterEval_ood_test
  - HPD_random_test
  - HPD_ood_test
  - SimsConv_random_test
  - SimsConv_ood_test
  - ChatHaruhi_random_test
  - ChatHaruhi_ood_test
  - Friends_random_test
  - Friends_ood_test
  - StarTrek_TNG_random_test
  - StarTrek_TNG_ood_test
  - TheOffice_random_test
  - TheOffice_ood_test

use_per_task_emb: true
use_one_hot_task_emb: false
use_inp_as_desc: false
use_per_sample_desc: false
use_default_desc: false

use_hierarchical_sampler: true
dataset_sampling_strategy: sqrt_size
equally_weight_sample: true
# NOTE(review): 6 * 2 = 12 samples per batch if the hierarchical sampler
# sizes batches from these two keys, which differs from batch_size=4 used in
# the header's effective-batch-size arithmetic — confirm which one applies.
n_tasks_per_batch: 6
n_points_per_task: 2
inp_max_len: 1024

target_modules:
  - q_proj
  - v_proj

# ── Hypermod architecture ────────────────────────────────────────────────
use_hypernet: true
head_in_size: 2048
head_use_bias: false
hypernet_latent_size: 1024
delta_w_scaling: 100
pred_z_score: true
factorized: false
shared_AB_head: false
autoreg_gen: false
learnable_pos_emb: false
learnable_AB_offset: false
freeze_heads: false

# ── Fusion (disabled for this run, kept for loader compatibility) ────────
use_conv_fusion: false
conv_fusion_type: 1d
conv_fusion_kernel_size: 3
conv_fusion_num_layers: 2
conv_fusion_channels: 64
conv_fusion_dropout: 0.1
use_attention_fusion: false
attention_fusion_type: self
attention_num_heads: 8
attention_num_layers: 2
attention_dropout: 0.1

# ── Optimisation ─────────────────────────────────────────────────────────
# Keep the explicit decimal point and signed exponent: YAML 1.1 loaders
# (e.g. PyYAML) read "5e-6" as a string, but "5.0e-06" as a float.
lr: 5.0e-06
weight_decay: 0.01
warmup_frac: 0.05
max_grad_norm: 1.0
label_smoothing: 0.1
l2_reg_generated_w: 0.001
neftune_noise_alpha: 5.0
gradient_checkpointing: true

# ── Schedule / batching ──────────────────────────────────────────────────
# NOTE(review): the header describes this run as 40000 optimizer steps, yet
# the key is named "epochs" — confirm the unit the trainer applies here.
epochs: 40000
batch_size: 4
grad_accum_steps: 2
val_batch_size: 16
seed: 42

# ── Logging / saving ─────────────────────────────────────────────────────
logging_freq: 50
val_freq: 5000  # validation + checkpoint cadence (steps)
top_k_checkpoints: 999  # keep every checkpoint
skip_val: true  # validation done post-hoc via eval scripts
skip_eval: false
# NOTE(review): early stopping is enabled while skip_val is true, so there
# may be no validation metric for it to monitor; it also sits oddly with the
# fixed 40000-step schedule in the header. Value left as released — confirm
# whether it is inert or should be false.
use_early_stopping: true
early_stopping_patience: 5
early_stopping_min_delta: 0.0