# ==========================================================================
# Architecture / loading metadata for the PHASE-Tree pretrained hypermod.
#
# The released warm-start checkpoint is
#   phase_tree_models/phase_tree_pretrained/hypermod.pt
# (= the it_20000 snapshot of the original pretraining run).
#
# Only fields read by `load_hypermod_checkpoint` (path resolution +
# hypermod architecture) are kept here; the dataset lists and the
# original training schedule are intentionally omitted, because the
# PHASE-Tree SFT runs warm-start from these weights and override every
# training hyperparameter from `train_phase_tree_qwen_7b.sh`.
#
# Layout note: one key per line. Everything after a `#` on a physical line
# is a comment, so keys must never share a line with the banner or with a
# section header, otherwise they are silently dropped from the parsed
# document.
# ==========================================================================

# ── Paths ────────────────────────────────────────────────────────────────
model_dir: Qwen/Qwen2.5-7B-Instruct
emb_model: Qwen/Qwen3-Embedding-4B
# Explicit null (rather than a bare `mt_lora_path:`) so the intent is clear.
mt_lora_path: null

# ── Task setup ───────────────────────────────────────────────────────────
training_task: sft
exp_setup: hyper_lora
sft_mode: completion
encoder_type: linear

# ── Task-embedding mode ─────────────────────────────────────────────────
# NOTE(review): exactly how these flags interact is decided by the loader,
# which is not visible from this file — confirm before toggling any of them.
use_hypernet: true
use_per_task_emb: true
use_one_hot_task_emb: false
use_inp_as_desc: false
use_per_sample_desc: false
use_default_desc: false

# ── Hypermod architecture ────────────────────────────────────────────────
# These presumably have to match the released checkpoint for its weights to
# load — TODO confirm against `load_hypermod_checkpoint`.
head_in_size: 2048
head_use_bias: false
hypernet_latent_size: 1024
delta_w_scaling: 100
pred_z_score: true
factorized: false
shared_AB_head: false
autoreg_gen: false
learnable_pos_emb: false
learnable_AB_offset: false

# ── Fusion (disabled; kept for loader compatibility) ────────────────────
use_conv_fusion: false
# Quoted so the digit-leading value is always loaded as a string.
conv_fusion_type: '1d'
conv_fusion_kernel_size: 3
conv_fusion_num_layers: 2
conv_fusion_channels: 64
conv_fusion_dropout: 0.1
use_attention_fusion: false
attention_fusion_type: self
attention_num_heads: 8
attention_num_layers: 2
attention_dropout: 0.1

# ── Target LoRA modules and context window ──────────────────────────────
target_modules:
  - q_proj
  - v_proj
inp_max_len: 1024