IAAR-Shanghai
/

phase_tree_models

Text Generation

character-impersonation

Model card Files Files and versions

phase_tree_models / phase_tree_pretrained /args.yaml

Mathematics-Yang's picture

Mathematics-Yang

Add files using upload-large-folder tool

1145a14 verified 1 day ago

history blame contribute delete

2.61 kB

	# ==========================================================================
	# Architecture / loading metadata for the PHASE-Tree pretrained hypermod.
	#
	# The released warm-start checkpoint is
	# phase_tree_models/phase_tree_pretrained/hypermod.pt
	# (= the it_20000 snapshot of the original pretraining run).
	#
	# Only fields read by `load_hypermod_checkpoint` (path resolution +
	# hypermod architecture) are kept here; the dataset lists and the
	# original training schedule are intentionally omitted, because the
	# PHASE-Tree SFT runs warm-start from these weights and take every
	# training hyperparameter from `train_phase_tree_qwen_7b.sh` instead.
	# ==========================================================================

	# ── Paths ────────────────────────────────────────────────────────────────
	# Model locations. The file header lists "path resolution" as one of the
	# two things `load_hypermod_checkpoint` reads from this file.
	# NOTE(review): both values look like hub-style `org/name` repo ids rather
	# than local directories — confirm how the loader resolves them.
	model_dir: Qwen/Qwen2.5-7B-Instruct
	emb_model: Qwen/Qwen3-Embedding-4B
	# Explicit YAML null: no path is configured for this key.
	mt_lora_path: null

	# ── Task setup ───────────────────────────────────────────────────────────
	# Four plain-string selectors. The set of values the loader accepts for
	# each key is not visible in this file.
	# NOTE(review): the file header says the SFT launch script overrides every
	# training hyperparameter — confirm whether any of these keys are affected.
	training_task: sft
	exp_setup: hyper_lora
	sft_mode: completion
	encoder_type: linear

	# ── Task-embedding mode ─────────────────────────────────────────────────
	# Only `use_hypernet` and `use_per_task_emb` are enabled; the other four
	# `use_*` switches in this section are false.
	# NOTE(review): presumably these modes are mutually exclusive in the
	# loader — confirm before enabling another one.
	use_hypernet: true
	use_per_task_emb: true
	use_one_hot_task_emb: false
	use_inp_as_desc: false
	use_per_sample_desc: false
	use_default_desc: false

	# ── Hypermod architecture ────────────────────────────────────────────────
	# Architecture settings for the hypermod; per the file header these are
	# read by `load_hypermod_checkpoint` for the released `hypermod.pt`.
	# NOTE(review): presumably they must match the values the checkpoint was
	# trained with, otherwise its weights will not load — confirm.
	head_in_size: 2048
	head_use_bias: false
	hypernet_latent_size: 1024
	# Written as an integer literal, so it loads as int 100 (not float).
	delta_w_scaling: 100
	pred_z_score: true
	# All remaining architecture variants below are switched off.
	factorized: false
	shared_AB_head: false
	autoreg_gen: false
	learnable_pos_emb: false
	learnable_AB_offset: false

	# ── Fusion (disabled; kept for loader compatibility) ────────────────────
	# Both master switches (`use_conv_fusion`, `use_attention_fusion`) are
	# false. Per the section title the other keys stay only so the loader
	# still finds them.
	# NOTE(review): presumably the conv_*/attention_* values are ignored while
	# the switches are off — confirm before deleting or changing any of them.
	use_conv_fusion: false
	# `1d` is not a YAML numeric literal, so it loads as the string "1d".
	conv_fusion_type: 1d
	conv_fusion_kernel_size: 3
	conv_fusion_num_layers: 2
	conv_fusion_channels: 64
	conv_fusion_dropout: 0.1
	use_attention_fusion: false
	attention_fusion_type: self
	attention_num_heads: 8
	attention_num_layers: 2
	attention_dropout: 0.1

	# ── Target LoRA modules and context window ──────────────────────────────
	# Module names targeted for LoRA (per the section title); the list has
	# exactly two entries.
	target_modules:
	- q_proj
	- v_proj
	# Input length limit (the "context window" of the section title).
	# NOTE(review): the unit is not stated here — presumably tokens; confirm.
	inp_max_len: 1024