Text Generation
PEFT
English
Chinese
hypernetwork
hyper-lora
lora
role-play
character-impersonation
persona
dialogue
phase-tree
Instructions to use IAAR-Shanghai/phase_tree_models with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use IAAR-Shanghai/phase_tree_models with PEFT:
Task type is invalid.
- Notebooks
- Google Colab
- Kaggle
| # ========================================================================== | |
| # Training args for the released anchor SFT run. | |
| # PHASE-Tree hyper-LoRA SFT on Qwen2.5-7B-Instruct. | |
| # Variant: warm-start, lr=5e-6 (anchor SFT run). | |
| # Effective batch size = batch_size * grad_accum_steps = 4 * 2 = 8. | |
| # 40000 optimizer steps; a checkpoint is saved every val_freq=5000 steps. | |
| # ========================================================================== | |
| run_name: anchor | |
| save_dir: phase_tree_models/sft/hyper_lora | |
| # ── Model ──────────────────────────────────────────────────────────────── | |
| model_dir: Qwen/Qwen2.5-7B-Instruct | |
| emb_model: Qwen/Qwen3-Embedding-4B | |
| init_hypermod_from: phase_tree_models/phase_tree_pretrained/hypermod.pt | |
| init_hypermod_strict: false | |
| # ── Task / data ────────────────────────────────────────────────────────── | |
| training_task: sft | |
| exp_setup: hyper_lora | |
| sft_mode: completion | |
| encoder_type: linear | |
| train_ds_names: | |
| - RAIDEN_train | |
| - CharacterEval_train | |
| - HPD_train | |
| - SimsConv_train | |
| - ChatHaruhi_train | |
| - Friends_train | |
| - StarTrek_TNG_train | |
| - TheOffice_train | |
| eval_ds_info: | |
| - RAIDEN_random_test | |
| - RAIDEN_ood_test | |
| - CharacterEval_random_test | |
| - CharacterEval_ood_test | |
| - HPD_random_test | |
| - HPD_ood_test | |
| - SimsConv_random_test | |
| - SimsConv_ood_test | |
| - ChatHaruhi_random_test | |
| - ChatHaruhi_ood_test | |
| - Friends_random_test | |
| - Friends_ood_test | |
| - StarTrek_TNG_random_test | |
| - StarTrek_TNG_ood_test | |
| - TheOffice_random_test | |
| - TheOffice_ood_test | |
| use_per_task_emb: true | |
| use_one_hot_task_emb: false | |
| use_inp_as_desc: false | |
| use_per_sample_desc: false | |
| use_default_desc: false | |
| use_hierarchical_sampler: true | |
| dataset_sampling_strategy: sqrt_size | |
| equally_weight_sample: true | |
| n_tasks_per_batch: 6 | |
| n_points_per_task: 2 | |
| inp_max_len: 1024 | |
| target_modules: | |
| - q_proj | |
| - v_proj | |
| # ── Hypermod architecture ──────────────────────────────────────────────── | |
| use_hypernet: true | |
| head_in_size: 2048 | |
| head_use_bias: false | |
| hypernet_latent_size: 1024 | |
| delta_w_scaling: 100 | |
| pred_z_score: true | |
| factorized: false | |
| shared_AB_head: false | |
| autoreg_gen: false | |
| learnable_pos_emb: false | |
| learnable_AB_offset: false | |
| freeze_heads: false | |
| # ── Fusion (disabled for this run, kept for loader compatibility) ──────── | |
| use_conv_fusion: false | |
| conv_fusion_type: 1d | |
| conv_fusion_kernel_size: 3 | |
| conv_fusion_num_layers: 2 | |
| conv_fusion_channels: 64 | |
| conv_fusion_dropout: 0.1 | |
| use_attention_fusion: false | |
| attention_fusion_type: self | |
| attention_num_heads: 8 | |
| attention_num_layers: 2 | |
| attention_dropout: 0.1 | |
| # ── Optimisation ───────────────────────────────────────────────────────── | |
| lr: 5.0e-06 | |
| weight_decay: 0.01 | |
| warmup_frac: 0.05 | |
| max_grad_norm: 1.0 | |
| label_smoothing: 0.1 | |
| l2_reg_generated_w: 0.001 | |
| neftune_noise_alpha: 5.0 | |
| gradient_checkpointing: true | |
| # ── Schedule / batching ────────────────────────────────────────────────── | |
| epochs: 40000 # per the header comment, this run is 40000 optimizer steps (not dataset passes) | |
| batch_size: 4 | |
| grad_accum_steps: 2 | |
| val_batch_size: 16 | |
| seed: 42 | |
| # ── Logging / saving ───────────────────────────────────────────────────── | |
| logging_freq: 50 | |
| val_freq: 5000 # checkpoint cadence (steps); in-training validation is skipped in this run (skip_val: true) | |
| top_k_checkpoints: 999 # keep every checkpoint | |
| skip_val: true # validation done post-hoc via eval scripts | |
| skip_eval: false | |
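| use_early_stopping: true # NOTE(review): skip_val is true, so early stopping presumably has no validation metric to act on — confirm | |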
| early_stopping_patience: 5 | |
| early_stopping_min_delta: 0.0 | |