|
|
from __future__ import annotations |
|
|
|
|
|
from typing import Sequence |
|
|
|
|
|
from .constants import MODEL_ARCH, MODEL_TENSOR, MODEL_TENSORS, TENSOR_NAMES |
|
|
|
|
|
|
|
|
class TensorNameMap: |
|
|
# Table of non-block tensor names: each MODEL_TENSOR member maps to a tuple of
# candidate name strings. None of the names below contains a "{bid}"
# placeholder. Entry order inside each tuple is kept exactly as written.
# NOTE(review): a few names occur more than once inside a single tuple
# ("embed_tokens", "rwkv.blocks.0.pre_ln", "lm_head", "model.norm"); they are
# preserved verbatim here — confirm with the consumer whether the repeats matter.
# NOTE(review): "transformer.norm" is listed under both TOKEN_EMBD_NORM and
# OUTPUT_NORM — presumably disambiguated elsewhere; verify against the caller.
mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
    # Token embedding names.
    MODEL_TENSOR.TOKEN_EMBD: (
        "gpt_neox.embed_in",
        "transformer.wte",
        "transformer.word_embeddings",
        "word_embeddings",
        "model.embed_tokens",
        "embed_tokens",
        "tok_embeddings",
        "embeddings.word_embeddings",
        "language_model.embedding.word_embeddings",
        "wte",
        "transformer.embd.wte",
        "model.tok_embeddings",
        "model.embedding",
        "backbone.embedding",
        "backbone.embeddings",
        "transformer.in_out_embed",
        "embedding.word_embeddings",
        "transformer.token_embeddings",
        "shared",
        "rwkv.embeddings",
        "model.embeddings",
        "model.word_embeddings",
        "language_model.model.embed_tokens",
        "encoder",
        "model.transformer.wte",
        "embed_tokens",
    ),
    # Token-type embedding names.
    MODEL_TENSOR.TOKEN_TYPES: ("embeddings.token_type_embeddings",),
    # Names for the normalization attached to the token embeddings.
    MODEL_TENSOR.TOKEN_EMBD_NORM: (
        "word_embeddings_layernorm",
        "embeddings.LayerNorm",
        "emb_ln",
        "transformer.norm",
        "rwkv.blocks.0.pre_ln",
        "rwkv.blocks.0.pre_ln",
        "model.pre_ln",
        "model.layers.0.pre_norm",
        "backbone.norm",
        "model.embedding_norm",
    ),
    # Position embedding names.
    MODEL_TENSOR.POS_EMBD: (
        "transformer.wpe",
        "embeddings.position_embeddings",
        "wpe",
    ),
    # Output tensor names.
    MODEL_TENSOR.OUTPUT: (
        "embed_out",
        "lm_head",
        "output",
        "word_embeddings_for_head",
        "lm_head.linear",
        "output_layer",
        "head",
        "head.out",
        "lm_head",
        "model.transformer.ff_out",
    ),
    MODEL_TENSOR.DENSE_2_OUT: ("dense_2_out",),
    MODEL_TENSOR.DENSE_3_OUT: ("dense_3_out",),
    # Output normalization names.
    MODEL_TENSOR.OUTPUT_NORM: (
        "gpt_neox.final_layer_norm",
        "transformer.ln_f",
        "model.norm",
        "norm",
        "transformer.norm_f",
        "ln_f",
        "language_model.encoder.final_layernorm",
        "model.final_layernorm",
        "lm_head.ln",
        "model.norm_f",
        "backbone.norm_f",
        "transformer.rms_norm",
        "encoder.final_layernorm",
        "transformer.norm",
        "model.norm",
        "rwkv.ln_out",
        "model.ln_out",
        "backbone.final_layer_norm",
        "model.norm",
        "model.transformer.ln_f",
    ),
    # Rotary frequency tensor names.
    MODEL_TENSOR.ROPE_FREQS: (
        "rope.freqs",
        "rotary_pos_emb.inv_freq",
    ),
    # These two members are registered with no candidate names at all.
    MODEL_TENSOR.ROPE_FACTORS_LONG: (),
    MODEL_TENSOR.ROPE_FACTORS_SHORT: (),
    MODEL_TENSOR.CONV1D: ("backbone.embed",),
}
|
|
|
|
|
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { |
|
|
|
|
|
MODEL_TENSOR.ATTN_NORM: ( |
|
|
"gpt_neox.layers.{bid}.input_layernorm", |
|
|
"transformer.h.{bid}.ln_1", |
|
|
"transformer.blocks.{bid}.norm_1", |
|
|
"transformer.h.{bid}.input_layernorm", |
|
|
"h.{bid}.input_layernorm", |
|
|
"transformer.h.{bid}.ln_mlp", |
|
|
"model.layers.{bid}.input_layernorm", |
|
|
"layers.{bid}.attention_norm", |
|
|
"language_model.encoder.layers.{bid}.input_layernorm", |
|
|
"model.layers.{bid}.ln1", |
|
|
"h.{bid}.ln_1", |
|
|
"transformer.h.{bid}.ln", |
|
|
"model.layers.layers.{bid}.norm", |
|
|
"model.layers.layers.{bid}.pre_mixer_norm", |
|
|
"model.layers.{bid}.attention_norm", |
|
|
"model.layers.{bid}.norm", |
|
|
"backbone.layers.{bid}.norm", |
|
|
"transformer.decoder_layer.{bid}.rms_norm", |
|
|
"model.layers.{bid}.pre_attn_norm", |
|
|
"transformer.blocks.{bid}.norm_attn_norm.norm_1", |
|
|
"encoder.layers.{bid}.input_layernorm", |
|
|
"transformer.layers.{bid}.attn_norm", |
|
|
"rwkv.blocks.{bid}.ln1", |
|
|
"model.layers.{bid}.ln1", |
|
|
"model.layers.{bid}.input_layernorm", |
|
|
"layers.{bid}.input_layernorm", |
|
|
"transformer_encoder.{bid}.attention_norm", |
|
|
"model.layers.{bid}.operator_norm", |
|
|
"model.transformer.blocks.{bid}.attn_norm", |
|
|
"layers.{bid}.input_layernorm", |
|
|
"model.layers.{bid}.attention_layernorm" |
|
|
), |
|
|
|
|
|
|
|
|
MODEL_TENSOR.ATTN_NORM_2: ( |
|
|
"transformer.h.{bid}.ln_attn", |
|
|
"encoder.layer.{bid}.layer_norm_1", |
|
|
"rwkv.blocks.{bid}.ln2", |
|
|
"model.layers.{bid}.ln2", |
|
|
), |
|
|
|
|
|
|
|
|
MODEL_TENSOR.ATTN_QKV: ( |
|
|
"gpt_neox.layers.{bid}.attention.query_key_value", |
|
|
"transformer.h.{bid}.attn.c_attn", |
|
|
"transformer.blocks.{bid}.attn.Wqkv", |
|
|
"transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv", |
|
|
"transformer.h.{bid}.self_attention.query_key_value", |
|
|
"h.{bid}.self_attention.query_key_value", |
|
|
"language_model.encoder.layers.{bid}.self_attention.query_key_value", |
|
|
"model.layers.{bid}.self_attn.query_key_value", |
|
|
"h.{bid}.attn.c_attn", |
|
|
"transformer.h.{bid}.mixer.Wqkv", |
|
|
"encoder.layers.{bid}.attn.Wqkv", |
|
|
"encoder.layers.{bid}.mixer.Wqkv", |
|
|
"model.layers.{bid}.self_attn.qkv_proj", |
|
|
"model.layers.layers.{bid}.mixer.qkv_proj", |
|
|
"encoder.layers.{bid}.self_attention.query_key_value", |
|
|
"transformer.layers.{bid}.attn.qkv_proj", |
|
|
"transformer_encoder.{bid}.qkv", |
|
|
), |
|
|
|
|
|
|
|
|
MODEL_TENSOR.ATTN_Q: ( |
|
|
"model.layers.{bid}.self_attn.q_proj", |
|
|
"layers.{bid}.self_attn.q_proj", |
|
|
"model.layers.{bid}.self_attn.q_proj_no_perm", |
|
|
"layers.{bid}.attention.wq", |
|
|
"encoder.layer.{bid}.attention.self.query", |
|
|
"transformer.layer.{bid}.attention.q_lin", |
|
|
"transformer.h.{bid}.attn.q_proj", |
|
|
"model.layers.layers.{bid}.self_attn.q_proj", |
|
|
"model.layers.{bid}.attention.wq", |
|
|
"transformer.decoder_layer.{bid}.multi_head_attention.query", |
|
|
"transformer.h.{bid}.attn.attention.q_proj", |
|
|
"model.layers.{bid}.self_attn.q_proj", |
|
|
"model.transformer.blocks.{bid}.q_proj", |
|
|
"layers.{bid}.self_attn.q_proj", |
|
|
"backbone.layers.{bid}.mixer.q_proj", |
|
|
), |
|
|
|
|
|
|
|
|
MODEL_TENSOR.ATTN_K: ( |
|
|
"model.layers.{bid}.self_attn.k_proj", |
|
|
"layers.{bid}.self_attn.k_proj", |
|
|
"model.layers.{bid}.self_attn.k_proj_no_perm", |
|
|
"layers.{bid}.attention.wk", |
|
|
"encoder.layer.{bid}.attention.self.key", |
|
|
"transformer.layer.{bid}.attention.k_lin", |
|
|
"transformer.h.{bid}.attn.k_proj", |
|
|
"transformer.h.{bid}.attn.k", |
|
|
"model.layers.layers.{bid}.self_attn.k_proj", |
|
|
"model.layers.{bid}.attention.wk", |
|
|
"transformer.decoder_layer.{bid}.multi_head_attention.key", |
|
|
"transformer.h.{bid}.attn.attention.k_proj", |
|
|
"model.layers.{bid}.self_attn.k_proj", |
|
|
"model.transformer.blocks.{bid}.k_proj", |
|
|
"layers.{bid}.self_attn.k_proj", |
|
|
"backbone.layers.{bid}.mixer.k_proj", |
|
|
), |
|
|
|
|
|
|
|
|
MODEL_TENSOR.ATTN_V: ( |
|
|
"model.layers.{bid}.self_attn.v_proj", |
|
|
"layers.{bid}.self_attn.v_proj", |
|
|
"layers.{bid}.attention.wv", |
|
|
"encoder.layer.{bid}.attention.self.value", |
|
|
"transformer.layer.{bid}.attention.v_lin", |
|
|
"transformer.h.{bid}.attn.v_proj", |
|
|
"transformer.h.{bid}.attn.v", |
|
|
"model.layers.layers.{bid}.self_attn.v_proj", |
|
|
"model.layers.{bid}.attention.wv", |
|
|
"transformer.decoder_layer.{bid}.multi_head_attention.value", |
|
|
"transformer.h.{bid}.attn.attention.v_proj", |
|
|
"model.layers.{bid}.self_attn.v_proj", |
|
|
"model.transformer.blocks.{bid}.v_proj", |
|
|
"layers.{bid}.self_attn.v_proj", |
|
|
"backbone.layers.{bid}.mixer.v_proj", |
|
|
), |
|
|
|
|
|
|
|
|
MODEL_TENSOR.ATTN_OUT: ( |
|
|
"gpt_neox.layers.{bid}.attention.dense", |
|
|
"transformer.h.{bid}.attn.c_proj", |
|
|
"transformer.blocks.{bid}.attn.out_proj", |
|
|
"transformer.h.{bid}.self_attention.dense", |
|
|
"h.{bid}.self_attention.dense", |
|
|
"model.layers.{bid}.self_attn.o_proj", |
|
|
"layers.{bid}.self_attn.o_proj", |
|
|
"model.layers.{bid}.self_attn.out_proj", |
|
|
"model.layers.{bid}.self_attn.linear_attn", |
|
|
"layers.{bid}.attention.wo", |
|
|
"encoder.layer.{bid}.attention.output.dense", |
|
|
"transformer.layer.{bid}.attention.out_lin", |
|
|
"transformer.h.{bid}.attn.out_proj", |
|
|
"language_model.encoder.layers.{bid}.self_attention.dense", |
|
|
"model.layers.{bid}.self_attn.dense", |
|
|
"h.{bid}.attn.c_proj", |
|
|
"transformer.h.{bid}.mixer.out_proj", |
|
|
"model.layers.layers.{bid}.self_attn.o_proj", |
|
|
"model.layers.layers.{bid}.mixer.o_proj", |
|
|
"model.layers.{bid}.attention.wo", |
|
|
"encoder.layers.{bid}.attn.out_proj", |
|
|
"encoder.layers.{bid}.mixer.out_proj", |
|
|
"transformer.decoder_layer.{bid}.multi_head_attention.linear", |
|
|
"transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", |
|
|
"encoder.layers.{bid}.self_attention.dense", |
|
|
"transformer.layers.{bid}.attn.out_proj", |
|
|
"transformer.h.{bid}.attn.attention.out_proj", |
|
|
"model.layers.{bid}.self_attn.o_proj", |
|
|
"transformer_encoder.{bid}.wo", |
|
|
"model.transformer.blocks.{bid}.attn_out", |
|
|
"layers.{bid}.self_attn.o_proj", |
|
|
"backbone.layers.{bid}.mixer.o_proj", |
|
|
), |
|
|
|
|
|
|
|
|
MODEL_TENSOR.ATTN_OUT_NORM: ( |
|
|
"encoder.layer.{bid}.attention.output.LayerNorm", |
|
|
"transformer.layer.{bid}.sa_layer_norm", |
|
|
"encoder.layers.{bid}.norm1", |
|
|
"transformer.decoder_layer.{bid}.rms_norm_1", |
|
|
"model.layers.{bid}.post_attn_norm", |
|
|
"transformer.blocks.{bid}.norm_attn_norm.norm_2", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ATTN_POST_NORM: ( |
|
|
"model.layers.{bid}.post_attention_layernorm", |
|
|
"layers.{bid}.post_attention_layernorm", |
|
|
"model.layers.{bid}.post_self_attn_layernorm", |
|
|
"model.layers.layers.{bid}.post_mixer_norm.weight", |
|
|
), |
|
|
|
|
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD: ( |
|
|
"model.layers.{bid}.self_attn.rotary_emb.inv_freq", |
|
|
"layers.{bid}.attention.inner_attention.rope.freqs", |
|
|
"model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", |
|
|
"transformer.h.{bid}.attn.rotary_emb.inv_freq", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ATTN_SINKS: ( |
|
|
"model.layers.{bid}.self_attn.sinks", |
|
|
), |
|
|
|
|
|
|
|
|
MODEL_TENSOR.FFN_NORM: ( |
|
|
"gpt_neox.layers.{bid}.post_attention_layernorm", |
|
|
"transformer.h.{bid}.ln_2", |
|
|
"h.{bid}.post_attention_layernorm", |
|
|
"transformer.blocks.{bid}.norm_2", |
|
|
"model.layers.{bid}.post_attention_layernorm", |
|
|
"layers.{bid}.ffn_norm", |
|
|
"language_model.encoder.layers.{bid}.post_attention_layernorm", |
|
|
"model.layers.{bid}.ln2", |
|
|
"h.{bid}.ln_2", |
|
|
"model.layers.{bid}.ffn_norm", |
|
|
"transformer.decoder_layer.{bid}.rms_norm_2", |
|
|
"model.layers.{bid}.pre_moe_norm", |
|
|
"encoder.layers.{bid}.post_attention_layernorm", |
|
|
"transformer.layers.{bid}.ffn_norm", |
|
|
"model.layers.{bid}.pre_ff_layernorm", |
|
|
"model.layers.{bid}.pre_moe_layernorm", |
|
|
"model.layers.{bid}.post_attention_layernorm", |
|
|
"transformer_encoder.{bid}.ffn_norm", |
|
|
"model.layers.layers.{bid}.pre_mlp_norm", |
|
|
"model.transformer.blocks.{bid}.ff_norm", |
|
|
"layers.{bid}.post_attention_layernorm", |
|
|
"model.layers.{bid}.feedforward_layernorm", |
|
|
), |
|
|
|
|
|
|
|
|
MODEL_TENSOR.FFN_PRE_NORM: ( |
|
|
"model.layers.{bid}.pre_feedforward_layernorm", |
|
|
"layers.{bid}.pre_feedforward_layernorm", |
|
|
"model.layers.{bid}.pre_ff_layernorm.weight", |
|
|
), |
|
|
|
|
|
|
|
|
MODEL_TENSOR.FFN_POST_NORM: ( |
|
|
"model.layers.{bid}.post_feedforward_layernorm", |
|
|
"layers.{bid}.post_feedforward_layernorm", |
|
|
"model.layers.{bid}.post_mlp_layernorm", |
|
|
"model.layers.layers.{bid}.post_mlp_norm.weight", |
|
|
"model.layers.{bid}.feed_forward.up_proj", |
|
|
"model.layers.{bid}.post_moe_norm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP: ( |
|
|
"layers.{bid}.feed_forward.gate", |
|
|
"model.layers.{bid}.block_sparse_moe.gate", |
|
|
"model.layers.{bid}.mlp.gate", |
|
|
"transformer.decoder_layer.{bid}.router", |
|
|
"transformer.blocks.{bid}.ffn.router.layer", |
|
|
"model.layers.{bid}.block_sparse_moe.router.layer", |
|
|
"model.layers.{bid}.feed_forward.router", |
|
|
"encoder.layers.{bid}.mlp.router.layer", |
|
|
"model.layers.{bid}.mlp.router", |
|
|
"model.layers.{bid}.mlp.gate.wg", |
|
|
"model.layers.{bid}.block_sparse_moe.primary_router", |
|
|
"model.layers.{bid}.feed_forward.gate", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP_SHEXP: ( |
|
|
"model.layers.{bid}.mlp.shared_expert_gate", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.FFN_EXP_PROBS_B: ( |
|
|
"model.layers.{bid}.mlp.gate.e_score_correction", |
|
|
"model.layers.{bid}.mlp.moe_statics.e_score_correction", |
|
|
"model.layers.{bid}.feed_forward.expert_bias", |
|
|
), |
|
|
|
|
|
|
|
|
MODEL_TENSOR.FFN_UP: ( |
|
|
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", |
|
|
"transformer.h.{bid}.mlp.c_fc", |
|
|
"transformer.blocks.{bid}.ffn.up_proj", |
|
|
"transformer.h.{bid}.mlp.dense_h_to_4h", |
|
|
"h.{bid}.mlp.dense_h_to_4h", |
|
|
"model.layers.{bid}.mlp.up_proj", |
|
|
"layers.{bid}.mlp.up_proj", |
|
|
"layers.{bid}.feed_forward.w3", |
|
|
"encoder.layer.{bid}.intermediate.dense", |
|
|
"transformer.layer.{bid}.ffn.lin1", |
|
|
"transformer.h.{bid}.mlp.fc_in", |
|
|
"transformer.h.{bid}.mlp.linear_3", |
|
|
"language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", |
|
|
"model.layers.{bid}.mlp.dense_h_to_4h", |
|
|
"transformer.h.{bid}.mlp.w1", |
|
|
"h.{bid}.mlp.c_fc", |
|
|
"transformer.h.{bid}.mlp.fc1", |
|
|
"model.layers.{bid}.mlp.fc1", |
|
|
"model.layers.{bid}.mlp.gate_up_proj", |
|
|
"model.layers.layers.{bid}.mlp.up_proj", |
|
|
"model.layers.layers.{bid}.mlp.gate_up_proj", |
|
|
"model.layers.{bid}.feed_forward.w3", |
|
|
"encoder.layers.{bid}.mlp.fc11", |
|
|
"encoder.layers.{bid}.mlp.fc1", |
|
|
"model.layers.{bid}.mlp.c_fc", |
|
|
"encoder.layer.{bid}.mlp.gated_layers_v", |
|
|
"encoder.layer.{bid}.mlp.gated_layers", |
|
|
"encoder.layer.{bid}.mlp.up_gated_layer", |
|
|
"model.layers.{bid}.residual_mlp.w3", |
|
|
"encoder.layers.{bid}.mlp.dense_h_to_4h", |
|
|
"transformer.h.{bid}.mlp.c_fc_1", |
|
|
"model.layers.{bid}.feed_forward.up_proj", |
|
|
"transformer_encoder.{bid}.ffn.w12", |
|
|
"model.layers.{bid}.block_sparse_moe.up", |
|
|
"model.transformer.blocks.{bid}.up_proj", |
|
|
"layers.{bid}.mlp.up_proj", |
|
|
"backbone.layers.{bid}.mixer.up_proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.FFN_UP_EXP: ( |
|
|
"layers.{bid}.feed_forward.experts.w3", |
|
|
"transformer.decoder_layer.{bid}.moe.linear_v", |
|
|
"transformer.blocks.{bid}.ffn.experts.mlp.v1", |
|
|
"model.layers.{bid}.mlp.experts.up_proj", |
|
|
"model.layers.{bid}.block_sparse_moe.experts.w3", |
|
|
"model.layers.{bid}.feed_forward.experts.up_proj", |
|
|
"encoder.layers.{bid}.mlp.experts.mlp.w1", |
|
|
"model.layers.{bid}.block_sparse_moe.experts.up", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.FFN_UP_SHEXP: ( |
|
|
"model.layers.{bid}.mlp.shared_expert.up_proj", |
|
|
"model.layers.{bid}.mlp.shared_experts.up_proj", |
|
|
"model.layers.{bid}.feed_forward.shared_expert.up_proj", |
|
|
"model.layers.{bid}.feed_forward.down_proj", |
|
|
"model.layers.{bid}.mlp.shared_mlp.up_proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.FFN_UP_CHEXP: ( |
|
|
"model.layers.{bid}.mlp.chunk_experts.up_proj", |
|
|
), |
|
|
|
|
|
|
|
|
MODEL_TENSOR.FFN_ACT: ( |
|
|
"transformer.blocks.{bid}.ffn.act", |
|
|
), |
|
|
|
|
|
|
|
|
MODEL_TENSOR.FFN_GATE: ( |
|
|
"model.layers.{bid}.mlp.gate_proj", |
|
|
"layers.{bid}.mlp.gate_proj", |
|
|
"layers.{bid}.feed_forward.w1", |
|
|
"transformer.h.{bid}.mlp.w2", |
|
|
"transformer.h.{bid}.mlp.c_fc2", |
|
|
"model.layers.layers.{bid}.mlp.gate_proj", |
|
|
"model.layers.{bid}.feed_forward.w1", |
|
|
"encoder.layers.{bid}.mlp.fc12", |
|
|
"encoder.layer.{bid}.mlp.gated_layers_w", |
|
|
"transformer.h.{bid}.mlp.linear_1", |
|
|
"model.layers.{bid}.residual_mlp.w1", |
|
|
"transformer.h.{bid}.mlp.c_fc_0", |
|
|
"model.layers.{bid}.feed_forward.gate_proj", |
|
|
"model.transformer.blocks.{bid}.ff_proj", |
|
|
"layers.{bid}.mlp.gate_proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.FFN_GATE_EXP: ( |
|
|
"layers.{bid}.feed_forward.experts.w1", |
|
|
"transformer.decoder_layer.{bid}.moe.linear", |
|
|
"transformer.blocks.{bid}.ffn.experts.mlp.w1", |
|
|
"model.layers.{bid}.mlp.experts.gate_proj", |
|
|
"model.layers.{bid}.block_sparse_moe.experts.w1", |
|
|
"model.layers.{bid}.feed_forward.experts.gate_proj", |
|
|
"model.layers.{bid}.block_sparse_moe.experts.gate", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.FFN_GATE_SHEXP: ( |
|
|
"model.layers.{bid}.mlp.shared_expert.gate_proj", |
|
|
"model.layers.{bid}.mlp.shared_experts.gate_proj", |
|
|
"model.layers.{bid}.feed_forward.shared_expert.gate_proj", |
|
|
"model.layers.{bid}.mlp.shared_mlp.gate_proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.FFN_GATE_CHEXP: ( |
|
|
"model.layers.{bid}.mlp.chunk_experts.gate_proj", |
|
|
), |
|
|
|
|
|
|
|
|
MODEL_TENSOR.FFN_DOWN: ( |
|
|
"gpt_neox.layers.{bid}.mlp.dense_4h_to_h", |
|
|
"transformer.h.{bid}.mlp.c_proj", |
|
|
"transformer.blocks.{bid}.ffn.down_proj", |
|
|
"transformer.h.{bid}.mlp.dense_4h_to_h", |
|
|
"h.{bid}.mlp.dense_4h_to_h", |
|
|
"model.layers.{bid}.mlp.down_proj", |
|
|
"layers.{bid}.mlp.down_proj", |
|
|
"layers.{bid}.feed_forward.w2", |
|
|
"encoder.layer.{bid}.output.dense", |
|
|
"transformer.layer.{bid}.ffn.lin2", |
|
|
"transformer.h.{bid}.mlp.fc_out", |
|
|
"language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", |
|
|
"model.layers.{bid}.mlp.dense_4h_to_h", |
|
|
"h.{bid}.mlp.c_proj", |
|
|
"transformer.h.{bid}.mlp.fc2", |
|
|
"model.layers.{bid}.mlp.fc2", |
|
|
"model.layers.layers.{bid}.mlp.down_proj", |
|
|
"model.layers.{bid}.feed_forward.w2", |
|
|
"encoder.layers.{bid}.mlp.fc2", |
|
|
"model.layers.{bid}.mlp.c_proj", |
|
|
"encoder.layer.{bid}.mlp.wo", |
|
|
"transformer.layers.{bid}.ffn.proj_2", |
|
|
"model.layers.{bid}.residual_mlp.w2", |
|
|
"encoder.layer.{bid}.mlp.down_layer", |
|
|
"encoder.layers.{bid}.mlp.dense_4h_to_h", |
|
|
"model.layers.h.{bid}.mlp.c_proj", |
|
|
"model.layers.{bid}.feed_forward.down_proj", |
|
|
"transformer_encoder.{bid}.ffn.w3", |
|
|
"model.layers.{bid}.block_sparse_moe.down", |
|
|
"model.transformer.blocks.{bid}.ff_out", |
|
|
"layers.{bid}.mlp.down_proj", |
|
|
"backbone.layers.{bid}.mixer.down_proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.FFN_DOWN_EXP: ( |
|
|
"layers.{bid}.feed_forward.experts.w2", |
|
|
"transformer.decoder_layer.{bid}.moe.linear_1", |
|
|
"transformer.blocks.{bid}.ffn.experts.mlp.w2", |
|
|
"model.layers.{bid}.mlp.experts.down_proj", |
|
|
"model.layers.{bid}.block_sparse_moe.output_linear", |
|
|
"model.layers.{bid}.block_sparse_moe.experts.w2", |
|
|
"model.layers.{bid}.feed_forward.experts.down_proj", |
|
|
"encoder.layers.{bid}.mlp.experts.mlp.w2", |
|
|
"model.layers.{bid}.block_sparse_moe.experts.down", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.FFN_DOWN_SHEXP: ( |
|
|
"model.layers.{bid}.mlp.shared_expert.down_proj", |
|
|
"model.layers.{bid}.mlp.shared_experts.down_proj", |
|
|
"model.layers.{bid}.feed_forward.shared_expert.down_proj", |
|
|
"model.layers.{bid}.shared_mlp.output_linear", |
|
|
"model.layers.{bid}.mlp.shared_mlp.down_proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.FFN_DOWN_CHEXP: ( |
|
|
"model.layers.{bid}.mlp.chunk_experts.down_proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ATTN_Q_NORM: ( |
|
|
"language_model.encoder.layers.{bid}.self_attention.q_layernorm", |
|
|
"model.layers.{bid}.self_attn.q_layernorm", |
|
|
"model.layers.{bid}.self_attn.query_layernorm", |
|
|
"model.layers.{bid}.self_attn.q_norm", |
|
|
"layers.{bid}.self_attn.q_norm", |
|
|
"transformer.blocks.{bid}.attn.q_ln", |
|
|
"encoder.layer.{bid}.attention.self.layer_norm_q", |
|
|
"transformer.layers.{bid}.attn.q_norm", |
|
|
"model.layers.layers.{bid}.mixer.q", |
|
|
"layers.{bid}.self_attn.q_norm", |
|
|
"model.layers.{bid}.attention.query_layernorm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ATTN_K_NORM: ( |
|
|
"language_model.encoder.layers.{bid}.self_attention.k_layernorm", |
|
|
"model.layers.{bid}.self_attn.k_layernorm", |
|
|
"model.layers.{bid}.self_attn.key_layernorm", |
|
|
"model.layers.{bid}.self_attn.k_norm", |
|
|
"layers.{bid}.self_attn.k_norm", |
|
|
"transformer.blocks.{bid}.attn.k_ln", |
|
|
"encoder.layer.{bid}.attention.self.layer_norm_k", |
|
|
"transformer.layers.{bid}.attn.k_norm", |
|
|
"model.layers.layers.{bid}.mixer.k", |
|
|
"layers.{bid}.self_attn.k_norm", |
|
|
"model.layers.{bid}.attention.key_layernorm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ROPE_FREQS: ( |
|
|
"language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.LAYER_OUT_NORM: ( |
|
|
"encoder.layer.{bid}.output.LayerNorm", |
|
|
"transformer.layer.{bid}.output_layer_norm", |
|
|
"encoder.layers.{bid}.norm2", |
|
|
"transformer.decoder_layer.{bid}.rms_norm_3", |
|
|
"encoder.layer.{bid}.mlp.layernorm", |
|
|
"encoder.layer.{bid}.layer_norm_2", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: ( |
|
|
"model.embed_tokens_per_layer", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.PER_LAYER_MODEL_PROJ: ( |
|
|
"model.per_layer_model_projection", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.PER_LAYER_PROJ_NORM: ( |
|
|
"model.per_layer_projection_norm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ALTUP_PROJ: ( |
|
|
"model.altup_projections", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ALTUP_UNEMBD_PROJ: ( |
|
|
"model.altup_unembed_projections", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.PER_LAYER_INP_GATE: ( |
|
|
"model.layers.{bid}.per_layer_input_gate", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.PER_LAYER_PROJ: ( |
|
|
"model.layers.{bid}.per_layer_projection", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.PER_LAYER_POST_NORM: ( |
|
|
"model.layers.{bid}.post_per_layer_input_norm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ALTUP_CORRECT_COEF: ( |
|
|
"model.layers.{bid}.altup.correction_coefs", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ALTUP_CORRECT_SCALE: ( |
|
|
"model.layers.{bid}.altup.correct_output_scale", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ALTUP_PREDICT_COEF: ( |
|
|
"model.layers.{bid}.altup.prediction_coefs", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ALTUP_ROUTER: ( |
|
|
"model.layers.{bid}.altup.modality_router", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ALTUP_ROUTER_NORM: ( |
|
|
"model.layers.{bid}.altup.router_norm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.LAUREL_L: ( |
|
|
"model.layers.{bid}.laurel.linear_left", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.LAUREL_R: ( |
|
|
"model.layers.{bid}.laurel.linear_right", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.LAUREL_POST_NORM: ( |
|
|
"model.layers.{bid}.laurel.post_laurel_norm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.SSM_IN: ( |
|
|
"model.layers.{bid}.in_proj", |
|
|
"backbone.layers.{bid}.mixer.in_proj", |
|
|
"model.layers.{bid}.mamba.in_proj", |
|
|
"model.layers.layers.{bid}.mixer.in_proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.SSM_CONV1D: ( |
|
|
"model.layers.{bid}.conv1d", |
|
|
"backbone.layers.{bid}.mixer.conv1d", |
|
|
"model.layers.{bid}.mamba.conv1d", |
|
|
"model.layers.layers.{bid}.mixer.conv1d", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.SSM_X: ( |
|
|
"model.layers.{bid}.x_proj", |
|
|
"backbone.layers.{bid}.mixer.x_proj", |
|
|
"model.layers.{bid}.mamba.x_proj", |
|
|
"model.layers.layers.{bid}.mixer.bcdt_proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.SSM_DT: ( |
|
|
"model.layers.{bid}.dt_proj", |
|
|
"backbone.layers.{bid}.mixer.dt_proj", |
|
|
"model.layers.{bid}.mamba.dt_proj", |
|
|
"model.layers.layers.{bid}.mixer.dt_proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.SSM_DT_NORM: ( |
|
|
"model.layers.layers.{bid}.mixer.dt_norm.weight", |
|
|
"model.layers.{bid}.mamba.dt_layernorm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.SSM_A: ( |
|
|
"model.layers.{bid}.A_log", |
|
|
"backbone.layers.{bid}.mixer.A_log", |
|
|
"model.layers.{bid}.mamba.A_log", |
|
|
"model.layers.layers.{bid}.mixer.A_log", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.SSM_B_NORM: ( |
|
|
"model.layers.{bid}.mamba.b_layernorm", |
|
|
"model.layers.{bid}.mamba.B_layernorm", |
|
|
"model.layers.layers.{bid}.mixer.B_norm.weight", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.SSM_C_NORM: ( |
|
|
"model.layers.{bid}.mamba.c_layernorm", |
|
|
"model.layers.{bid}.mamba.C_layernorm", |
|
|
"model.layers.layers.{bid}.mixer.C_norm.weight", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.SSM_D: ( |
|
|
"model.layers.{bid}.D", |
|
|
"backbone.layers.{bid}.mixer.D", |
|
|
"model.layers.{bid}.mamba.D", |
|
|
"model.layers.layers.{bid}.mixer.D", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.SSM_NORM: ( |
|
|
"model.layers.{bid}.mamba.norm", |
|
|
"backbone.layers.{bid}.mixer.norm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.SSM_OUT: ( |
|
|
"model.layers.{bid}.out_proj", |
|
|
"backbone.layers.{bid}.mixer.out_proj", |
|
|
"model.layers.{bid}.mamba.out_proj", |
|
|
"model.layers.layers.{bid}.mixer.out_proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.TIME_MIX_W0: ( |
|
|
"model.layers.{bid}.attention.w0", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.TIME_MIX_W1: ( |
|
|
"rwkv.blocks.{bid}.attention.time_maa_w1", |
|
|
"model.layers.{bid}.self_attn.time_maa_w1", |
|
|
"model.layers.{bid}.attention.w1", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.TIME_MIX_W2: ( |
|
|
"rwkv.blocks.{bid}.attention.time_maa_w2", |
|
|
"model.layers.{bid}.self_attn.time_maa_w2", |
|
|
"model.layers.{bid}.attention.w2", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.TIME_MIX_A0: ( |
|
|
"model.layers.{bid}.attention.a0", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.TIME_MIX_A1: ( |
|
|
"model.layers.{bid}.attention.a1", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.TIME_MIX_A2: ( |
|
|
"model.layers.{bid}.attention.a2", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.TIME_MIX_V0: ( |
|
|
"model.layers.{bid}.attention.v0", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.TIME_MIX_V1: ( |
|
|
"model.layers.{bid}.attention.v1", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.TIME_MIX_V2: ( |
|
|
"model.layers.{bid}.attention.v2", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.TIME_MIX_G1: ( |
|
|
"model.layers.{bid}.attention.g1", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.TIME_MIX_G2: ( |
|
|
"model.layers.{bid}.attention.g2", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.TIME_MIX_K_K: ( |
|
|
"model.layers.{bid}.attention.k_k", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.TIME_MIX_K_A: ( |
|
|
"model.layers.{bid}.attention.k_a", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.TIME_MIX_R_K: ( |
|
|
"model.layers.{bid}.attention.r_k", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_X: ( |
|
|
"rwkv.blocks.{bid}.attention.time_maa_x", |
|
|
"model.layers.{bid}.self_attn.time_maa_x", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_K: ( |
|
|
"rwkv.blocks.{bid}.attention.time_maa_k", |
|
|
"model.layers.{bid}.self_attn.time_maa_k", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_V: ( |
|
|
"rwkv.blocks.{bid}.attention.time_maa_v", |
|
|
"model.layers.{bid}.self_attn.time_maa_v", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_R: ( |
|
|
"rwkv.blocks.{bid}.attention.time_maa_r", |
|
|
"model.layers.{bid}.self_attn.time_maa_r", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_G: ( |
|
|
"rwkv.blocks.{bid}.attention.time_maa_g", |
|
|
"model.layers.{bid}.self_attn.time_maa_g", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_W: ( |
|
|
"rwkv.blocks.{bid}.attention.time_maa_w", |
|
|
"model.layers.{bid}.self_attn.time_maa_w", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.TIME_MIX_FIRST: ( |
|
|
"rwkv.blocks.{bid}.attention.time_faaaa", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.TIME_MIX_DECAY: ( |
|
|
"rwkv.blocks.{bid}.attention.time_decay", |
|
|
"model.layers.{bid}.self_attn.time_decay", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.TIME_MIX_DECAY_W1: ( |
|
|
"rwkv.blocks.{bid}.attention.time_decay_w1", |
|
|
"model.layers.{bid}.self_attn.time_decay_w1", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.TIME_MIX_DECAY_W2: ( |
|
|
"rwkv.blocks.{bid}.attention.time_decay_w2", |
|
|
"model.layers.{bid}.self_attn.time_decay_w2", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.TIME_MIX_KEY: ( |
|
|
"rwkv.blocks.{bid}.attention.key", |
|
|
"model.layers.{bid}.self_attn.k_proj", |
|
|
"model.layers.{bid}.attention.key", |
|
|
"model.layers.{bid}.attention.k_proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.TIME_MIX_VALUE: ( |
|
|
"rwkv.blocks.{bid}.attention.value", |
|
|
"model.layers.{bid}.self_attn.v_proj", |
|
|
"model.layers.{bid}.attention.value", |
|
|
"model.layers.{bid}.attention.v_proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.TIME_MIX_RECEPTANCE: ( |
|
|
"rwkv.blocks.{bid}.attention.receptance", |
|
|
"model.layers.{bid}.self_attn.q_proj", |
|
|
"model.layers.{bid}.attention.receptance", |
|
|
"model.layers.{bid}.attention.r_proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.TIME_MIX_GATE: ( |
|
|
"rwkv.blocks.{bid}.attention.gate", |
|
|
"model.layers.{bid}.self_attn.gate", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.TIME_MIX_LN: ( |
|
|
"rwkv.blocks.{bid}.attention.ln_x", |
|
|
"model.layers.{bid}.attention.ln_x" |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.TIME_MIX_OUTPUT: ( |
|
|
"rwkv.blocks.{bid}.attention.output", |
|
|
"model.layers.{bid}.self_attn.o_proj", |
|
|
"model.layers.{bid}.attention.output", |
|
|
"model.layers.{bid}.attention.o_proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.CHANNEL_MIX_LERP_K: ( |
|
|
"rwkv.blocks.{bid}.feed_forward.time_maa_k", |
|
|
"model.layers.{bid}.feed_forward.x_k", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.CHANNEL_MIX_LERP_R: ( |
|
|
"rwkv.blocks.{bid}.feed_forward.time_maa_r", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.CHANNEL_MIX_KEY: ( |
|
|
"rwkv.blocks.{bid}.feed_forward.key", |
|
|
"model.layers.{bid}.feed_forward.key", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: ( |
|
|
"rwkv.blocks.{bid}.feed_forward.receptance", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.CHANNEL_MIX_VALUE: ( |
|
|
"rwkv.blocks.{bid}.feed_forward.value", |
|
|
"model.layers.{bid}.feed_forward.value", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ATTN_Q_A: ( |
|
|
"model.layers.{bid}.self_attn.q_a_proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ATTN_Q_B: ( |
|
|
"model.layers.{bid}.self_attn.q_b_proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ATTN_KV_A_MQA: ( |
|
|
"model.layers.{bid}.self_attn.kv_a_proj_with_mqa", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ATTN_KV_B: ( |
|
|
"model.layers.{bid}.self_attn.kv_b_proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ATTN_K_B: ( |
|
|
"model.layers.{bid}.self_attn.k_b_proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ATTN_V_B: ( |
|
|
"model.layers.{bid}.self_attn.v_b_proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ATTN_Q_A_NORM: ( |
|
|
"model.layers.{bid}.self_attn.q_a_layernorm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ATTN_KV_A_NORM: ( |
|
|
"model.layers.{bid}.self_attn.kv_a_layernorm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ATTN_SUB_NORM: ( |
|
|
"model.layers.{bid}.self_attn.inner_attn_ln", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.FFN_SUB_NORM: ( |
|
|
"model.layers.{bid}.mlp.ffn_layernorm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.DEC_ATTN_NORM: ( |
|
|
"decoder.block.{bid}.layer.0.layer_norm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.DEC_ATTN_Q: ( |
|
|
"decoder.block.{bid}.layer.0.SelfAttention.q", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.DEC_ATTN_K: ( |
|
|
"decoder.block.{bid}.layer.0.SelfAttention.k", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.DEC_ATTN_V: ( |
|
|
"decoder.block.{bid}.layer.0.SelfAttention.v", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.DEC_ATTN_OUT: ( |
|
|
"decoder.block.{bid}.layer.0.SelfAttention.o", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.DEC_ATTN_REL_B: ( |
|
|
"decoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.DEC_CROSS_ATTN_NORM: ( |
|
|
"decoder.block.{bid}.layer.1.layer_norm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.DEC_CROSS_ATTN_Q: ( |
|
|
"decoder.block.{bid}.layer.1.EncDecAttention.q", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.DEC_CROSS_ATTN_K: ( |
|
|
"decoder.block.{bid}.layer.1.EncDecAttention.k", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.DEC_CROSS_ATTN_V: ( |
|
|
"decoder.block.{bid}.layer.1.EncDecAttention.v", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.DEC_CROSS_ATTN_OUT: ( |
|
|
"decoder.block.{bid}.layer.1.EncDecAttention.o", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: ( |
|
|
"decoder.block.{bid}.layer.1.EncDecAttention.relative_attention_bias", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.DEC_FFN_NORM: ( |
|
|
"decoder.block.{bid}.layer.2.layer_norm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.DEC_FFN_GATE: ( |
|
|
"decoder.block.{bid}.layer.2.DenseReluDense.wi_0", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.DEC_FFN_UP: ( |
|
|
"decoder.block.{bid}.layer.2.DenseReluDense.wi", |
|
|
"decoder.block.{bid}.layer.2.DenseReluDense.wi_1", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.DEC_FFN_DOWN: ( |
|
|
"decoder.block.{bid}.layer.2.DenseReluDense.wo", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.DEC_OUTPUT_NORM: ( |
|
|
"decoder.final_layer_norm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ENC_ATTN_NORM: ( |
|
|
"encoder.block.{bid}.layer.0.layer_norm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ENC_ATTN_Q: ( |
|
|
"encoder.block.{bid}.layer.0.SelfAttention.q", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ENC_ATTN_K: ( |
|
|
"encoder.block.{bid}.layer.0.SelfAttention.k", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ENC_ATTN_V: ( |
|
|
"encoder.block.{bid}.layer.0.SelfAttention.v", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ENC_ATTN_OUT: ( |
|
|
"encoder.block.{bid}.layer.0.SelfAttention.o", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ENC_ATTN_REL_B: ( |
|
|
"encoder.block.{bid}.layer.0.SelfAttention.relative_attention_bias", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ENC_FFN_NORM: ( |
|
|
"encoder.block.{bid}.layer.1.layer_norm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ENC_FFN_GATE: ( |
|
|
"encoder.block.{bid}.layer.1.DenseReluDense.wi_0", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ENC_FFN_UP: ( |
|
|
"encoder.block.{bid}.layer.1.DenseReluDense.wi", |
|
|
"encoder.block.{bid}.layer.1.DenseReluDense.wi_1", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.ENC_FFN_DOWN: ( |
|
|
"encoder.block.{bid}.layer.1.DenseReluDense.wo", |
|
|
), |
|
|
|
|
|
|
|
|
|
|
|
MODEL_TENSOR.ENC_OUTPUT_NORM: ( |
|
|
"encoder.final_layer_norm", |
|
|
"layer_norm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.CLS: ( |
|
|
"classifier", |
|
|
"classifier.dense", |
|
|
"pre_classifier", |
|
|
"dense", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.CLS_OUT: ( |
|
|
"classifier.out_proj", |
|
|
), |
|
|
|
|
|
|
|
|
MODEL_TENSOR.CONVNEXT_DW: ( |
|
|
"backbone.convnext.{bid}.dwconv", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.CONVNEXT_NORM: ( |
|
|
"backbone.convnext.{bid}.norm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.CONVNEXT_PW1: ( |
|
|
"backbone.convnext.{bid}.pwconv1", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.CONVNEXT_PW2: ( |
|
|
"backbone.convnext.{bid}.pwconv2", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.CONVNEXT_GAMMA: ( |
|
|
"backbone.convnext.{bid}.gamma", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.POSNET_CONV1: ( |
|
|
"backbone.posnet.{bid}.conv1", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.POSNET_CONV2: ( |
|
|
"backbone.posnet.{bid}.conv2", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.POSNET_NORM: ( |
|
|
"backbone.posnet.{bid}.norm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.POSNET_NORM1: ( |
|
|
"backbone.posnet.{bid}.norm1", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.POSNET_NORM2: ( |
|
|
"backbone.posnet.{bid}.norm2", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.POSNET_ATTN_NORM: ( |
|
|
"backbone.posnet.{bid}.norm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.POSNET_ATTN_Q: ( |
|
|
"backbone.posnet.{bid}.q", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.POSNET_ATTN_K: ( |
|
|
"backbone.posnet.{bid}.k", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.POSNET_ATTN_V: ( |
|
|
"backbone.posnet.{bid}.v", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.POSNET_ATTN_OUT: ( |
|
|
"backbone.posnet.{bid}.proj_out", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.SHORTCONV_CONV: ( |
|
|
"model.layers.{bid}.conv.conv", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.SHORTCONV_INPROJ: ( |
|
|
"model.layers.{bid}.conv.in_proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.SHORTCONV_OUTPROJ: ( |
|
|
"model.layers.{bid}.conv.out_proj", |
|
|
), |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MODEL_TENSOR.V_MMPROJ: ( |
|
|
"multi_modal_projector.linear_{bid}", |
|
|
"visual.merger.mlp.{bid}", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_MMPROJ_FC: ( |
|
|
"model.connector.modality_projection.proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_MMPROJ_MLP: ( |
|
|
"model.mm_projector.mlp.mlp.{bid}", |
|
|
"vision_model.vision_adapter.mlp.fc{bid}", |
|
|
"mlp1.{bid}", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_MMPROJ_PEG: ( |
|
|
"model.mm_projector.peg.peg.{bid}", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_ENC_EMBD_CLS: ( |
|
|
"vision_tower.vision_model.embeddings.class_embedding", |
|
|
"model.vision_tower.embeddings.cls_token", |
|
|
"vision_model.class_embedding", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_ENC_EMBD_PATCH: ( |
|
|
"vision_tower.vision_model.embeddings.patch_embedding", |
|
|
"model.vision_tower.embeddings.patch_embeddings.projection", |
|
|
"vpm.embeddings.patch_embedding", |
|
|
"model.vision_model.embeddings.patch_embedding", |
|
|
"vision_tower.patch_conv", |
|
|
"vision_encoder.patch_conv", |
|
|
"vision_model.patch_embedding.linear", |
|
|
"visual.patch_embed.proj", |
|
|
"vision_tower.patch_embed.proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_ENC_EMBD_POS: ( |
|
|
"vision_tower.vision_model.embeddings.position_embedding", |
|
|
"model.vision_tower.embeddings.position_embeddings", |
|
|
"vpm.embeddings.position_embedding", |
|
|
"model.vision_model.embeddings.position_embedding", |
|
|
"vision_model.positional_embedding_vlm", |
|
|
"vision_tower.patch_embed.pos_emb", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_ENC_ATTN_Q: ( |
|
|
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj", |
|
|
"model.vision_tower.encoder.layer.{bid}.attention.q_proj", |
|
|
"vpm.encoder.layers.{bid}.self_attn.q_proj", |
|
|
"model.vision_model.encoder.layers.{bid}.self_attn.q_proj", |
|
|
"vision_model.model.layers.{bid}.self_attn.q_proj", |
|
|
"vision_tower.transformer.layers.{bid}.attention.q_proj", |
|
|
"vision_encoder.transformer.layers.{bid}.attention.wq", |
|
|
"visual.blocks.{bid}.attn.q", |
|
|
"vision_tower.encoder.blocks.{bid}.wq", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_ENC_ATTN_Q_NORM: ( |
|
|
"vision_tower.vision_model.encoder.layers.{bid}.attn.q_norm", |
|
|
"model.vision_tower.encoder.layer.{bid}.attention.q_norm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_ENC_ATTN_K: ( |
|
|
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj", |
|
|
"model.vision_tower.encoder.layer.{bid}.attention.k_proj", |
|
|
"vpm.encoder.layers.{bid}.self_attn.k_proj", |
|
|
"model.vision_model.encoder.layers.{bid}.self_attn.k_proj", |
|
|
"vision_model.model.layers.{bid}.self_attn.k_proj", |
|
|
"vision_tower.transformer.layers.{bid}.attention.k_proj", |
|
|
"vision_encoder.transformer.layers.{bid}.attention.wk", |
|
|
"visual.blocks.{bid}.attn.k", |
|
|
"vision_tower.encoder.blocks.{bid}.wk", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_ENC_ATTN_K_NORM: ( |
|
|
"vision_tower.vision_model.encoder.layers.{bid}.attn.k_norm", |
|
|
"model.vision_tower.encoder.layer.{bid}.attention.k_norm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_ENC_ATTN_V: ( |
|
|
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj", |
|
|
"model.vision_tower.encoder.layer.{bid}.attention.v_proj", |
|
|
"vpm.encoder.layers.{bid}.self_attn.v_proj", |
|
|
"model.vision_model.encoder.layers.{bid}.self_attn.v_proj", |
|
|
"vision_model.model.layers.{bid}.self_attn.v_proj", |
|
|
"vision_tower.transformer.layers.{bid}.attention.v_proj", |
|
|
"vision_encoder.transformer.layers.{bid}.attention.wv", |
|
|
"visual.blocks.{bid}.attn.v", |
|
|
"vision_tower.encoder.blocks.{bid}.wv", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_ENC_INPUT_NORM: ( |
|
|
"vision_tower.vision_model.encoder.layers.{bid}.layer_norm1", |
|
|
"vision_tower.vision_model.encoder.layers.{bid}.norm1", |
|
|
"model.vision_tower.encoder.layer.{bid}.layernorm_before", |
|
|
"vpm.encoder.layers.{bid}.layer_norm1", |
|
|
"model.vision_model.encoder.layers.{bid}.layer_norm1", |
|
|
"vision_tower.transformer.layers.{bid}.attention_norm", |
|
|
"vision_encoder.transformer.layers.{bid}.attention_norm", |
|
|
"vision_model.model.layers.{bid}.input_layernorm", |
|
|
"visual.blocks.{bid}.norm1", |
|
|
"vision_tower.encoder.blocks.{bid}.norm0", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_ENC_ATTN_O: ( |
|
|
"vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj", |
|
|
"vision_tower.vision_model.encoder.layers.{bid}.attn.proj", |
|
|
"model.vision_tower.encoder.layer.{bid}.attention.projection_layer", |
|
|
"vpm.encoder.layers.{bid}.self_attn.out_proj", |
|
|
"model.vision_model.encoder.layers.{bid}.self_attn.out_proj", |
|
|
"vision_model.model.layers.{bid}.self_attn.o_proj", |
|
|
"vision_tower.transformer.layers.{bid}.attention.o_proj", |
|
|
"vision_encoder.transformer.layers.{bid}.attention.wo", |
|
|
"visual.blocks.{bid}.attn.proj", |
|
|
"vision_tower.encoder.blocks.{bid}.wo", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_ENC_POST_ATTN_NORM: ( |
|
|
"vision_tower.vision_model.encoder.layers.{bid}.layer_norm2", |
|
|
"vision_tower.vision_model.encoder.layers.{bid}.norm2", |
|
|
"model.vision_tower.encoder.layer.{bid}.layernorm_after", |
|
|
"vpm.encoder.layers.{bid}.layer_norm2", |
|
|
"model.vision_model.encoder.layers.{bid}.layer_norm2", |
|
|
"vision_model.model.layers.{bid}.post_attention_layernorm", |
|
|
"vision_tower.transformer.layers.{bid}.ffn_norm", |
|
|
"vision_encoder.transformer.layers.{bid}.ffn_norm", |
|
|
"visual.blocks.{bid}.norm2", |
|
|
"vision_tower.encoder.blocks.{bid}.norm1", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_ENC_FFN_UP: ( |
|
|
"vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1", |
|
|
"model.vision_tower.encoder.layer.{bid}.mlp.fc1", |
|
|
"vpm.encoder.layers.{bid}.mlp.fc1", |
|
|
"model.vision_model.encoder.layers.{bid}.mlp.fc1", |
|
|
"vision_tower.transformer.layers.{bid}.feed_forward.up_proj", |
|
|
"vision_encoder.transformer.layers.{bid}.feed_forward.w3", |
|
|
"vision_model.model.layers.{bid}.mlp.fc1", |
|
|
"visual.blocks.{bid}.mlp.fc1", |
|
|
"visual.blocks.{bid}.mlp.up_proj", |
|
|
"vision_tower.encoder.blocks.{bid}.mlp.fc0", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_ENC_FFN_GATE: ( |
|
|
"vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", |
|
|
"vision_encoder.transformer.layers.{bid}.feed_forward.w1", |
|
|
"visual.blocks.{bid}.mlp.gate_proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_ENC_FFN_DOWN: ( |
|
|
"vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2", |
|
|
"model.vision_tower.encoder.layer.{bid}.mlp.fc2", |
|
|
"vpm.encoder.layers.{bid}.mlp.fc2", |
|
|
"model.vision_model.encoder.layers.{bid}.mlp.fc2", |
|
|
"vision_tower.transformer.layers.{bid}.feed_forward.down_proj", |
|
|
"vision_encoder.transformer.layers.{bid}.feed_forward.w2", |
|
|
"vision_model.model.layers.{bid}.mlp.fc2", |
|
|
"visual.blocks.{bid}.mlp.fc2", |
|
|
"visual.blocks.{bid}.mlp.down_proj", |
|
|
"vision_tower.encoder.blocks.{bid}.mlp.fc1", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_LAYER_SCALE_1: ( |
|
|
"vision_tower.vision_model.encoder.layers.{bid}.ls1", |
|
|
"model.vision_tower.encoder.layer.{bid}.lambda_1", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_LAYER_SCALE_2: ( |
|
|
"vision_tower.vision_model.encoder.layers.{bid}.ls2", |
|
|
"model.vision_tower.encoder.layer.{bid}.lambda_2", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_PRE_NORM: ( |
|
|
"vision_tower.vision_model.pre_layrnorm", |
|
|
"vision_tower.ln_pre", |
|
|
"vision_encoder.ln_pre", |
|
|
"vision_model.layernorm_pre", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_POST_NORM: ( |
|
|
"vision_tower.vision_model.post_layernorm", |
|
|
"model.vision_model.post_layernorm", |
|
|
"vision_model.layernorm_post", |
|
|
"visual.merger.ln_q", |
|
|
"vision_tower.encoder.final_layernorm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_MM_INP_PROJ: ( |
|
|
"multi_modal_projector.mm_input_projection", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_MM_INP_NORM: ( |
|
|
"multi_modal_projector.norm", |
|
|
"multi_modal_projector.layer_norm", |
|
|
"multi_modal_projector.pre_norm", |
|
|
"pre_mm_projector_norm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_MM_SOFT_EMB_NORM: ( |
|
|
"multi_modal_projector.mm_soft_emb_norm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_RESMPL_POS_EMBD_K: ( |
|
|
"resampler.pos_embed_k", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_RESMPL_ATTN_Q: ( |
|
|
"resampler.attn.in_proj_q", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_RESMPL_ATTN_K: ( |
|
|
"resampler.attn.in_proj_k", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_RESMPL_ATTN_V: ( |
|
|
"resampler.attn.in_proj_v", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_RESMPL_ATTN_OUT: ( |
|
|
"resampler.attn.out_proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_RESMPL_KV: ( |
|
|
"resampler.kv_proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_RESMPL_POST_NORM: ( |
|
|
"resampler.ln_post", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_RESMPL_KV_NORM: ( |
|
|
"resampler.ln_kv", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_RESMPL_Q_NORM: ( |
|
|
"resampler.ln_q", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_RESMPL_PROJ: ( |
|
|
"resampler.proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_RESMPL_QUERY: ( |
|
|
"resampler.query", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: ( |
|
|
"v.token_embd.img_break", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.V_MM_PATCH_MERGER: ( |
|
|
"multi_modal_projector.patch_merger.merging_layer", |
|
|
"patch_merger.merging_layer", |
|
|
), |
|
|
|
|
|
|
|
|
|
|
|
MODEL_TENSOR.A_ENC_EMBD_POS: ( |
|
|
"audio_tower.embed_positions", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.A_ENC_CONV1D: ( |
|
|
"audio_tower.conv{bid}", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.A_PRE_NORM: (), |
|
|
|
|
|
MODEL_TENSOR.A_POST_NORM: ( |
|
|
"audio_tower.layer_norm", |
|
|
"audio_tower.ln_post", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.A_ENC_ATTN_Q: ( |
|
|
"audio_tower.layers.{bid}.self_attn.q_proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.A_ENC_ATTN_K: ( |
|
|
"audio_tower.layers.{bid}.self_attn.k_proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.A_ENC_ATTN_V: ( |
|
|
"audio_tower.layers.{bid}.self_attn.v_proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.A_ENC_INPUT_NORM: ( |
|
|
"audio_tower.layers.{bid}.self_attn_layer_norm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.A_ENC_OUTPUT: ( |
|
|
"audio_tower.layers.{bid}.self_attn.out_proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.A_ENC_OUTPUT_NORM: ( |
|
|
"audio_tower.layers.{bid}.final_layer_norm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.A_ENC_FFN_UP: ( |
|
|
"audio_tower.layers.{bid}.fc1", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.A_ENC_FFN_GATE: (), |
|
|
|
|
|
MODEL_TENSOR.A_ENC_FFN_DOWN: ( |
|
|
"audio_tower.layers.{bid}.fc2", |
|
|
), |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
MODEL_TENSOR.A_MMPROJ: ( |
|
|
"audio.multi_modal_projector.linear_{bid}", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.A_MMPROJ_FC: ( |
|
|
"audio.multi_modal_projector.linear", |
|
|
"audio_tower.proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.A_MM_NORM_PRE: ( |
|
|
"audio.multi_modal_projector.ln_pre", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.A_MM_NORM_MID: ( |
|
|
"audio.multi_modal_projector.ln_mid", |
|
|
), |
|
|
|
|
|
|
|
|
MODEL_TENSOR.NEXTN_EH_PROJ: ( |
|
|
"model.layers.{bid}.eh_proj", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.NEXTN_EMBED_TOKENS: ( |
|
|
"model.layers.{bid}.embed_tokens", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.NEXTN_ENORM: ( |
|
|
"model.layers.{bid}.enorm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.NEXTN_HNORM: ( |
|
|
"model.layers.{bid}.hnorm", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: ( |
|
|
"model.layers.{bid}.shared_head.head", |
|
|
), |
|
|
|
|
|
MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: ( |
|
|
"model.layers.{bid}.shared_head.norm", |
|
|
), |
|
|
} |
|
|
|
|
|
|
|
|
# Architecture-specific per-block source names. __init__ layers the entry for
# the requested architecture over block_mappings_cfg, so a tensor kind listed
# here replaces the generic tuple of candidate names for that architecture.
# "{bid}" is substituted with the block index when the table is built.
arch_block_mappings_cfg: dict[MODEL_ARCH, dict[MODEL_TENSOR, tuple[str, ...]]] = {
    MODEL_ARCH.ARCTIC: {
        MODEL_TENSOR.FFN_NORM: ("model.layers.{bid}.residual_layernorm",),
        MODEL_TENSOR.FFN_NORM_EXP: ("model.layers.{bid}.post_attention_layernorm",),
    },
}

# Lookup table filled in by __init__: each known name (a source-side name or
# the TENSOR_NAMES entry itself) maps to (tensor kind, TENSOR_NAMES entry).
mapping: dict[str, tuple[MODEL_TENSOR, str]]
|
|
|
|
|
def __init__(self, arch: MODEL_ARCH, n_blocks: int):
    """Build the name lookup table for one architecture.

    Args:
        arch: Architecture whose tensor set (``MODEL_TENSORS[arch]``) decides
            which entries of the configuration tables are included.
        n_blocks: Number of blocks; per-block names are generated for every
            block index in ``range(n_blocks)``.

    After construction ``self.mapping`` maps every accepted source name, and
    every ``TENSOR_NAMES`` entry itself, to ``(tensor kind, mapped name)``.
    """
    self.mapping = {}
    supported = MODEL_TENSORS[arch]

    # Tensors that exist once per model (no block index in the name).
    for tensor, keys in self.mappings_cfg.items():
        if tensor not in supported:
            continue
        tensor_name = TENSOR_NAMES[tensor]
        self.mapping[tensor_name] = (tensor, tensor_name)
        for key in keys:
            self.mapping[key] = (tensor, tensor_name)

    # Apply the architecture overrides to a private copy. Updating
    # self.block_mappings_cfg in place would modify the dict stored on the
    # class, so the overrides of one architecture would leak into every
    # TensorNameMap created afterwards for any other architecture.
    block_mappings = dict(self.block_mappings_cfg)
    block_mappings.update(self.arch_block_mappings_cfg.get(arch, {}))

    # Per-block tensors: expand the "{bid}" placeholder for each block index.
    for bid in range(n_blocks):
        for tensor, keys in block_mappings.items():
            if tensor not in supported:
                continue
            tensor_name = TENSOR_NAMES[tensor].format(bid=bid)
            self.mapping[tensor_name] = (tensor, tensor_name)
            for key in keys:
                self.mapping[key.format(bid=bid)] = (tensor, tensor_name)
|
|
|
|
|
def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None: |
|
|
result = self.mapping.get(key) |
|
|
if result is not None: |
|
|
return result |
|
|
for suffix in try_suffixes: |
|
|
if key.endswith(suffix): |
|
|
result = self.mapping.get(key[:-len(suffix)]) |
|
|
if result is not None: |
|
|
return result[0], result[1] + suffix |
|
|
return None |
|
|
|
|
|
def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None: |
|
|
result = self.get_type_and_name(key, try_suffixes = try_suffixes) |
|
|
if result is None: |
|
|
return None |
|
|
return result[1] |
|
|
|
|
|
def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None: |
|
|
result = self.get_type_and_name(key, try_suffixes = try_suffixes) |
|
|
if result is None: |
|
|
return None |
|
|
return result[0] |
|
|
|
|
|
def __getitem__(self, key: str) -> str: |
|
|
try: |
|
|
return self.mapping[key][1] |
|
|
except KeyError: |
|
|
raise KeyError(key) |
|
|
|
|
|
def __contains__(self, key: str) -> bool: |
|
|
return key in self.mapping |
|
|
|
|
|
def __repr__(self) -> str: |
|
|
return repr(self.mapping) |
|
|
|
|
|
|
|
|
def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> TensorNameMap:
    """Create a ``TensorNameMap`` for ``arch`` covering ``n_blocks`` blocks."""
    name_map = TensorNameMap(arch=arch, n_blocks=n_blocks)
    return name_map
|
|
|