PeftModelForCausalLM( (base_model): LoraModel( (model): UnifiedForCausalLM( (model): UnifiedModel( (embed_tokens): Embedding(32028, 4096) (layers): ModuleList( (0-31): 32 x LlamaDecoderLayer( (self_attn): LlamaAttention( (q_proj): lora.Linear( (base_layer): Linear(in_features=4096, out_features=4096, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=4096, out_features=32, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=32, out_features=4096, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() (lora_magnitude_vector): ModuleDict() ) (k_proj): lora.Linear( (base_layer): Linear(in_features=4096, out_features=4096, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=4096, out_features=32, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=32, out_features=4096, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() (lora_magnitude_vector): ModuleDict() ) (v_proj): lora.Linear( (base_layer): Linear(in_features=4096, out_features=4096, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=4096, out_features=32, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=32, out_features=4096, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() (lora_magnitude_vector): ModuleDict() ) (o_proj): lora.Linear( (base_layer): Linear(in_features=4096, out_features=4096, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=4096, out_features=32, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=32, out_features=4096, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() (lora_magnitude_vector): ModuleDict() ) (rotary_emb): LlamaRotaryEmbedding() ) (mlp): LlamaMLP( (gate_proj): lora.Linear( (base_layer): Linear(in_features=4096, out_features=11008, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=4096, out_features=32, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=32, out_features=11008, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() (lora_magnitude_vector): ModuleDict() ) (up_proj): lora.Linear( (base_layer): Linear(in_features=4096, out_features=11008, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=4096, out_features=32, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=32, out_features=11008, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() (lora_magnitude_vector): ModuleDict() ) (down_proj): lora.Linear( (base_layer): Linear(in_features=11008, out_features=4096, bias=False) (lora_dropout): ModuleDict( (default): Dropout(p=0.05, inplace=False) ) (lora_A): ModuleDict( (default): Linear(in_features=11008, out_features=32, bias=False) ) (lora_B): ModuleDict( (default): Linear(in_features=32, out_features=4096, bias=False) ) (lora_embedding_A): ParameterDict() (lora_embedding_B): ParameterDict() (lora_magnitude_vector): ModuleDict() ) (act_fn): SiLU() ) (input_layernorm): LlamaRMSNorm() (post_attention_layernorm): LlamaRMSNorm() ) ) (norm): LlamaRMSNorm() (visual_encoder): VisualEncoder( (vision_tower): CLIPVisionModel( (vision_model): CLIPVisionTransformer( (embeddings): CLIPVisionEmbeddings( (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False) (position_embedding): Embedding(257, 1024) ) (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (encoder): CLIPEncoder( (layers): ModuleList( (0-23): 24 x CLIPEncoderLayer( (self_attn): CLIPAttention( (k_proj): Linear(in_features=1024, out_features=1024, bias=True) (v_proj): Linear(in_features=1024, out_features=1024, bias=True) (q_proj): Linear(in_features=1024, out_features=1024, bias=True) (out_proj): Linear(in_features=1024, out_features=1024, bias=True) ) (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (mlp): CLIPMLP( (activation_fn): QuickGELUActivation() (fc1): Linear(in_features=1024, out_features=4096, bias=True) (fc2): Linear(in_features=4096, out_features=1024, bias=True) ) (layer_norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) ) ) ) (post_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) ) ) ) (vl_projector): VLProjector( (visual_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) (visual_Qformer): BertLMHeadModel( (bert): BertModel( (embeddings): BertEmbeddings( (word_embeddings): Embedding(30522, 768, padding_idx=0) (position_embeddings): Embedding(512, 768) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (encoder): BertEncoder( (layer): ModuleList( (0-1): 2 x BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (crossattention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=1024, out_features=768, bias=True) (value): Linear(in_features=1024, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (intermediate_query): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output_query): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) ) (cls): BertOnlyMLMHead( (predictions): BertLMPredictionHead( (transform): BertPredictionHeadTransform( (dense): Linear(in_features=768, out_features=768, bias=True) (transform_act_fn): GELUActivation() (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) ) (decoder): Linear(in_features=768, out_features=30522, bias=True) ) ) ) (visual_proj): Sequential( (0): Linear(in_features=768, out_features=4096, bias=True) (1): GELU(approximate='none') (2): Linear(in_features=4096, out_features=4096, bias=True) ) ) (audio_encoder): AudioEncoder( (audio_encoder): BEATs( (post_extract_proj): Linear(in_features=512, out_features=768, bias=True) (patch_embedding): Conv2d(1, 512, kernel_size=(16, 16), stride=(16, 16), bias=False) (dropout_input): Dropout(p=0.0, inplace=False) (encoder): TransformerEncoder( (pos_conv): Sequential( (0): Conv1d(768, 768, kernel_size=(128,), stride=(1,), padding=(64,), groups=16) (1): SamePad() (2): GELU(approximate='none') ) (layers): ModuleList( (0): TransformerSentenceEncoderLayer( (self_attn): MultiheadAttention( (dropout_module): Dropout(p=0.0, inplace=False) (relative_attention_bias): Embedding(320, 12) (k_proj): Linear(in_features=768, out_features=768, bias=True) (v_proj): Linear(in_features=768, out_features=768, bias=True) (q_proj): Linear(in_features=768, out_features=768, bias=True) (out_proj): Linear(in_features=768, out_features=768, bias=True) (grep_linear): Linear(in_features=64, out_features=8, bias=True) ) (dropout1): Dropout(p=0.0, inplace=False) (dropout2): Dropout(p=0.0, inplace=False) (dropout3): Dropout(p=0.0, inplace=False) (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (fc1): Linear(in_features=768, out_features=3072, bias=True) (fc2): Linear(in_features=3072, out_features=768, bias=True) (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) ) (1-11): 11 x TransformerSentenceEncoderLayer( (self_attn): MultiheadAttention( (dropout_module): Dropout(p=0.0, inplace=False) (k_proj): Linear(in_features=768, out_features=768, bias=True) (v_proj): Linear(in_features=768, out_features=768, bias=True) (q_proj): Linear(in_features=768, out_features=768, bias=True) (out_proj): Linear(in_features=768, out_features=768, bias=True) (grep_linear): Linear(in_features=64, out_features=8, bias=True) (relative_attention_bias): Embedding(320, 12) ) (dropout1): Dropout(p=0.0, inplace=False) (dropout2): Dropout(p=0.0, inplace=False) (dropout3): Dropout(p=0.0, inplace=False) (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (fc1): Linear(in_features=768, out_features=3072, bias=True) (fc2): Linear(in_features=3072, out_features=768, bias=True) (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) ) ) (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) ) (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True) (predictor_dropout): Dropout(p=0.0, inplace=False) (predictor): Linear(in_features=768, out_features=527, bias=True) ) ) (al_projector): ALProjector( (audio_ln): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (audio_Qformer): BertLMHeadModel( (bert): BertModel( (embeddings): BertEmbeddings( (word_embeddings): Embedding(30522, 768, padding_idx=0) (position_embeddings): Embedding(512, 768) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (encoder): BertEncoder( (layer): ModuleList( (0-1): 2 x BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (crossattention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (intermediate_query): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output_query): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) ) (cls): BertOnlyMLMHead( (predictions): BertLMPredictionHead( (transform): BertPredictionHeadTransform( (dense): Linear(in_features=768, out_features=768, bias=True) (transform_act_fn): GELUActivation() (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) ) (decoder): Linear(in_features=768, out_features=30522, bias=True) ) ) ) (audio_proj): Sequential( (0): Linear(in_features=768, out_features=4096, bias=True) (1): GELU(approximate='none') (2): Linear(in_features=4096, out_features=4096, bias=True) ) ) ) (lm_head): Linear(in_features=4096, out_features=32028, bias=False) ) ) )