diff --git "a/instruct.ipynb" "b/instruct.ipynb" --- "a/instruct.ipynb" +++ "b/instruct.ipynb" @@ -210,7 +210,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 15, "id": "1efffd3f-aece-4858-b615-8fb1f2997068", "metadata": { "scrolled": true @@ -219,7 +219,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f67282ac10514645b9ffcdb9f797cf22", + "model_id": "b39d97f2fb434903b0521f5dea2fd37c", "version_major": 2, "version_minor": 0 }, @@ -233,7 +233,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "0ff3d2de672641c8b59a0518d2987e4f", + "model_id": "da3388e8f61e4aee98971ce520785c0c", "version_major": 2, "version_minor": 0 }, @@ -247,7 +247,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "48453247d4564242b8a5c216fc3d7fe7", + "model_id": "db11101162ff455baed085d8c25a2c77", "version_major": 2, "version_minor": 0 }, @@ -281,7 +281,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 16, "id": "86163110-f084-429e-bc70-5b281a679d1c", "metadata": { "colab": { @@ -325,7 +325,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "932c25f63b9242f0b2bfb42004a8362b", + "model_id": "591a95a8c90143a39a5616131c605ed3", "version_major": 2, "version_minor": 0 }, @@ -387,2180 +387,25 @@ }, { "cell_type": "code", - "execution_count": 7, - "id": "ffbbd54b-e579-44ef-9652-cd8496b2fd4d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "PeftModelForCausalLM(\n", - " (base_model): LoraModel(\n", - " (model): MllamaForConditionalGeneration(\n", - " (vision_model): MllamaVisionModel(\n", - " (patch_embedding): Conv2d(3, 1280, kernel_size=(14, 14), stride=(14, 14), padding=valid, bias=False)\n", - " (gated_positional_embedding): MllamaPrecomputedPositionEmbedding(\n", - " (tile_embedding): Embedding(9, 8197120)\n", - " )\n", - " (pre_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(\n", - " (embedding): Embedding(9, 5120)\n", - " )\n", - " (post_tile_positional_embedding): MllamaPrecomputedAspectRatioEmbedding(\n", - " (embedding): Embedding(9, 5120)\n", - " )\n", - " (layernorm_pre): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n", - " (layernorm_post): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n", - " (transformer): MllamaVisionEncoder(\n", - " (layers): ModuleList(\n", - " (0-31): 32 x MllamaVisionEncoderLayer(\n", - " (self_attn): MllamaVisionSdpaAttention(\n", - " (q_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=1280, out_features=1280, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=1280, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1280, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (k_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=1280, out_features=1280, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=1280, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1280, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " 
(lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (v_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=1280, out_features=1280, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=1280, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1280, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (o_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=1280, out_features=1280, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=1280, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1280, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " )\n", - " (mlp): MllamaVisionMLP(\n", - " (activation_fn): GELUActivation()\n", - " (fc1): Linear4bit(in_features=1280, out_features=5120, bias=True)\n", - " (fc2): Linear4bit(in_features=5120, out_features=1280, bias=True)\n", - " )\n", - " (input_layernorm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n", - " (post_attention_layernorm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n", - " )\n", - " )\n", - " )\n", - " (global_transformer): MllamaVisionEncoder(\n", - " (layers): ModuleList(\n", - " (0-7): 8 x MllamaVisionEncoderLayer(\n", - " (self_attn): MllamaVisionSdpaAttention(\n", - " (q_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=1280, out_features=1280, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=1280, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1280, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (k_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=1280, out_features=1280, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=1280, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1280, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (v_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=1280, out_features=1280, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=1280, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1280, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): 
ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (o_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=1280, out_features=1280, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=1280, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1280, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " )\n", - " (mlp): MllamaVisionMLP(\n", - " (activation_fn): GELUActivation()\n", - " (fc1): Linear4bit(in_features=1280, out_features=5120, bias=True)\n", - " (fc2): Linear4bit(in_features=5120, out_features=1280, bias=True)\n", - " )\n", - " (input_layernorm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n", - " (post_attention_layernorm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n", - " )\n", - " )\n", - " )\n", - " )\n", - " (language_model): MllamaForCausalLM(\n", - " (model): MllamaTextModel(\n", - " (embed_tokens): Embedding(128264, 4096, padding_idx=128004)\n", - " (layers): ModuleList(\n", - " (0-2): 3 x MllamaSelfAttentionDecoderLayer(\n", - " (self_attn): MllamaTextSelfSdpaAttention(\n", - " (q_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (k_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (v_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (o_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, 
bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " )\n", - " (mlp): MllamaTextMLP(\n", - " (gate_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (up_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (down_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=14336, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (act_fn): SiLU()\n", - " )\n", - " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " )\n", - " (3): MllamaCrossAttentionDecoderLayer(\n", - " (cross_attn): MllamaTextCrossSdpaAttention(\n", - " (q_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (k_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (v_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): 
ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (o_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (q_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", - " (k_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", - " )\n", - " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " (mlp): MllamaTextMLP(\n", - " (gate_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (up_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (down_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=14336, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (act_fn): SiLU()\n", - " )\n", - " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " )\n", - " (4-7): 4 x MllamaSelfAttentionDecoderLayer(\n", - " (self_attn): MllamaTextSelfSdpaAttention(\n", - " (q_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): 
ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (k_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (v_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (o_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " )\n", - " (mlp): MllamaTextMLP(\n", - " (gate_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (up_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (down_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=14336, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " 
(default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (act_fn): SiLU()\n", - " )\n", - " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " )\n", - " (8): MllamaCrossAttentionDecoderLayer(\n", - " (cross_attn): MllamaTextCrossSdpaAttention(\n", - " (q_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (k_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (v_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (o_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (q_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", - " (k_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", - " )\n", - " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " (mlp): MllamaTextMLP(\n", - " (gate_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): 
ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (up_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (down_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=14336, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (act_fn): SiLU()\n", - " )\n", - " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " )\n", - " (9-12): 4 x MllamaSelfAttentionDecoderLayer(\n", - " (self_attn): MllamaTextSelfSdpaAttention(\n", - " (q_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (k_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (v_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (o_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): 
ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " )\n", - " (mlp): MllamaTextMLP(\n", - " (gate_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (up_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (down_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=14336, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (act_fn): SiLU()\n", - " )\n", - " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " )\n", - " (13): MllamaCrossAttentionDecoderLayer(\n", - " (cross_attn): MllamaTextCrossSdpaAttention(\n", - " (q_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (k_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (v_proj): lora.Linear4bit(\n", - " (base_layer): 
Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (o_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (q_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", - " (k_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", - " )\n", - " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " (mlp): MllamaTextMLP(\n", - " (gate_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (up_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (down_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=14336, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (act_fn): SiLU()\n", - " )\n", - " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " )\n", - " (14-17): 4 x MllamaSelfAttentionDecoderLayer(\n", - " (self_attn): MllamaTextSelfSdpaAttention(\n", - " (q_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): 
Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (k_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (v_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (o_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " )\n", - " (mlp): MllamaTextMLP(\n", - " (gate_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (up_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (down_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): 
Linear(in_features=14336, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (act_fn): SiLU()\n", - " )\n", - " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " )\n", - " (18): MllamaCrossAttentionDecoderLayer(\n", - " (cross_attn): MllamaTextCrossSdpaAttention(\n", - " (q_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (k_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (v_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (o_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (q_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", - " (k_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", - " )\n", - " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " (mlp): MllamaTextMLP(\n", - " (gate_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, 
out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (up_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (down_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=14336, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (act_fn): SiLU()\n", - " )\n", - " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " )\n", - " (19-22): 4 x MllamaSelfAttentionDecoderLayer(\n", - " (self_attn): MllamaTextSelfSdpaAttention(\n", - " (q_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (k_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (v_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (o_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): 
ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " )\n", - " (mlp): MllamaTextMLP(\n", - " (gate_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (up_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (down_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=14336, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (act_fn): SiLU()\n", - " )\n", - " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " )\n", - " (23): MllamaCrossAttentionDecoderLayer(\n", - " (cross_attn): MllamaTextCrossSdpaAttention(\n", - " (q_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (k_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " 
(lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (v_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (o_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (q_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", - " (k_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", - " )\n", - " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " (mlp): MllamaTextMLP(\n", - " (gate_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (up_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (down_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=14336, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (act_fn): SiLU()\n", - " )\n", - " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " )\n", - " (24-27): 4 x MllamaSelfAttentionDecoderLayer(\n", - " (self_attn): MllamaTextSelfSdpaAttention(\n", - " (q_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): 
ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (k_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (v_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (o_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " )\n", - " (mlp): MllamaTextMLP(\n", - " (gate_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (up_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (down_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " 
(default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=14336, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (act_fn): SiLU()\n", - " )\n", - " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " )\n", - " (28): MllamaCrossAttentionDecoderLayer(\n", - " (cross_attn): MllamaTextCrossSdpaAttention(\n", - " (q_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (k_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (v_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (o_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (q_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", - " (k_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", - " )\n", - " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " (mlp): MllamaTextMLP(\n", - " (gate_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, 
bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (up_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (down_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=14336, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (act_fn): SiLU()\n", - " )\n", - " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " )\n", - " (29-32): 4 x MllamaSelfAttentionDecoderLayer(\n", - " (self_attn): MllamaTextSelfSdpaAttention(\n", - " (q_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (k_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (v_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (o_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): 
ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " )\n", - " (mlp): MllamaTextMLP(\n", - " (gate_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (up_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (down_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=14336, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (act_fn): SiLU()\n", - " )\n", - " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " )\n", - " (33): MllamaCrossAttentionDecoderLayer(\n", - " (cross_attn): MllamaTextCrossSdpaAttention(\n", - " (q_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (k_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " 
(lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (v_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (o_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (q_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", - " (k_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", - " )\n", - " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " (mlp): MllamaTextMLP(\n", - " (gate_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (up_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (down_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=14336, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (act_fn): SiLU()\n", - " )\n", - " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " )\n", - " (34-37): 4 x MllamaSelfAttentionDecoderLayer(\n", - " (self_attn): MllamaTextSelfSdpaAttention(\n", - " (q_proj): lora.Linear4bit(\n", - " (base_layer): 
Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (k_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (v_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (o_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " )\n", - " (mlp): MllamaTextMLP(\n", - " (gate_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (up_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (down_proj): lora.Linear4bit(\n", - " (base_layer): 
Linear4bit(in_features=14336, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=14336, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (act_fn): SiLU()\n", - " )\n", - " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " )\n", - " (38): MllamaCrossAttentionDecoderLayer(\n", - " (cross_attn): MllamaTextCrossSdpaAttention(\n", - " (q_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (k_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (v_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (o_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (q_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", - " (k_norm): MllamaTextRMSNorm((128,), eps=1e-05)\n", - " )\n", - " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " (mlp): MllamaTextMLP(\n", - " (gate_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", 
- " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (up_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (down_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=14336, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (act_fn): SiLU()\n", - " )\n", - " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " )\n", - " (39): MllamaSelfAttentionDecoderLayer(\n", - " (self_attn): MllamaTextSelfSdpaAttention(\n", - " (q_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (k_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (v_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=1024, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (o_proj): lora.Linear4bit(\n", - " 
(base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " )\n", - " (mlp): MllamaTextMLP(\n", - " (gate_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (up_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=4096, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=14336, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (down_proj): lora.Linear4bit(\n", - " (base_layer): Linear4bit(in_features=14336, out_features=4096, bias=False)\n", - " (lora_dropout): ModuleDict(\n", - " (default): Dropout(p=0.1, inplace=False)\n", - " )\n", - " (lora_A): ModuleDict(\n", - " (default): Linear(in_features=14336, out_features=8, bias=False)\n", - " )\n", - " (lora_B): ModuleDict(\n", - " (default): Linear(in_features=8, out_features=4096, bias=False)\n", - " )\n", - " (lora_embedding_A): ParameterDict()\n", - " (lora_embedding_B): ParameterDict()\n", - " (lora_magnitude_vector): ModuleDict()\n", - " )\n", - " (act_fn): SiLU()\n", - " )\n", - " (input_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " (post_attention_layernorm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " )\n", - " )\n", - " (norm): MllamaTextRMSNorm((4096,), eps=1e-05)\n", - " (rotary_emb): MllamaRotaryEmbedding()\n", - " )\n", - " (lm_head): Linear(in_features=4096, out_features=128256, bias=False)\n", - " )\n", - " (multi_modal_projector): Linear4bit(in_features=7680, out_features=4096, bias=True)\n", - " )\n", - " )\n", - ")" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model" - ] - }, - { - "cell_type": "code", - "execution_count": 8, + "execution_count": 17, "id": "f2c2fd34-e1e4-427d-86d0-73bf74ff0005", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'question': 'What is the main factor in a patient with metastatic bone disease?',\n", - " 'context': 'Perioperative considerations in patients with metastatic bone disease.'}" + "{'question': 'What was the goal of the study?',\n", + " 'context': 'The aim of this study was to evaluate the success of steroid (PRED) withdrawal due to replacement by mycophenolate mofetil (MMF) in orthotopic liver 
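The elided printout above pins down most of the adapter setup: every attention and MLP projection is a `lora.Linear4bit` (a 4-bit `Linear4bit` base layer plus rank-8 `lora_A`/`lora_B` matrices and `Dropout(p=0.1)`), while the vision MLP's `fc1`/`fc2` stay plain `Linear4bit`, so they were not targeted. A minimal sketch that would reproduce that structure is below; the checkpoint name, `lora_alpha`, and the exact quantization dtype are assumptions, since the printout does not show them:

```python
import torch
from transformers import BitsAndBytesConfig, MllamaForConditionalGeneration
from peft import LoraConfig, get_peft_model

# 4-bit quantization produces the Linear4bit base layers seen in the printout
# (nf4 + bfloat16 compute are assumptions; any bitsandbytes 4-bit config prints the same way)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = MllamaForConditionalGeneration.from_pretrained(
    "meta-llama/Llama-3.2-11B-Vision-Instruct",  # assumed checkpoint; dims match the 11B printout
    quantization_config=bnb_config,
    device_map="auto",
)

# r=8 and lora_dropout=0.1 match the printed lora_A/lora_B shapes and Dropout(p=0.1);
# lora_alpha=16 is an assumption. fc1/fc2 are left out, matching the printout.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)  # yields the PeftModelForCausalLM wrapper shown above
model.print_trainable_parameters()          # only the rank-8 adapters remain trainable
```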
- {
- "cell_type": "code",
- "execution_count": 8,
+ "execution_count": 17,
 "id": "f2c2fd34-e1e4-427d-86d0-73bf74ff0005",
 "metadata": {},
 "outputs": [
 {
 "data": {
 "text/plain": [
- "{'question': 'What is the main factor in a patient with metastatic bone disease?',\n",
- " 'context': 'Perioperative considerations in patients with metastatic bone disease.'}"
+ "{'question': 'What was the goal of the study?',\n",
+ " 'context': 'The aim of this study was to evaluate the success of steroid (PRED) withdrawal due to replacement by mycophenolate mofetil (MMF) in orthotopic liver transplant (OLT) recipients with autoimmune hepatitis (AIH). Thirty patients with AIH > 12 months after OLT randomized to receive either PRED and tacrolimus (TAC) or MMF and TAC were followed for 24 months. Withdrawal of steroids showed no difference regarding graft and patient survival. Also we demonstrated significantly lower glucose levels with lower HbA1c and a reduced need for insulin as well as a significantly lower serum cholesterol in the MMF group. Patients without steroids showed a lower incidence of osteopenia. Maintenance therapy in OLT patients with AIH may be performed safely using MMF instead of prednisone.'}"
 ]
 },
- "execution_count": 8,
+ "execution_count": 17,
 "metadata": {},
 "output_type": "execute_result"
 }
 ],
 "source": [
 "eval_dataset\n",
- "eval_dataset[2310]"
+ "eval_dataset[24]"
 ]
 },
 {
@@ -2575,7 +420,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 9,
+ "execution_count": 53,
 "id": "29d96aea-445d-482d-b7dc-861635a5389c",
 "metadata": {
 "executionInfo": {
 ...
 },
 "id": "X6TWyPHaAMtH"
 },
- "outputs": [
- {
- "ename": "ValueError",
- "evalue": "'' is not in list",
- "output_type": "error",
- "traceback": [
- "---------------------------------------------------------------------------",
- "ValueError                                Traceback (most recent call last)",
- "Cell In[9], line 46",
- "     40 #labels = batch[\"input_ids\"].clone()",
- "     41 #labels[labels == processor.tokenizer.pad_token_id] = self.image_token_id",
- "     42 #batch[\"labels\"] = labels",
- "     44 return batch",
- "---> 46 data_collator = MyDataCollator(processor)",
- "",
- "Cell In[9], line 5, in MyDataCollator.__init__(self, processor)",
- "      2 def __init__(self, processor):",
- "      3     self.processor = processor",
- "      4     self.image_token_id = processor.tokenizer.additional_special_tokens_ids[",
- "----> 5         processor.tokenizer.additional_special_tokens.index(\"\")",
- "      6     ]",
- "",
- "ValueError: '' is not in list"
- ]
- }
- ],
+ "outputs": [],
 "source": [
 "class MyDataCollator:\n",
 "    def __init__(self, processor):\n",
 "        self.processor = processor\n",
- "        #self.image_token_id = processor.tokenizer.additional_special_tokens_ids[\n",
- "        #    processor.tokenizer.additional_special_tokens.index(\"\")\n",
- "        #]\n",
+ "        self.image_token_id = 128256\n",
 "\n",
 "    def __call__(self, samples):\n",
 "        texts = []\n",
@@ -2644,9 +474,9 @@
 "\n",
 "        batch = processor(text=texts, return_tensors=\"pt\", padding=True)\n",
 "\n",
- "        #labels = batch[\"input_ids\"].clone()\n",
+ "        labels = batch[\"input_ids\"].clone()\n",
 "        #labels[labels == processor.tokenizer.pad_token_id] = self.image_token_id\n",
- "        #batch[\"labels\"] = labels\n",
+ "        batch[\"labels\"] = labels\n",
 "\n",
 "        return batch\n",
 "\n",
 "data_collator = MyDataCollator(processor)"
 ]
 },
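Two things changed in the collator cell. First, the constructor no longer looks the image token up via `additional_special_tokens` (the removed output shows that lookup failing with `ValueError: '' is not in list`, because the commented-out code searched for an empty string); it hard-codes `128256`, the id of Mllama's `<|image|>` placeholder, which sits just past the 128,256-entry text vocabulary. Second, `labels` is now a plain clone of `input_ids`, so padding and image-placeholder positions also contribute to the loss. A sketch of a more defensive variant of both steps, assuming the `processor` from the earlier cells and the usual `-100` ignore index of `CrossEntropyLoss` (not the notebook's exact code):

```python
import torch
from transformers import AutoProcessor

# "processor" stands in for the one loaded earlier in the notebook; checkpoint name assumed.
processor = AutoProcessor.from_pretrained("meta-llama/Llama-3.2-11B-Vision-Instruct")

# Look the image token up by name instead of hard-coding its id.
image_token_id = processor.tokenizer.convert_tokens_to_ids("<|image|>")
print(image_token_id)  # 128256 for the Llama 3.2 Vision tokenizer

def make_labels(batch):
    """Clone input_ids and mask positions that should not be trained on."""
    labels = batch["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100  # ignore padding in the loss
    labels[labels == image_token_id] = -100                    # ignore image placeholders
    return labels
```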
labels\n", "\n", " return batch\n", "\n", @@ -2665,7 +495,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 54, "id": "f3cda658-05f6-4078-8d71-2d1c0352ecfa", "metadata": { "executionInfo": { @@ -2713,7 +543,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 55, "id": "e6569265-5941-4482-84e2-faf1b61b685c", "metadata": { "colab": { @@ -2732,7 +562,15 @@ "id": "vSIo17mgAMtH", "outputId": "3bebd35a-ed7f-49ee-e1bc-91594e8dcd24" }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "max_steps is given, it will override any value given in num_train_epochs\n" + ] + } + ], "source": [ "trainer = Trainer(\n", " model = model,\n",