```yaml
merge_method: della_linear
base_model: migtissera/Tess-3-Llama-3.1-70B
models:
  - model: cognitivecomputations/dolphin-2.9.1-llama-3-70b
    parameters:
      weight:
        - filter: q_proj
          value: [0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0]
        - filter: k_proj
          value: [0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0]
        - filter: v_proj
          value: [0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0]
        - filter: o_proj
          value: [0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0]
        - filter: input_layernorm
          value: [0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0]
        - filter: up_proj
          value: [0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0]
        - filter: gate_proj
          value: [0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0]
        - filter: down_proj
          value: [0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0]
        - filter: post_attention_layernorm
          value: [0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0]
        - value: 0
      density: 0.25
      epsilon: 0.05
      lambda: 1.0
  - model: migtissera/Tess-3-Llama-3.1-70B
    parameters:
      weight: 1.0
      density:
        - filter: q_proj
          value: [1, 1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 1, 1]
        - filter: k_proj
          value: [1, 1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 1, 1]
        - filter: v_proj
          value: [1, 1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 1, 1]
        - filter: o_proj
          value: [1, 1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 1, 1]
        - filter: input_layernorm
          value: [1, 1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 1, 1]
        - filter: up_proj
          value: [1, 1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 1, 1]
        - filter: gate_proj
          value: [1, 1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 1, 1]
        - filter: down_proj
          value: [1, 1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 1, 1]
        - filter: post_attention_layernorm
          value: [1, 1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 1, 1]
        - value: 0.5
      epsilon:
        - filter: q_proj
          value: [0, 0, 0.05, 0.05, 0.07, 0.1, 0.07, 0.05, 0.05, 0, 0]
        - filter: k_proj
          value: [0, 0, 0.05, 0.05, 0.07, 0.1, 0.07, 0.05, 0.05, 0, 0]
        - filter: v_proj
          value: [0, 0, 0.05, 0.05, 0.07, 0.1, 0.07, 0.05, 0.05, 0, 0]
        - filter: o_proj
          value: [0, 0, 0.05, 0.05, 0.07, 0.1, 0.07, 0.05, 0.05, 0, 0]
        - filter: input_layernorm
          value: [0, 0, 0.05, 0.05, 0.07, 0.1, 0.07, 0.05, 0.05, 0, 0]
        - filter: up_proj
          value: [0, 0, 0.05, 0.05, 0.07, 0.1, 0.07, 0.05, 0.05, 0, 0]
        - filter: gate_proj
          value: [0, 0, 0.05, 0.05, 0.07, 0.1, 0.07, 0.05, 0.05, 0, 0]
        - filter: down_proj
          value: [0, 0, 0.05, 0.05, 0.07, 0.1, 0.07, 0.05, 0.05, 0, 0]
        - filter: post_attention_layernorm
          value: [0, 0, 0.05, 0.05, 0.07, 0.1, 0.07, 0.05, 0.05, 0, 0]
        - value: 0.1
      lambda: 1.0
dtype: bfloat16
out_dtype: bfloat16
parameters:
  int8_mask: true
  normalize: true
  rescale: true
chat_template: auto
tokenizer:
  source: union
```
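For reference, a configuration like this can be run either with mergekit's `mergekit-yaml` CLI (e.g. `mergekit-yaml config.yaml ./merged-model --cuda`) or from Python. Below is a minimal sketch assuming the YAML above is saved as `della_linear.yaml` and using mergekit's documented Python entry points (`MergeConfiguration`, `MergeOptions`, `run_merge`); the output path and option values are illustrative, not part of this model's recipe.

```python
# Minimal sketch: execute the della_linear merge above via mergekit's Python API.
# Assumes the config is saved as "della_linear.yaml"; paths/options are placeholders.
import torch
import yaml

from mergekit.config import MergeConfiguration
from mergekit.merge import MergeOptions, run_merge

with open("della_linear.yaml", "r", encoding="utf-8") as fp:
    merge_config = MergeConfiguration.model_validate(yaml.safe_load(fp))

run_merge(
    merge_config,
    out_path="./merged-model",              # where the merged checkpoint is written
    options=MergeOptions(
        cuda=torch.cuda.is_available(),     # use a GPU for tensor ops if one is present
        copy_tokenizer=True,                # materialize the union tokenizer specified above
        lazy_unpickle=True,                 # reduce peak RAM while loading 70B shards
    ),
)
```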