diff --git "a/info.ipynb" "b/info.ipynb" deleted file mode 100644--- "a/info.ipynb" +++ /dev/null @@ -1,3007 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "c538cbb8-93b2-4a17-800f-779327b886a4", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-09T05:41:45.765015Z", - "iopub.status.busy": "2026-03-09T05:41:45.764811Z", - "iopub.status.idle": "2026-03-09T05:41:46.915088Z", - "shell.execute_reply": "2026-03-09T05:41:46.914431Z", - "shell.execute_reply.started": "2026-03-09T05:41:45.764995Z" - } - }, - "outputs": [], - "source": [ - "import torch" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "4ad98003-a21a-409d-a562-dbdd5f877d77", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-09T05:42:26.480073Z", - "iopub.status.busy": "2026-03-09T05:42:26.479743Z", - "iopub.status.idle": "2026-03-09T05:42:28.246635Z", - "shell.execute_reply": "2026-03-09T05:42:28.245948Z", - "shell.execute_reply.started": "2026-03-09T05:42:26.480051Z" - } - }, - "outputs": [], - "source": [ - "acts_pt = torch.load('acts.pt')\n", - "model_pt = torch.load('model.pt')\n", - "scale_pt = torch.load('scale.pt')\n", - "smooth_pt = torch.load('smooth.pt')\n", - "wgts_pt = torch.load('wgts.pt')" - ] - }, - { - "cell_type": "markdown", - "id": "42d4d961-dd9f-4a08-a1c2-b4622b094205", - "metadata": { - "jp-MarkdownHeadingCollapsed": true - }, - "source": [ - "# ACTS" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "a47e22e8-4ea3-4174-b4df-5f63c06211c5", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-09T05:42:36.542901Z", - "iopub.status.busy": "2026-03-09T05:42:36.542555Z", - "iopub.status.idle": "2026-03-09T05:42:36.547732Z", - "shell.execute_reply": "2026-03-09T05:42:36.547190Z", - "shell.execute_reply.started": "2026-03-09T05:42:36.542883Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['model.layers.0.self_attn.v_proj.input', 'model.layers.0.self_attn.o_proj.input', 'model.layers.0.mlp.up_proj.input', 'model.layers.0.mlp.down_proj.input', 'model.layers.1.self_attn.v_proj.input', 'model.layers.1.self_attn.o_proj.input', 'model.layers.1.mlp.up_proj.input', 'model.layers.1.mlp.down_proj.input', 'model.layers.2.self_attn.v_proj.input', 'model.layers.2.self_attn.o_proj.input', 'model.layers.2.mlp.up_proj.input', 'model.layers.2.mlp.down_proj.input', 'model.layers.3.self_attn.v_proj.input', 'model.layers.3.self_attn.o_proj.input', 'model.layers.3.mlp.up_proj.input', 'model.layers.3.mlp.down_proj.input', 'model.layers.4.self_attn.v_proj.input', 'model.layers.4.self_attn.o_proj.input', 'model.layers.4.mlp.up_proj.input', 'model.layers.4.mlp.down_proj.input', 'model.layers.5.self_attn.v_proj.input', 'model.layers.5.self_attn.o_proj.input', 'model.layers.5.mlp.up_proj.input', 'model.layers.5.mlp.down_proj.input', 'model.layers.6.self_attn.v_proj.input', 'model.layers.6.self_attn.o_proj.input', 'model.layers.6.mlp.up_proj.input', 'model.layers.6.mlp.down_proj.input', 'model.layers.7.self_attn.v_proj.input', 'model.layers.7.self_attn.o_proj.input', 'model.layers.7.mlp.up_proj.input', 'model.layers.7.mlp.down_proj.input', 'model.layers.8.self_attn.v_proj.input', 'model.layers.8.self_attn.o_proj.input', 'model.layers.8.mlp.up_proj.input', 'model.layers.8.mlp.down_proj.input', 'model.layers.9.self_attn.v_proj.input', 'model.layers.9.self_attn.o_proj.input', 'model.layers.9.mlp.up_proj.input', 'model.layers.9.mlp.down_proj.input', 'model.layers.10.self_attn.v_proj.input', 'model.layers.10.self_attn.o_proj.input', 'model.layers.10.mlp.up_proj.input', 'model.layers.10.mlp.down_proj.input', 'model.layers.11.self_attn.v_proj.input', 'model.layers.11.self_attn.o_proj.input', 'model.layers.11.mlp.up_proj.input', 'model.layers.11.mlp.down_proj.input', 'model.layers.12.self_attn.v_proj.input', 'model.layers.12.self_attn.o_proj.input', 'model.layers.12.mlp.up_proj.input', 'model.layers.12.mlp.down_proj.input', 'model.layers.13.self_attn.v_proj.input', 'model.layers.13.self_attn.o_proj.input', 'model.layers.13.mlp.up_proj.input', 'model.layers.13.mlp.down_proj.input', 'model.layers.14.self_attn.v_proj.input', 'model.layers.14.self_attn.o_proj.input', 'model.layers.14.mlp.up_proj.input', 'model.layers.14.mlp.down_proj.input', 'model.layers.15.self_attn.v_proj.input', 'model.layers.15.self_attn.o_proj.input', 'model.layers.15.mlp.up_proj.input', 'model.layers.15.mlp.down_proj.input'])" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "acts_pt.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "40a07467-4f68-43a6-ac16-b7edf2fd1b4f", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-09T05:43:29.261870Z", - "iopub.status.busy": "2026-03-09T05:43:29.261517Z", - "iopub.status.idle": "2026-03-09T05:43:29.265525Z", - "shell.execute_reply": "2026-03-09T05:43:29.264884Z", - "shell.execute_reply.started": "2026-03-09T05:43:29.261840Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "64" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(acts_pt.keys())" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "78e1dbf3-7a53-4f20-a103-159685de218a", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-09T05:49:48.802408Z", - "iopub.status.busy": "2026-03-09T05:49:48.802136Z", - "iopub.status.idle": "2026-03-09T05:49:48.819885Z", - "shell.execute_reply": "2026-03-09T05:49:48.819151Z", - "shell.execute_reply.started": "2026-03-09T05:49:48.802390Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "model.layers.0.self_attn.v_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[3.3594]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.0.self_attn.o_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[1.3340]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.0.mlp.up_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[2.4824]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.0.mlp.down_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[32.2812]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.1.self_attn.v_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[2.5547]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.1.self_attn.o_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[2.7031]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.1.mlp.up_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[20.4531]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.1.mlp.down_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[64.1875]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.2.self_attn.v_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[4.9609]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.2.self_attn.o_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[1.6816]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.2.mlp.up_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[6.1680]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.2.mlp.down_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[2.0547]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.3.self_attn.v_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[8.1484]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.3.self_attn.o_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[1.7969]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.3.mlp.up_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[5.7109]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.3.mlp.down_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[1.6279]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.4.self_attn.v_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[4.1914]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.4.self_attn.o_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[1.1338]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.4.mlp.up_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[7.5234]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.4.mlp.down_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[1.4834]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.5.self_attn.v_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[10.3750]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.5.self_attn.o_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[1.5781]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.5.mlp.up_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[4.9062]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.5.mlp.down_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[3.0605]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.6.self_attn.v_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[7.6797]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.6.self_attn.o_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[1.7715]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.6.mlp.up_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[4.6875]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.6.mlp.down_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[2.6094]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.7.self_attn.v_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[5.8320]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.7.self_attn.o_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[1.2686]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.7.mlp.up_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[6.3047]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.7.mlp.down_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[2.6582]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.8.self_attn.v_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[4.8750]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.8.self_attn.o_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[1.5840]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.8.mlp.up_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[6.0234]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.8.mlp.down_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[2.4062]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.9.self_attn.v_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[5.6289]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.9.self_attn.o_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[2.4727]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.9.mlp.up_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[5.0703]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.9.mlp.down_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[2.8164]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.10.self_attn.v_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[10.9766]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.10.self_attn.o_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[2.3223]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.10.mlp.up_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[4.2969]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.10.mlp.down_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[2.5566]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.11.self_attn.v_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[9.5547]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.11.self_attn.o_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[2.3672]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.11.mlp.up_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[3.5039]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.11.mlp.down_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[3.9570]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.12.self_attn.v_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[9.4766]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.12.self_attn.o_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[2.4551]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.12.mlp.up_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[5.2031]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.12.mlp.down_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[3.2871]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.13.self_attn.v_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[10.9766]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.13.self_attn.o_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[2.8203]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.13.mlp.up_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[5.4062]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.13.mlp.down_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[4.7148]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.14.self_attn.v_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[13.4219]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.14.self_attn.o_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[3.5957]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.14.mlp.up_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[4.6641]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.14.mlp.down_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[36.7188]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.15.self_attn.v_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[8.0312]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.15.self_attn.o_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[3.5938]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.15.mlp.up_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[9.0078]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.15.mlp.down_proj.input:\n", - "\tchannels_dim : -1\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : ({'min': None, 'max': tensor([[[[151.1250]]]]), 'ratio': None},)\n", - "\trange_bound : None\n", - "\tquant_range : None\n" - ] - } - ], - "source": [ - "for name, data in acts_pt.items():\n", - " print(f\"{name}:\")\n", - " for key, value in data.items():\n", - " print(f\"\\t{key} : {value}\") \n" - ] - }, - { - "cell_type": "markdown", - "id": "b38bd5a7-0450-4ac8-bcfd-a856d8b15fe0", - "metadata": { - "jp-MarkdownHeadingCollapsed": true - }, - "source": [ - "# MODEL" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "b565a179-2c69-460e-b1f8-99b997114fcd", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-09T05:47:15.725801Z", - "iopub.status.busy": "2026-03-09T05:47:15.725573Z", - "iopub.status.idle": "2026-03-09T05:47:15.729324Z", - "shell.execute_reply": "2026-03-09T05:47:15.728722Z", - "shell.execute_reply.started": "2026-03-09T05:47:15.725784Z" - }, - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "odict_keys(['model.embed_tokens.weight', 'model.layers.0.self_attn.q_proj.weight', 'model.layers.0.self_attn.k_proj.weight', 'model.layers.0.self_attn.v_proj.weight', 'model.layers.0.self_attn.o_proj.weight', 'model.layers.0.mlp.gate_proj.weight', 'model.layers.0.mlp.up_proj.weight', 'model.layers.0.mlp.down_proj.weight', 'model.layers.0.input_layernorm.weight', 'model.layers.0.post_attention_layernorm.weight', 'model.layers.1.self_attn.q_proj.weight', 'model.layers.1.self_attn.k_proj.weight', 'model.layers.1.self_attn.v_proj.weight', 'model.layers.1.self_attn.o_proj.weight', 'model.layers.1.mlp.gate_proj.weight', 'model.layers.1.mlp.up_proj.weight', 'model.layers.1.mlp.down_proj.weight', 'model.layers.1.input_layernorm.weight', 'model.layers.1.post_attention_layernorm.weight', 'model.layers.2.self_attn.q_proj.weight', 'model.layers.2.self_attn.k_proj.weight', 'model.layers.2.self_attn.v_proj.weight', 'model.layers.2.self_attn.o_proj.weight', 'model.layers.2.mlp.gate_proj.weight', 'model.layers.2.mlp.up_proj.weight', 'model.layers.2.mlp.down_proj.weight', 'model.layers.2.input_layernorm.weight', 'model.layers.2.post_attention_layernorm.weight', 'model.layers.3.self_attn.q_proj.weight', 'model.layers.3.self_attn.k_proj.weight', 'model.layers.3.self_attn.v_proj.weight', 'model.layers.3.self_attn.o_proj.weight', 'model.layers.3.mlp.gate_proj.weight', 'model.layers.3.mlp.up_proj.weight', 'model.layers.3.mlp.down_proj.weight', 'model.layers.3.input_layernorm.weight', 'model.layers.3.post_attention_layernorm.weight', 'model.layers.4.self_attn.q_proj.weight', 'model.layers.4.self_attn.k_proj.weight', 'model.layers.4.self_attn.v_proj.weight', 'model.layers.4.self_attn.o_proj.weight', 'model.layers.4.mlp.gate_proj.weight', 'model.layers.4.mlp.up_proj.weight', 'model.layers.4.mlp.down_proj.weight', 'model.layers.4.input_layernorm.weight', 'model.layers.4.post_attention_layernorm.weight', 'model.layers.5.self_attn.q_proj.weight', 'model.layers.5.self_attn.k_proj.weight', 'model.layers.5.self_attn.v_proj.weight', 'model.layers.5.self_attn.o_proj.weight', 'model.layers.5.mlp.gate_proj.weight', 'model.layers.5.mlp.up_proj.weight', 'model.layers.5.mlp.down_proj.weight', 'model.layers.5.input_layernorm.weight', 'model.layers.5.post_attention_layernorm.weight', 'model.layers.6.self_attn.q_proj.weight', 'model.layers.6.self_attn.k_proj.weight', 'model.layers.6.self_attn.v_proj.weight', 'model.layers.6.self_attn.o_proj.weight', 'model.layers.6.mlp.gate_proj.weight', 'model.layers.6.mlp.up_proj.weight', 'model.layers.6.mlp.down_proj.weight', 'model.layers.6.input_layernorm.weight', 'model.layers.6.post_attention_layernorm.weight', 'model.layers.7.self_attn.q_proj.weight', 'model.layers.7.self_attn.k_proj.weight', 'model.layers.7.self_attn.v_proj.weight', 'model.layers.7.self_attn.o_proj.weight', 'model.layers.7.mlp.gate_proj.weight', 'model.layers.7.mlp.up_proj.weight', 'model.layers.7.mlp.down_proj.weight', 'model.layers.7.input_layernorm.weight', 'model.layers.7.post_attention_layernorm.weight', 'model.layers.8.self_attn.q_proj.weight', 'model.layers.8.self_attn.k_proj.weight', 'model.layers.8.self_attn.v_proj.weight', 'model.layers.8.self_attn.o_proj.weight', 'model.layers.8.mlp.gate_proj.weight', 'model.layers.8.mlp.up_proj.weight', 'model.layers.8.mlp.down_proj.weight', 'model.layers.8.input_layernorm.weight', 'model.layers.8.post_attention_layernorm.weight', 'model.layers.9.self_attn.q_proj.weight', 'model.layers.9.self_attn.k_proj.weight', 'model.layers.9.self_attn.v_proj.weight', 'model.layers.9.self_attn.o_proj.weight', 'model.layers.9.mlp.gate_proj.weight', 'model.layers.9.mlp.up_proj.weight', 'model.layers.9.mlp.down_proj.weight', 'model.layers.9.input_layernorm.weight', 'model.layers.9.post_attention_layernorm.weight', 'model.layers.10.self_attn.q_proj.weight', 'model.layers.10.self_attn.k_proj.weight', 'model.layers.10.self_attn.v_proj.weight', 'model.layers.10.self_attn.o_proj.weight', 'model.layers.10.mlp.gate_proj.weight', 'model.layers.10.mlp.up_proj.weight', 'model.layers.10.mlp.down_proj.weight', 'model.layers.10.input_layernorm.weight', 'model.layers.10.post_attention_layernorm.weight', 'model.layers.11.self_attn.q_proj.weight', 'model.layers.11.self_attn.k_proj.weight', 'model.layers.11.self_attn.v_proj.weight', 'model.layers.11.self_attn.o_proj.weight', 'model.layers.11.mlp.gate_proj.weight', 'model.layers.11.mlp.up_proj.weight', 'model.layers.11.mlp.down_proj.weight', 'model.layers.11.input_layernorm.weight', 'model.layers.11.post_attention_layernorm.weight', 'model.layers.12.self_attn.q_proj.weight', 'model.layers.12.self_attn.k_proj.weight', 'model.layers.12.self_attn.v_proj.weight', 'model.layers.12.self_attn.o_proj.weight', 'model.layers.12.mlp.gate_proj.weight', 'model.layers.12.mlp.up_proj.weight', 'model.layers.12.mlp.down_proj.weight', 'model.layers.12.input_layernorm.weight', 'model.layers.12.post_attention_layernorm.weight', 'model.layers.13.self_attn.q_proj.weight', 'model.layers.13.self_attn.k_proj.weight', 'model.layers.13.self_attn.v_proj.weight', 'model.layers.13.self_attn.o_proj.weight', 'model.layers.13.mlp.gate_proj.weight', 'model.layers.13.mlp.up_proj.weight', 'model.layers.13.mlp.down_proj.weight', 'model.layers.13.input_layernorm.weight', 'model.layers.13.post_attention_layernorm.weight', 'model.layers.14.self_attn.q_proj.weight', 'model.layers.14.self_attn.k_proj.weight', 'model.layers.14.self_attn.v_proj.weight', 'model.layers.14.self_attn.o_proj.weight', 'model.layers.14.mlp.gate_proj.weight', 'model.layers.14.mlp.up_proj.weight', 'model.layers.14.mlp.down_proj.weight', 'model.layers.14.input_layernorm.weight', 'model.layers.14.post_attention_layernorm.weight', 'model.layers.15.self_attn.q_proj.weight', 'model.layers.15.self_attn.k_proj.weight', 'model.layers.15.self_attn.v_proj.weight', 'model.layers.15.self_attn.o_proj.weight', 'model.layers.15.mlp.gate_proj.weight', 'model.layers.15.mlp.up_proj.weight', 'model.layers.15.mlp.down_proj.weight', 'model.layers.15.input_layernorm.weight', 'model.layers.15.post_attention_layernorm.weight', 'model.norm.weight', 'lm_head.weight'])" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model_pt.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "b2b93d90-3364-41ac-9bb1-2be496d476c5", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-09T05:47:36.705138Z", - "iopub.status.busy": "2026-03-09T05:47:36.704893Z", - "iopub.status.idle": "2026-03-09T05:47:36.708559Z", - "shell.execute_reply": "2026-03-09T05:47:36.708047Z", - "shell.execute_reply.started": "2026-03-09T05:47:36.705118Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "147" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(model_pt.keys())" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "801c3a98-612b-4e14-9c9a-57150da7fef1", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-09T05:50:38.340142Z", - "iopub.status.busy": "2026-03-09T05:50:38.339930Z", - "iopub.status.idle": "2026-03-09T05:50:38.345306Z", - "shell.execute_reply": "2026-03-09T05:50:38.344564Z", - "shell.execute_reply.started": "2026-03-09T05:50:38.340127Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "model.embed_tokens.weight:\n", - "\ttorch.Size([128256, 2048])\n", - "model.layers.0.self_attn.q_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.0.self_attn.k_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.0.self_attn.v_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.0.self_attn.o_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.0.mlp.gate_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.0.mlp.up_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.0.mlp.down_proj.weight:\n", - "\ttorch.Size([2048, 8192])\n", - "model.layers.0.input_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.0.post_attention_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.1.self_attn.q_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.1.self_attn.k_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.1.self_attn.v_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.1.self_attn.o_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.1.mlp.gate_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.1.mlp.up_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.1.mlp.down_proj.weight:\n", - "\ttorch.Size([2048, 8192])\n", - "model.layers.1.input_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.1.post_attention_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.2.self_attn.q_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.2.self_attn.k_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.2.self_attn.v_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.2.self_attn.o_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.2.mlp.gate_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.2.mlp.up_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.2.mlp.down_proj.weight:\n", - "\ttorch.Size([2048, 8192])\n", - "model.layers.2.input_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.2.post_attention_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.3.self_attn.q_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.3.self_attn.k_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.3.self_attn.v_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.3.self_attn.o_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.3.mlp.gate_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.3.mlp.up_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.3.mlp.down_proj.weight:\n", - "\ttorch.Size([2048, 8192])\n", - "model.layers.3.input_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.3.post_attention_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.4.self_attn.q_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.4.self_attn.k_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.4.self_attn.v_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.4.self_attn.o_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.4.mlp.gate_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.4.mlp.up_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.4.mlp.down_proj.weight:\n", - "\ttorch.Size([2048, 8192])\n", - "model.layers.4.input_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.4.post_attention_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.5.self_attn.q_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.5.self_attn.k_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.5.self_attn.v_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.5.self_attn.o_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.5.mlp.gate_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.5.mlp.up_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.5.mlp.down_proj.weight:\n", - "\ttorch.Size([2048, 8192])\n", - "model.layers.5.input_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.5.post_attention_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.6.self_attn.q_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.6.self_attn.k_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.6.self_attn.v_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.6.self_attn.o_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.6.mlp.gate_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.6.mlp.up_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.6.mlp.down_proj.weight:\n", - "\ttorch.Size([2048, 8192])\n", - "model.layers.6.input_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.6.post_attention_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.7.self_attn.q_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.7.self_attn.k_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.7.self_attn.v_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.7.self_attn.o_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.7.mlp.gate_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.7.mlp.up_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.7.mlp.down_proj.weight:\n", - "\ttorch.Size([2048, 8192])\n", - "model.layers.7.input_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.7.post_attention_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.8.self_attn.q_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.8.self_attn.k_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.8.self_attn.v_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.8.self_attn.o_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.8.mlp.gate_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.8.mlp.up_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.8.mlp.down_proj.weight:\n", - "\ttorch.Size([2048, 8192])\n", - "model.layers.8.input_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.8.post_attention_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.9.self_attn.q_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.9.self_attn.k_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.9.self_attn.v_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.9.self_attn.o_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.9.mlp.gate_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.9.mlp.up_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.9.mlp.down_proj.weight:\n", - "\ttorch.Size([2048, 8192])\n", - "model.layers.9.input_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.9.post_attention_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.10.self_attn.q_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.10.self_attn.k_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.10.self_attn.v_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.10.self_attn.o_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.10.mlp.gate_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.10.mlp.up_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.10.mlp.down_proj.weight:\n", - "\ttorch.Size([2048, 8192])\n", - "model.layers.10.input_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.10.post_attention_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.11.self_attn.q_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.11.self_attn.k_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.11.self_attn.v_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.11.self_attn.o_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.11.mlp.gate_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.11.mlp.up_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.11.mlp.down_proj.weight:\n", - "\ttorch.Size([2048, 8192])\n", - "model.layers.11.input_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.11.post_attention_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.12.self_attn.q_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.12.self_attn.k_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.12.self_attn.v_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.12.self_attn.o_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.12.mlp.gate_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.12.mlp.up_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.12.mlp.down_proj.weight:\n", - "\ttorch.Size([2048, 8192])\n", - "model.layers.12.input_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.12.post_attention_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.13.self_attn.q_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.13.self_attn.k_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.13.self_attn.v_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.13.self_attn.o_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.13.mlp.gate_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.13.mlp.up_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.13.mlp.down_proj.weight:\n", - "\ttorch.Size([2048, 8192])\n", - "model.layers.13.input_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.13.post_attention_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.14.self_attn.q_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.14.self_attn.k_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.14.self_attn.v_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.14.self_attn.o_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.14.mlp.gate_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.14.mlp.up_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.14.mlp.down_proj.weight:\n", - "\ttorch.Size([2048, 8192])\n", - "model.layers.14.input_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.14.post_attention_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.15.self_attn.q_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.15.self_attn.k_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.15.self_attn.v_proj.weight:\n", - "\ttorch.Size([512, 2048])\n", - "model.layers.15.self_attn.o_proj.weight:\n", - "\ttorch.Size([2048, 2048])\n", - "model.layers.15.mlp.gate_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.15.mlp.up_proj.weight:\n", - "\ttorch.Size([8192, 2048])\n", - "model.layers.15.mlp.down_proj.weight:\n", - "\ttorch.Size([2048, 8192])\n", - "model.layers.15.input_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.layers.15.post_attention_layernorm.weight:\n", - "\ttorch.Size([2048])\n", - "model.norm.weight:\n", - "\ttorch.Size([2048])\n", - "lm_head.weight:\n", - "\ttorch.Size([128256, 2048])\n" - ] - } - ], - "source": [ - "for name, data in model_pt.items():\n", - " print(f\"{name}:\")\n", - " print(f\"\\t{data.shape}\") \n" - ] - }, - { - "cell_type": "markdown", - "id": "67ae1f03-2fd5-4e65-8076-157e747cc6bb", - "metadata": { - "jp-MarkdownHeadingCollapsed": true - }, - "source": [ - "# SCALE" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "d9a045f2-5f0b-46ef-9063-c2bc4b1fb1d7", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-09T05:51:28.177054Z", - "iopub.status.busy": "2026-03-09T05:51:28.176813Z", - "iopub.status.idle": "2026-03-09T05:51:28.180641Z", - "shell.execute_reply": "2026-03-09T05:51:28.180032Z", - "shell.execute_reply.started": "2026-03-09T05:51:28.177035Z" - }, - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['model.layers.0.self_attn.q_proj.weight.scale.0', 'model.layers.0.self_attn.q_proj.weight.zero', 'model.layers.0.self_attn.k_proj.weight.scale.0', 'model.layers.0.self_attn.k_proj.weight.zero', 'model.layers.0.self_attn.v_proj.weight.scale.0', 'model.layers.0.self_attn.v_proj.weight.zero', 'model.layers.0.self_attn.o_proj.weight.scale.0', 'model.layers.0.self_attn.o_proj.weight.zero', 'model.layers.0.mlp.up_proj.weight.scale.0', 'model.layers.0.mlp.up_proj.weight.zero', 'model.layers.0.mlp.gate_proj.weight.scale.0', 'model.layers.0.mlp.gate_proj.weight.zero', 'model.layers.0.mlp.down_proj.weight.scale.0', 'model.layers.0.mlp.down_proj.weight.zero', 'model.layers.1.self_attn.q_proj.weight.scale.0', 'model.layers.1.self_attn.q_proj.weight.zero', 'model.layers.1.self_attn.k_proj.weight.scale.0', 'model.layers.1.self_attn.k_proj.weight.zero', 'model.layers.1.self_attn.v_proj.weight.scale.0', 'model.layers.1.self_attn.v_proj.weight.zero', 'model.layers.1.self_attn.o_proj.weight.scale.0', 'model.layers.1.self_attn.o_proj.weight.zero', 'model.layers.1.mlp.up_proj.weight.scale.0', 'model.layers.1.mlp.up_proj.weight.zero', 'model.layers.1.mlp.gate_proj.weight.scale.0', 'model.layers.1.mlp.gate_proj.weight.zero', 'model.layers.1.mlp.down_proj.weight.scale.0', 'model.layers.1.mlp.down_proj.weight.zero', 'model.layers.2.self_attn.q_proj.weight.scale.0', 'model.layers.2.self_attn.q_proj.weight.zero', 'model.layers.2.self_attn.k_proj.weight.scale.0', 'model.layers.2.self_attn.k_proj.weight.zero', 'model.layers.2.self_attn.v_proj.weight.scale.0', 'model.layers.2.self_attn.v_proj.weight.zero', 'model.layers.2.self_attn.o_proj.weight.scale.0', 'model.layers.2.self_attn.o_proj.weight.zero', 'model.layers.2.mlp.up_proj.weight.scale.0', 'model.layers.2.mlp.up_proj.weight.zero', 'model.layers.2.mlp.gate_proj.weight.scale.0', 'model.layers.2.mlp.gate_proj.weight.zero', 'model.layers.2.mlp.down_proj.weight.scale.0', 'model.layers.2.mlp.down_proj.weight.zero', 'model.layers.3.self_attn.q_proj.weight.scale.0', 'model.layers.3.self_attn.q_proj.weight.zero', 'model.layers.3.self_attn.k_proj.weight.scale.0', 'model.layers.3.self_attn.k_proj.weight.zero', 'model.layers.3.self_attn.v_proj.weight.scale.0', 'model.layers.3.self_attn.v_proj.weight.zero', 'model.layers.3.self_attn.o_proj.weight.scale.0', 'model.layers.3.self_attn.o_proj.weight.zero', 'model.layers.3.mlp.up_proj.weight.scale.0', 'model.layers.3.mlp.up_proj.weight.zero', 'model.layers.3.mlp.gate_proj.weight.scale.0', 'model.layers.3.mlp.gate_proj.weight.zero', 'model.layers.3.mlp.down_proj.weight.scale.0', 'model.layers.3.mlp.down_proj.weight.zero', 'model.layers.4.self_attn.q_proj.weight.scale.0', 'model.layers.4.self_attn.q_proj.weight.zero', 'model.layers.4.self_attn.k_proj.weight.scale.0', 'model.layers.4.self_attn.k_proj.weight.zero', 'model.layers.4.self_attn.v_proj.weight.scale.0', 'model.layers.4.self_attn.v_proj.weight.zero', 'model.layers.4.self_attn.o_proj.weight.scale.0', 'model.layers.4.self_attn.o_proj.weight.zero', 'model.layers.4.mlp.up_proj.weight.scale.0', 'model.layers.4.mlp.up_proj.weight.zero', 'model.layers.4.mlp.gate_proj.weight.scale.0', 'model.layers.4.mlp.gate_proj.weight.zero', 'model.layers.4.mlp.down_proj.weight.scale.0', 'model.layers.4.mlp.down_proj.weight.zero', 'model.layers.5.self_attn.q_proj.weight.scale.0', 'model.layers.5.self_attn.q_proj.weight.zero', 'model.layers.5.self_attn.k_proj.weight.scale.0', 'model.layers.5.self_attn.k_proj.weight.zero', 'model.layers.5.self_attn.v_proj.weight.scale.0', 'model.layers.5.self_attn.v_proj.weight.zero', 'model.layers.5.self_attn.o_proj.weight.scale.0', 'model.layers.5.self_attn.o_proj.weight.zero', 'model.layers.5.mlp.up_proj.weight.scale.0', 'model.layers.5.mlp.up_proj.weight.zero', 'model.layers.5.mlp.gate_proj.weight.scale.0', 'model.layers.5.mlp.gate_proj.weight.zero', 'model.layers.5.mlp.down_proj.weight.scale.0', 'model.layers.5.mlp.down_proj.weight.zero', 'model.layers.6.self_attn.q_proj.weight.scale.0', 'model.layers.6.self_attn.q_proj.weight.zero', 'model.layers.6.self_attn.k_proj.weight.scale.0', 'model.layers.6.self_attn.k_proj.weight.zero', 'model.layers.6.self_attn.v_proj.weight.scale.0', 'model.layers.6.self_attn.v_proj.weight.zero', 'model.layers.6.self_attn.o_proj.weight.scale.0', 'model.layers.6.self_attn.o_proj.weight.zero', 'model.layers.6.mlp.up_proj.weight.scale.0', 'model.layers.6.mlp.up_proj.weight.zero', 'model.layers.6.mlp.gate_proj.weight.scale.0', 'model.layers.6.mlp.gate_proj.weight.zero', 'model.layers.6.mlp.down_proj.weight.scale.0', 'model.layers.6.mlp.down_proj.weight.zero', 'model.layers.7.self_attn.q_proj.weight.scale.0', 'model.layers.7.self_attn.q_proj.weight.zero', 'model.layers.7.self_attn.k_proj.weight.scale.0', 'model.layers.7.self_attn.k_proj.weight.zero', 'model.layers.7.self_attn.v_proj.weight.scale.0', 'model.layers.7.self_attn.v_proj.weight.zero', 'model.layers.7.self_attn.o_proj.weight.scale.0', 'model.layers.7.self_attn.o_proj.weight.zero', 'model.layers.7.mlp.up_proj.weight.scale.0', 'model.layers.7.mlp.up_proj.weight.zero', 'model.layers.7.mlp.gate_proj.weight.scale.0', 'model.layers.7.mlp.gate_proj.weight.zero', 'model.layers.7.mlp.down_proj.weight.scale.0', 'model.layers.7.mlp.down_proj.weight.zero', 'model.layers.8.self_attn.q_proj.weight.scale.0', 'model.layers.8.self_attn.q_proj.weight.zero', 'model.layers.8.self_attn.k_proj.weight.scale.0', 'model.layers.8.self_attn.k_proj.weight.zero', 'model.layers.8.self_attn.v_proj.weight.scale.0', 'model.layers.8.self_attn.v_proj.weight.zero', 'model.layers.8.self_attn.o_proj.weight.scale.0', 'model.layers.8.self_attn.o_proj.weight.zero', 'model.layers.8.mlp.up_proj.weight.scale.0', 'model.layers.8.mlp.up_proj.weight.zero', 'model.layers.8.mlp.gate_proj.weight.scale.0', 'model.layers.8.mlp.gate_proj.weight.zero', 'model.layers.8.mlp.down_proj.weight.scale.0', 'model.layers.8.mlp.down_proj.weight.zero', 'model.layers.9.self_attn.q_proj.weight.scale.0', 'model.layers.9.self_attn.q_proj.weight.zero', 'model.layers.9.self_attn.k_proj.weight.scale.0', 'model.layers.9.self_attn.k_proj.weight.zero', 'model.layers.9.self_attn.v_proj.weight.scale.0', 'model.layers.9.self_attn.v_proj.weight.zero', 'model.layers.9.self_attn.o_proj.weight.scale.0', 'model.layers.9.self_attn.o_proj.weight.zero', 'model.layers.9.mlp.up_proj.weight.scale.0', 'model.layers.9.mlp.up_proj.weight.zero', 'model.layers.9.mlp.gate_proj.weight.scale.0', 'model.layers.9.mlp.gate_proj.weight.zero', 'model.layers.9.mlp.down_proj.weight.scale.0', 'model.layers.9.mlp.down_proj.weight.zero', 'model.layers.10.self_attn.q_proj.weight.scale.0', 'model.layers.10.self_attn.q_proj.weight.zero', 'model.layers.10.self_attn.k_proj.weight.scale.0', 'model.layers.10.self_attn.k_proj.weight.zero', 'model.layers.10.self_attn.v_proj.weight.scale.0', 'model.layers.10.self_attn.v_proj.weight.zero', 'model.layers.10.self_attn.o_proj.weight.scale.0', 'model.layers.10.self_attn.o_proj.weight.zero', 'model.layers.10.mlp.up_proj.weight.scale.0', 'model.layers.10.mlp.up_proj.weight.zero', 'model.layers.10.mlp.gate_proj.weight.scale.0', 'model.layers.10.mlp.gate_proj.weight.zero', 'model.layers.10.mlp.down_proj.weight.scale.0', 'model.layers.10.mlp.down_proj.weight.zero', 'model.layers.11.self_attn.q_proj.weight.scale.0', 'model.layers.11.self_attn.q_proj.weight.zero', 'model.layers.11.self_attn.k_proj.weight.scale.0', 'model.layers.11.self_attn.k_proj.weight.zero', 'model.layers.11.self_attn.v_proj.weight.scale.0', 'model.layers.11.self_attn.v_proj.weight.zero', 'model.layers.11.self_attn.o_proj.weight.scale.0', 'model.layers.11.self_attn.o_proj.weight.zero', 'model.layers.11.mlp.up_proj.weight.scale.0', 'model.layers.11.mlp.up_proj.weight.zero', 'model.layers.11.mlp.gate_proj.weight.scale.0', 'model.layers.11.mlp.gate_proj.weight.zero', 'model.layers.11.mlp.down_proj.weight.scale.0', 'model.layers.11.mlp.down_proj.weight.zero', 'model.layers.12.self_attn.q_proj.weight.scale.0', 'model.layers.12.self_attn.q_proj.weight.zero', 'model.layers.12.self_attn.k_proj.weight.scale.0', 'model.layers.12.self_attn.k_proj.weight.zero', 'model.layers.12.self_attn.v_proj.weight.scale.0', 'model.layers.12.self_attn.v_proj.weight.zero', 'model.layers.12.self_attn.o_proj.weight.scale.0', 'model.layers.12.self_attn.o_proj.weight.zero', 'model.layers.12.mlp.up_proj.weight.scale.0', 'model.layers.12.mlp.up_proj.weight.zero', 'model.layers.12.mlp.gate_proj.weight.scale.0', 'model.layers.12.mlp.gate_proj.weight.zero', 'model.layers.12.mlp.down_proj.weight.scale.0', 'model.layers.12.mlp.down_proj.weight.zero', 'model.layers.13.self_attn.q_proj.weight.scale.0', 'model.layers.13.self_attn.q_proj.weight.zero', 'model.layers.13.self_attn.k_proj.weight.scale.0', 'model.layers.13.self_attn.k_proj.weight.zero', 'model.layers.13.self_attn.v_proj.weight.scale.0', 'model.layers.13.self_attn.v_proj.weight.zero', 'model.layers.13.self_attn.o_proj.weight.scale.0', 'model.layers.13.self_attn.o_proj.weight.zero', 'model.layers.13.mlp.up_proj.weight.scale.0', 'model.layers.13.mlp.up_proj.weight.zero', 'model.layers.13.mlp.gate_proj.weight.scale.0', 'model.layers.13.mlp.gate_proj.weight.zero', 'model.layers.13.mlp.down_proj.weight.scale.0', 'model.layers.13.mlp.down_proj.weight.zero', 'model.layers.14.self_attn.q_proj.weight.scale.0', 'model.layers.14.self_attn.q_proj.weight.zero', 'model.layers.14.self_attn.k_proj.weight.scale.0', 'model.layers.14.self_attn.k_proj.weight.zero', 'model.layers.14.self_attn.v_proj.weight.scale.0', 'model.layers.14.self_attn.v_proj.weight.zero', 'model.layers.14.self_attn.o_proj.weight.scale.0', 'model.layers.14.self_attn.o_proj.weight.zero', 'model.layers.14.mlp.up_proj.weight.scale.0', 'model.layers.14.mlp.up_proj.weight.zero', 'model.layers.14.mlp.gate_proj.weight.scale.0', 'model.layers.14.mlp.gate_proj.weight.zero', 'model.layers.14.mlp.down_proj.weight.scale.0', 'model.layers.14.mlp.down_proj.weight.zero', 'model.layers.15.self_attn.q_proj.weight.scale.0', 'model.layers.15.self_attn.q_proj.weight.zero', 'model.layers.15.self_attn.k_proj.weight.scale.0', 'model.layers.15.self_attn.k_proj.weight.zero', 'model.layers.15.self_attn.v_proj.weight.scale.0', 'model.layers.15.self_attn.v_proj.weight.zero', 'model.layers.15.self_attn.o_proj.weight.scale.0', 'model.layers.15.self_attn.o_proj.weight.zero', 'model.layers.15.mlp.up_proj.weight.scale.0', 'model.layers.15.mlp.up_proj.weight.zero', 'model.layers.15.mlp.gate_proj.weight.scale.0', 'model.layers.15.mlp.gate_proj.weight.zero', 'model.layers.15.mlp.down_proj.weight.scale.0', 'model.layers.15.mlp.down_proj.weight.zero'])" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "scale_pt.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "366a18d2-a7a8-4cb7-87f2-be5640cea0f8", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-09T05:51:36.026092Z", - "iopub.status.busy": "2026-03-09T05:51:36.025876Z", - "iopub.status.idle": "2026-03-09T05:51:36.029556Z", - "shell.execute_reply": "2026-03-09T05:51:36.028789Z", - "shell.execute_reply.started": "2026-03-09T05:51:36.026078Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "224" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(scale_pt.keys())" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "9d51465f-16ce-4fbf-b6cc-9d06421580bc", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-09T05:52:05.925258Z", - "iopub.status.busy": "2026-03-09T05:52:05.924766Z", - "iopub.status.idle": "2026-03-09T05:52:05.931048Z", - "shell.execute_reply": "2026-03-09T05:52:05.930469Z", - "shell.execute_reply.started": "2026-03-09T05:52:05.925238Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "model.layers.0.self_attn.q_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.0.self_attn.q_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.0.self_attn.k_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.0.self_attn.k_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.0.self_attn.v_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.0.self_attn.v_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.0.self_attn.o_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.0.self_attn.o_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.0.mlp.up_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.0.mlp.up_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.0.mlp.gate_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.0.mlp.gate_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.0.mlp.down_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.0.mlp.down_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.1.self_attn.q_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.1.self_attn.q_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.1.self_attn.k_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.1.self_attn.k_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.1.self_attn.v_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.1.self_attn.v_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.1.self_attn.o_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.1.self_attn.o_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.1.mlp.up_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.1.mlp.up_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.1.mlp.gate_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.1.mlp.gate_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.1.mlp.down_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.1.mlp.down_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.2.self_attn.q_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.2.self_attn.q_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.2.self_attn.k_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.2.self_attn.k_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.2.self_attn.v_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.2.self_attn.v_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.2.self_attn.o_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.2.self_attn.o_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.2.mlp.up_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.2.mlp.up_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.2.mlp.gate_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.2.mlp.gate_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.2.mlp.down_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.2.mlp.down_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.3.self_attn.q_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.3.self_attn.q_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.3.self_attn.k_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.3.self_attn.k_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.3.self_attn.v_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.3.self_attn.v_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.3.self_attn.o_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.3.self_attn.o_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.3.mlp.up_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.3.mlp.up_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.3.mlp.gate_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.3.mlp.gate_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.3.mlp.down_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.3.mlp.down_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.4.self_attn.q_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.4.self_attn.q_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.4.self_attn.k_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.4.self_attn.k_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.4.self_attn.v_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.4.self_attn.v_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.4.self_attn.o_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.4.self_attn.o_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.4.mlp.up_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.4.mlp.up_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.4.mlp.gate_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.4.mlp.gate_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.4.mlp.down_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.4.mlp.down_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.5.self_attn.q_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.5.self_attn.q_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.5.self_attn.k_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.5.self_attn.k_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.5.self_attn.v_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.5.self_attn.v_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.5.self_attn.o_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.5.self_attn.o_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.5.mlp.up_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.5.mlp.up_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.5.mlp.gate_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.5.mlp.gate_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.5.mlp.down_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.5.mlp.down_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.6.self_attn.q_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.6.self_attn.q_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.6.self_attn.k_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.6.self_attn.k_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.6.self_attn.v_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.6.self_attn.v_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.6.self_attn.o_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.6.self_attn.o_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.6.mlp.up_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.6.mlp.up_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.6.mlp.gate_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.6.mlp.gate_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.6.mlp.down_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.6.mlp.down_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.7.self_attn.q_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.7.self_attn.q_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.7.self_attn.k_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.7.self_attn.k_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.7.self_attn.v_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.7.self_attn.v_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.7.self_attn.o_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.7.self_attn.o_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.7.mlp.up_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.7.mlp.up_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.7.mlp.gate_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.7.mlp.gate_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.7.mlp.down_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.7.mlp.down_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.8.self_attn.q_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.8.self_attn.q_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.8.self_attn.k_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.8.self_attn.k_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.8.self_attn.v_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.8.self_attn.v_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.8.self_attn.o_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.8.self_attn.o_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.8.mlp.up_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.8.mlp.up_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.8.mlp.gate_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.8.mlp.gate_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.8.mlp.down_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.8.mlp.down_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.9.self_attn.q_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.9.self_attn.q_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.9.self_attn.k_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.9.self_attn.k_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.9.self_attn.v_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.9.self_attn.v_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.9.self_attn.o_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.9.self_attn.o_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.9.mlp.up_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.9.mlp.up_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.9.mlp.gate_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.9.mlp.gate_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.9.mlp.down_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.9.mlp.down_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.10.self_attn.q_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.10.self_attn.q_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.10.self_attn.k_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.10.self_attn.k_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.10.self_attn.v_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.10.self_attn.v_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.10.self_attn.o_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.10.self_attn.o_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.10.mlp.up_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.10.mlp.up_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.10.mlp.gate_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.10.mlp.gate_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.10.mlp.down_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.10.mlp.down_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.11.self_attn.q_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.11.self_attn.q_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.11.self_attn.k_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.11.self_attn.k_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.11.self_attn.v_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.11.self_attn.v_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.11.self_attn.o_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.11.self_attn.o_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.11.mlp.up_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.11.mlp.up_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.11.mlp.gate_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.11.mlp.gate_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.11.mlp.down_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.11.mlp.down_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.12.self_attn.q_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.12.self_attn.q_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.12.self_attn.k_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.12.self_attn.k_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.12.self_attn.v_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.12.self_attn.v_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.12.self_attn.o_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.12.self_attn.o_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.12.mlp.up_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.12.mlp.up_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.12.mlp.gate_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.12.mlp.gate_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.12.mlp.down_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.12.mlp.down_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.13.self_attn.q_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.13.self_attn.q_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.13.self_attn.k_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.13.self_attn.k_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.13.self_attn.v_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.13.self_attn.v_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.13.self_attn.o_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.13.self_attn.o_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.13.mlp.up_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.13.mlp.up_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.13.mlp.gate_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.13.mlp.gate_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.13.mlp.down_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.13.mlp.down_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.14.self_attn.q_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.14.self_attn.q_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.14.self_attn.k_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.14.self_attn.k_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.14.self_attn.v_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.14.self_attn.v_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.14.self_attn.o_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.14.self_attn.o_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.14.mlp.up_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.14.mlp.up_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.14.mlp.gate_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.14.mlp.gate_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.14.mlp.down_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.14.mlp.down_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.15.self_attn.q_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.15.self_attn.q_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.15.self_attn.k_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.15.self_attn.k_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.15.self_attn.v_proj.weight.scale.0:\n", - "\ttorch.Size([512, 1, 1, 1])\n", - "model.layers.15.self_attn.v_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.15.self_attn.o_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.15.self_attn.o_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.15.mlp.up_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.15.mlp.up_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.15.mlp.gate_proj.weight.scale.0:\n", - "\ttorch.Size([8192, 1, 1, 1])\n", - "model.layers.15.mlp.gate_proj.weight.zero:\n", - "\ttorch.Size([])\n", - "model.layers.15.mlp.down_proj.weight.scale.0:\n", - "\ttorch.Size([2048, 1, 1, 1])\n", - "model.layers.15.mlp.down_proj.weight.zero:\n", - "\ttorch.Size([])\n" - ] - } - ], - "source": [ - "for name, data in scale_pt.items():\n", - " print(f\"{name}:\")\n", - " print(f\"\\t{data.shape}\") \n" - ] - }, - { - "cell_type": "markdown", - "id": "cc90682f-1272-4cc0-9c93-cbdf659ccd58", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-09T05:53:19.570302Z", - "iopub.status.busy": "2026-03-09T05:53:19.570002Z", - "iopub.status.idle": "2026-03-09T05:53:19.572786Z", - "shell.execute_reply": "2026-03-09T05:53:19.572116Z", - "shell.execute_reply.started": "2026-03-09T05:53:19.570286Z" - }, - "jp-MarkdownHeadingCollapsed": true - }, - "source": [ - "# SMOOTH" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "2ea77800-d47a-4823-a205-32f5ed40036d", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-09T05:53:26.852073Z", - "iopub.status.busy": "2026-03-09T05:53:26.851843Z", - "iopub.status.idle": "2026-03-09T05:53:26.855415Z", - "shell.execute_reply": "2026-03-09T05:53:26.854825Z", - "shell.execute_reply.started": "2026-03-09T05:53:26.852055Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['model.layers.0.self_attn.v_proj', 'model.layers.0.self_attn.o_proj', 'model.layers.0.mlp', 'model.layers.0.mlp.down_proj', 'model.layers.1.self_attn.v_proj', 'model.layers.1.self_attn.o_proj', 'model.layers.1.mlp', 'model.layers.1.mlp.down_proj', 'model.layers.2.self_attn.v_proj', 'model.layers.2.self_attn.o_proj', 'model.layers.2.mlp', 'model.layers.2.mlp.down_proj', 'model.layers.3.self_attn.v_proj', 'model.layers.3.self_attn.o_proj', 'model.layers.3.mlp', 'model.layers.3.mlp.down_proj', 'model.layers.4.self_attn.v_proj', 'model.layers.4.self_attn.o_proj', 'model.layers.4.mlp', 'model.layers.4.mlp.down_proj', 'model.layers.5.self_attn.v_proj', 'model.layers.5.self_attn.o_proj', 'model.layers.5.mlp', 'model.layers.5.mlp.down_proj', 'model.layers.6.self_attn.v_proj', 'model.layers.6.self_attn.o_proj', 'model.layers.6.mlp', 'model.layers.6.mlp.down_proj', 'model.layers.7.self_attn.v_proj', 'model.layers.7.self_attn.o_proj', 'model.layers.7.mlp', 'model.layers.7.mlp.down_proj', 'model.layers.8.self_attn.v_proj', 'model.layers.8.self_attn.o_proj', 'model.layers.8.mlp', 'model.layers.8.mlp.down_proj', 'model.layers.9.self_attn.v_proj', 'model.layers.9.self_attn.o_proj', 'model.layers.9.mlp', 'model.layers.9.mlp.down_proj', 'model.layers.10.self_attn.v_proj', 'model.layers.10.self_attn.o_proj', 'model.layers.10.mlp', 'model.layers.10.mlp.down_proj', 'model.layers.11.self_attn.v_proj', 'model.layers.11.self_attn.o_proj', 'model.layers.11.mlp', 'model.layers.11.mlp.down_proj', 'model.layers.12.self_attn.v_proj', 'model.layers.12.self_attn.o_proj', 'model.layers.12.mlp', 'model.layers.12.mlp.down_proj', 'model.layers.13.self_attn.v_proj', 'model.layers.13.self_attn.o_proj', 'model.layers.13.mlp', 'model.layers.13.mlp.down_proj', 'model.layers.14.self_attn.v_proj', 'model.layers.14.self_attn.o_proj', 'model.layers.14.mlp', 'model.layers.14.mlp.down_proj', 'model.layers.15.self_attn.v_proj', 'model.layers.15.self_attn.o_proj', 'model.layers.15.mlp', 'model.layers.15.mlp.down_proj'])" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "smooth_pt.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "dd554ff6-a83c-4b53-8494-7164370b740f", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-09T05:53:33.987119Z", - "iopub.status.busy": "2026-03-09T05:53:33.986875Z", - "iopub.status.idle": "2026-03-09T05:53:33.990691Z", - "shell.execute_reply": "2026-03-09T05:53:33.989994Z", - "shell.execute_reply.started": "2026-03-09T05:53:33.987101Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "64" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(smooth_pt.keys())" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "31648435-f194-4bc5-aa47-a118a5287b1e", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-09T05:53:39.162115Z", - "iopub.status.busy": "2026-03-09T05:53:39.161651Z", - "iopub.status.idle": "2026-03-09T05:53:39.166122Z", - "shell.execute_reply": "2026-03-09T05:53:39.165513Z", - "shell.execute_reply.started": "2026-03-09T05:53:39.162074Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "model.layers.0.self_attn.v_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.0.self_attn.o_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.0.mlp:\n", - "\ttorch.Size([2048])\n", - "model.layers.0.mlp.down_proj:\n", - "\ttorch.Size([8192])\n", - "model.layers.1.self_attn.v_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.1.self_attn.o_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.1.mlp:\n", - "\ttorch.Size([2048])\n", - "model.layers.1.mlp.down_proj:\n", - "\ttorch.Size([8192])\n", - "model.layers.2.self_attn.v_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.2.self_attn.o_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.2.mlp:\n", - "\ttorch.Size([2048])\n", - "model.layers.2.mlp.down_proj:\n", - "\ttorch.Size([8192])\n", - "model.layers.3.self_attn.v_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.3.self_attn.o_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.3.mlp:\n", - "\ttorch.Size([2048])\n", - "model.layers.3.mlp.down_proj:\n", - "\ttorch.Size([8192])\n", - "model.layers.4.self_attn.v_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.4.self_attn.o_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.4.mlp:\n", - "\ttorch.Size([2048])\n", - "model.layers.4.mlp.down_proj:\n", - "\ttorch.Size([8192])\n", - "model.layers.5.self_attn.v_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.5.self_attn.o_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.5.mlp:\n", - "\ttorch.Size([2048])\n", - "model.layers.5.mlp.down_proj:\n", - "\ttorch.Size([8192])\n", - "model.layers.6.self_attn.v_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.6.self_attn.o_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.6.mlp:\n", - "\ttorch.Size([2048])\n", - "model.layers.6.mlp.down_proj:\n", - "\ttorch.Size([8192])\n", - "model.layers.7.self_attn.v_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.7.self_attn.o_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.7.mlp:\n", - "\ttorch.Size([2048])\n", - "model.layers.7.mlp.down_proj:\n", - "\ttorch.Size([8192])\n", - "model.layers.8.self_attn.v_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.8.self_attn.o_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.8.mlp:\n", - "\ttorch.Size([2048])\n", - "model.layers.8.mlp.down_proj:\n", - "\ttorch.Size([8192])\n", - "model.layers.9.self_attn.v_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.9.self_attn.o_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.9.mlp:\n", - "\ttorch.Size([2048])\n", - "model.layers.9.mlp.down_proj:\n", - "\ttorch.Size([8192])\n", - "model.layers.10.self_attn.v_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.10.self_attn.o_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.10.mlp:\n", - "\ttorch.Size([2048])\n", - "model.layers.10.mlp.down_proj:\n", - "\ttorch.Size([8192])\n", - "model.layers.11.self_attn.v_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.11.self_attn.o_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.11.mlp:\n", - "\ttorch.Size([2048])\n", - "model.layers.11.mlp.down_proj:\n", - "\ttorch.Size([8192])\n", - "model.layers.12.self_attn.v_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.12.self_attn.o_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.12.mlp:\n", - "\ttorch.Size([2048])\n", - "model.layers.12.mlp.down_proj:\n", - "\ttorch.Size([8192])\n", - "model.layers.13.self_attn.v_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.13.self_attn.o_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.13.mlp:\n", - "\ttorch.Size([2048])\n", - "model.layers.13.mlp.down_proj:\n", - "\ttorch.Size([8192])\n", - "model.layers.14.self_attn.v_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.14.self_attn.o_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.14.mlp:\n", - "\ttorch.Size([2048])\n", - "model.layers.14.mlp.down_proj:\n", - "\ttorch.Size([8192])\n", - "model.layers.15.self_attn.v_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.15.self_attn.o_proj:\n", - "\ttorch.Size([2048])\n", - "model.layers.15.mlp:\n", - "\ttorch.Size([2048])\n", - "model.layers.15.mlp.down_proj:\n", - "\ttorch.Size([8192])\n" - ] - } - ], - "source": [ - "for name, data in smooth_pt.items():\n", - " print(f\"{name}:\")\n", - " print(f\"\\t{data.shape}\") \n" - ] - }, - { - "cell_type": "markdown", - "id": "a5af6801-9cb2-4a25-b53a-a13805fabfca", - "metadata": { - "jp-MarkdownHeadingCollapsed": true - }, - "source": [ - "# WGTS" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "4dcb4c84-e9d8-4c49-906b-b6d47d5dccae", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-09T05:54:15.980981Z", - "iopub.status.busy": "2026-03-09T05:54:15.980657Z", - "iopub.status.idle": "2026-03-09T05:54:15.984513Z", - "shell.execute_reply": "2026-03-09T05:54:15.983899Z", - "shell.execute_reply.started": "2026-03-09T05:54:15.980963Z" - }, - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['model.layers.0.self_attn.q_proj', 'model.layers.0.self_attn.k_proj', 'model.layers.0.self_attn.v_proj', 'model.layers.0.self_attn.o_proj', 'model.layers.0.mlp.up_proj', 'model.layers.0.mlp.gate_proj', 'model.layers.0.mlp.down_proj', 'model.layers.1.self_attn.q_proj', 'model.layers.1.self_attn.k_proj', 'model.layers.1.self_attn.v_proj', 'model.layers.1.self_attn.o_proj', 'model.layers.1.mlp.up_proj', 'model.layers.1.mlp.gate_proj', 'model.layers.1.mlp.down_proj', 'model.layers.2.self_attn.q_proj', 'model.layers.2.self_attn.k_proj', 'model.layers.2.self_attn.v_proj', 'model.layers.2.self_attn.o_proj', 'model.layers.2.mlp.up_proj', 'model.layers.2.mlp.gate_proj', 'model.layers.2.mlp.down_proj', 'model.layers.3.self_attn.q_proj', 'model.layers.3.self_attn.k_proj', 'model.layers.3.self_attn.v_proj', 'model.layers.3.self_attn.o_proj', 'model.layers.3.mlp.up_proj', 'model.layers.3.mlp.gate_proj', 'model.layers.3.mlp.down_proj', 'model.layers.4.self_attn.q_proj', 'model.layers.4.self_attn.k_proj', 'model.layers.4.self_attn.v_proj', 'model.layers.4.self_attn.o_proj', 'model.layers.4.mlp.up_proj', 'model.layers.4.mlp.gate_proj', 'model.layers.4.mlp.down_proj', 'model.layers.5.self_attn.q_proj', 'model.layers.5.self_attn.k_proj', 'model.layers.5.self_attn.v_proj', 'model.layers.5.self_attn.o_proj', 'model.layers.5.mlp.up_proj', 'model.layers.5.mlp.gate_proj', 'model.layers.5.mlp.down_proj', 'model.layers.6.self_attn.q_proj', 'model.layers.6.self_attn.k_proj', 'model.layers.6.self_attn.v_proj', 'model.layers.6.self_attn.o_proj', 'model.layers.6.mlp.up_proj', 'model.layers.6.mlp.gate_proj', 'model.layers.6.mlp.down_proj', 'model.layers.7.self_attn.q_proj', 'model.layers.7.self_attn.k_proj', 'model.layers.7.self_attn.v_proj', 'model.layers.7.self_attn.o_proj', 'model.layers.7.mlp.up_proj', 'model.layers.7.mlp.gate_proj', 'model.layers.7.mlp.down_proj', 'model.layers.8.self_attn.q_proj', 'model.layers.8.self_attn.k_proj', 'model.layers.8.self_attn.v_proj', 'model.layers.8.self_attn.o_proj', 'model.layers.8.mlp.up_proj', 'model.layers.8.mlp.gate_proj', 'model.layers.8.mlp.down_proj', 'model.layers.9.self_attn.q_proj', 'model.layers.9.self_attn.k_proj', 'model.layers.9.self_attn.v_proj', 'model.layers.9.self_attn.o_proj', 'model.layers.9.mlp.up_proj', 'model.layers.9.mlp.gate_proj', 'model.layers.9.mlp.down_proj', 'model.layers.10.self_attn.q_proj', 'model.layers.10.self_attn.k_proj', 'model.layers.10.self_attn.v_proj', 'model.layers.10.self_attn.o_proj', 'model.layers.10.mlp.up_proj', 'model.layers.10.mlp.gate_proj', 'model.layers.10.mlp.down_proj', 'model.layers.11.self_attn.q_proj', 'model.layers.11.self_attn.k_proj', 'model.layers.11.self_attn.v_proj', 'model.layers.11.self_attn.o_proj', 'model.layers.11.mlp.up_proj', 'model.layers.11.mlp.gate_proj', 'model.layers.11.mlp.down_proj', 'model.layers.12.self_attn.q_proj', 'model.layers.12.self_attn.k_proj', 'model.layers.12.self_attn.v_proj', 'model.layers.12.self_attn.o_proj', 'model.layers.12.mlp.up_proj', 'model.layers.12.mlp.gate_proj', 'model.layers.12.mlp.down_proj', 'model.layers.13.self_attn.q_proj', 'model.layers.13.self_attn.k_proj', 'model.layers.13.self_attn.v_proj', 'model.layers.13.self_attn.o_proj', 'model.layers.13.mlp.up_proj', 'model.layers.13.mlp.gate_proj', 'model.layers.13.mlp.down_proj', 'model.layers.14.self_attn.q_proj', 'model.layers.14.self_attn.k_proj', 'model.layers.14.self_attn.v_proj', 'model.layers.14.self_attn.o_proj', 'model.layers.14.mlp.up_proj', 'model.layers.14.mlp.gate_proj', 'model.layers.14.mlp.down_proj', 'model.layers.15.self_attn.q_proj', 'model.layers.15.self_attn.k_proj', 'model.layers.15.self_attn.v_proj', 'model.layers.15.self_attn.o_proj', 'model.layers.15.mlp.up_proj', 'model.layers.15.mlp.gate_proj', 'model.layers.15.mlp.down_proj'])" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "wgts_pt.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "2e0c33ea-bc93-4f8e-8779-aeb9aced5c1a", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-09T05:54:29.947054Z", - "iopub.status.busy": "2026-03-09T05:54:29.946817Z", - "iopub.status.idle": "2026-03-09T05:54:29.950480Z", - "shell.execute_reply": "2026-03-09T05:54:29.949882Z", - "shell.execute_reply.started": "2026-03-09T05:54:29.947037Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "112" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(wgts_pt.keys())" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "dea1b317-58ef-4b42-a04f-ec64d2a2e1dc", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-09T05:59:29.229677Z", - "iopub.status.busy": "2026-03-09T05:59:29.229437Z", - "iopub.status.idle": "2026-03-09T05:59:29.240418Z", - "shell.execute_reply": "2026-03-09T05:59:29.239728Z", - "shell.execute_reply.started": "2026-03-09T05:59:29.229659Z" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "model.layers.0.self_attn.q_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.0.self_attn.k_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.0.self_attn.v_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.0.self_attn.o_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.0.mlp.up_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.0.mlp.gate_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.0.mlp.down_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.1.self_attn.q_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.1.self_attn.k_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.1.self_attn.v_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.1.self_attn.o_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.1.mlp.up_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.1.mlp.gate_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.1.mlp.down_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.2.self_attn.q_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.2.self_attn.k_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.2.self_attn.v_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.2.self_attn.o_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.2.mlp.up_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.2.mlp.gate_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.2.mlp.down_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.3.self_attn.q_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.3.self_attn.k_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.3.self_attn.v_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.3.self_attn.o_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.3.mlp.up_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.3.mlp.gate_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.3.mlp.down_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.4.self_attn.q_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.4.self_attn.k_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.4.self_attn.v_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.4.self_attn.o_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.4.mlp.up_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.4.mlp.gate_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.4.mlp.down_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.5.self_attn.q_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.5.self_attn.k_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.5.self_attn.v_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.5.self_attn.o_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.5.mlp.up_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.5.mlp.gate_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.5.mlp.down_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.6.self_attn.q_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.6.self_attn.k_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.6.self_attn.v_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.6.self_attn.o_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.6.mlp.up_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.6.mlp.gate_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.6.mlp.down_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.7.self_attn.q_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.7.self_attn.k_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.7.self_attn.v_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.7.self_attn.o_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.7.mlp.up_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.7.mlp.gate_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.7.mlp.down_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.8.self_attn.q_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.8.self_attn.k_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.8.self_attn.v_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.8.self_attn.o_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.8.mlp.up_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.8.mlp.gate_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.8.mlp.down_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.9.self_attn.q_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.9.self_attn.k_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.9.self_attn.v_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.9.self_attn.o_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.9.mlp.up_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.9.mlp.gate_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.9.mlp.down_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.10.self_attn.q_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.10.self_attn.k_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.10.self_attn.v_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.10.self_attn.o_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.10.mlp.up_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.10.mlp.gate_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.10.mlp.down_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.11.self_attn.q_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.11.self_attn.k_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.11.self_attn.v_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.11.self_attn.o_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.11.mlp.up_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.11.mlp.gate_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.11.mlp.down_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.12.self_attn.q_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.12.self_attn.k_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.12.self_attn.v_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.12.self_attn.o_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.12.mlp.up_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.12.mlp.gate_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.12.mlp.down_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.13.self_attn.q_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.13.self_attn.k_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.13.self_attn.v_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.13.self_attn.o_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.13.mlp.up_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.13.mlp.gate_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.13.mlp.down_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.14.self_attn.q_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.14.self_attn.k_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.14.self_attn.v_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.14.self_attn.o_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.14.mlp.up_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.14.mlp.gate_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.14.mlp.down_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.15.self_attn.q_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.15.self_attn.k_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.15.self_attn.v_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([512, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.15.self_attn.o_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.15.mlp.up_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.15.mlp.gate_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([8192, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n", - "model.layers.15.mlp.down_proj:\n", - "\tchannels_dim : None\n", - "\tscale : None\n", - "\tzero : None\n", - "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", - "\t\tmin : None\n", - "\t\tmax : torch.Size([2048, 1, 1, 1])\n", - "\t\tratio : None\n", - "\trange_bound : None\n", - "\tquant_range : None\n" - ] - } - ], - "source": [ - "for name, data in wgts_pt.items():\n", - " print(f\"{name}:\")\n", - " for key, value in data.items():\n", - " print(f\"\\t{key} : {value if not isinstance(value, tuple) else [value[0].keys()]}\") \n", - " if isinstance(value, tuple):\n", - " for k, v in value[0].items():\n", - " print(f\"\\t\\t{k} : {v.shape if isinstance(v, torch.Tensor) else v}\") \n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "05601616-f6be-465f-b519-ab3e60378e75", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}