diff --git "a/info.ipynb" "b/info.ipynb" new file mode 100644--- /dev/null +++ "b/info.ipynb" @@ -0,0 +1,3007 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "c538cbb8-93b2-4a17-800f-779327b886a4", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-09T05:41:45.765015Z", + "iopub.status.busy": "2026-03-09T05:41:45.764811Z", + "iopub.status.idle": "2026-03-09T05:41:46.915088Z", + "shell.execute_reply": "2026-03-09T05:41:46.914431Z", + "shell.execute_reply.started": "2026-03-09T05:41:45.764995Z" + } + }, + "outputs": [], + "source": [ + "import torch" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "4ad98003-a21a-409d-a562-dbdd5f877d77", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-09T05:42:26.480073Z", + "iopub.status.busy": "2026-03-09T05:42:26.479743Z", + "iopub.status.idle": "2026-03-09T05:42:28.246635Z", + "shell.execute_reply": "2026-03-09T05:42:28.245948Z", + "shell.execute_reply.started": "2026-03-09T05:42:26.480051Z" + } + }, + "outputs": [], + "source": [ + "acts_pt = torch.load('acts.pt')\n", + "model_pt = torch.load('model.pt')\n", + "scale_pt = torch.load('scale.pt')\n", + "smooth_pt = torch.load('smooth.pt')\n", + "wgts_pt = torch.load('wgts.pt')" + ] + }, + { + "cell_type": "markdown", + "id": "42d4d961-dd9f-4a08-a1c2-b4622b094205", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "# ACTS" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a47e22e8-4ea3-4174-b4df-5f63c06211c5", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-09T05:42:36.542901Z", + "iopub.status.busy": "2026-03-09T05:42:36.542555Z", + "iopub.status.idle": "2026-03-09T05:42:36.547732Z", + "shell.execute_reply": "2026-03-09T05:42:36.547190Z", + "shell.execute_reply.started": "2026-03-09T05:42:36.542883Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['model.layers.0.self_attn.v_proj.input', 'model.layers.0.self_attn.o_proj.input', 'model.layers.0.mlp.up_proj.input', 'model.layers.0.mlp.down_proj.input', 'model.layers.1.self_attn.v_proj.input', 'model.layers.1.self_attn.o_proj.input', 'model.layers.1.mlp.up_proj.input', 'model.layers.1.mlp.down_proj.input', 'model.layers.2.self_attn.v_proj.input', 'model.layers.2.self_attn.o_proj.input', 'model.layers.2.mlp.up_proj.input', 'model.layers.2.mlp.down_proj.input', 'model.layers.3.self_attn.v_proj.input', 'model.layers.3.self_attn.o_proj.input', 'model.layers.3.mlp.up_proj.input', 'model.layers.3.mlp.down_proj.input', 'model.layers.4.self_attn.v_proj.input', 'model.layers.4.self_attn.o_proj.input', 'model.layers.4.mlp.up_proj.input', 'model.layers.4.mlp.down_proj.input', 'model.layers.5.self_attn.v_proj.input', 'model.layers.5.self_attn.o_proj.input', 'model.layers.5.mlp.up_proj.input', 'model.layers.5.mlp.down_proj.input', 'model.layers.6.self_attn.v_proj.input', 'model.layers.6.self_attn.o_proj.input', 'model.layers.6.mlp.up_proj.input', 'model.layers.6.mlp.down_proj.input', 'model.layers.7.self_attn.v_proj.input', 'model.layers.7.self_attn.o_proj.input', 'model.layers.7.mlp.up_proj.input', 'model.layers.7.mlp.down_proj.input', 'model.layers.8.self_attn.v_proj.input', 'model.layers.8.self_attn.o_proj.input', 'model.layers.8.mlp.up_proj.input', 'model.layers.8.mlp.down_proj.input', 'model.layers.9.self_attn.v_proj.input', 'model.layers.9.self_attn.o_proj.input', 'model.layers.9.mlp.up_proj.input', 'model.layers.9.mlp.down_proj.input', 'model.layers.10.self_attn.v_proj.input', 'model.layers.10.self_attn.o_proj.input', 'model.layers.10.mlp.up_proj.input', 'model.layers.10.mlp.down_proj.input', 'model.layers.11.self_attn.v_proj.input', 'model.layers.11.self_attn.o_proj.input', 'model.layers.11.mlp.up_proj.input', 'model.layers.11.mlp.down_proj.input', 'model.layers.12.self_attn.v_proj.input', 'model.layers.12.self_attn.o_proj.input', 'model.layers.12.mlp.up_proj.input', 'model.layers.12.mlp.down_proj.input', 'model.layers.13.self_attn.v_proj.input', 'model.layers.13.self_attn.o_proj.input', 'model.layers.13.mlp.up_proj.input', 'model.layers.13.mlp.down_proj.input', 'model.layers.14.self_attn.v_proj.input', 'model.layers.14.self_attn.o_proj.input', 'model.layers.14.mlp.up_proj.input', 'model.layers.14.mlp.down_proj.input', 'model.layers.15.self_attn.v_proj.input', 'model.layers.15.self_attn.o_proj.input', 'model.layers.15.mlp.up_proj.input', 'model.layers.15.mlp.down_proj.input'])" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "acts_pt.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "40a07467-4f68-43a6-ac16-b7edf2fd1b4f", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-09T05:43:29.261870Z", + "iopub.status.busy": "2026-03-09T05:43:29.261517Z", + "iopub.status.idle": "2026-03-09T05:43:29.265525Z", + "shell.execute_reply": "2026-03-09T05:43:29.264884Z", + "shell.execute_reply.started": "2026-03-09T05:43:29.261840Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(acts_pt.keys())" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "78e1dbf3-7a53-4f20-a103-159685de218a", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-09T05:49:48.802408Z", + "iopub.status.busy": "2026-03-09T05:49:48.802136Z", + "iopub.status.idle": "2026-03-09T05:49:48.819885Z", + "shell.execute_reply": "2026-03-09T05:49:48.819151Z", + "shell.execute_reply.started": "2026-03-09T05:49:48.802390Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "model.layers.0.self_attn.v_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[3.3594]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.0.self_attn.o_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[1.3340]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.0.mlp.up_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[2.4824]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.0.mlp.down_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[32.2812]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.1.self_attn.v_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[2.5547]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.1.self_attn.o_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[2.7031]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.1.mlp.up_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[20.4531]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.1.mlp.down_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[64.1875]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.2.self_attn.v_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[4.9609]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.2.self_attn.o_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[1.6816]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.2.mlp.up_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[6.1680]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.2.mlp.down_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[2.0547]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.3.self_attn.v_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[8.1484]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.3.self_attn.o_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[1.7969]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.3.mlp.up_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[5.7109]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.3.mlp.down_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[1.6279]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.4.self_attn.v_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[4.1914]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.4.self_attn.o_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[1.1338]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.4.mlp.up_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[7.5234]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.4.mlp.down_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[1.4834]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.5.self_attn.v_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[10.3750]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.5.self_attn.o_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[1.5781]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.5.mlp.up_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[4.9062]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.5.mlp.down_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[3.0605]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.6.self_attn.v_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[7.6797]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.6.self_attn.o_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[1.7715]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.6.mlp.up_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[4.6875]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.6.mlp.down_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[2.6094]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.7.self_attn.v_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[5.8320]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.7.self_attn.o_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[1.2686]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.7.mlp.up_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[6.3047]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.7.mlp.down_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[2.6582]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.8.self_attn.v_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[4.8750]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.8.self_attn.o_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[1.5840]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.8.mlp.up_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[6.0234]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.8.mlp.down_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[2.4062]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.9.self_attn.v_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[5.6289]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.9.self_attn.o_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[2.4727]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.9.mlp.up_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[5.0703]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.9.mlp.down_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[2.8164]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.10.self_attn.v_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[10.9766]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.10.self_attn.o_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[2.3223]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.10.mlp.up_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[4.2969]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.10.mlp.down_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[2.5566]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.11.self_attn.v_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[9.5547]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.11.self_attn.o_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[2.3672]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.11.mlp.up_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[3.5039]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.11.mlp.down_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[3.9570]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.12.self_attn.v_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[9.4766]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.12.self_attn.o_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[2.4551]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.12.mlp.up_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[5.2031]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.12.mlp.down_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[3.2871]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.13.self_attn.v_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[10.9766]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.13.self_attn.o_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[2.8203]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.13.mlp.up_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[5.4062]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.13.mlp.down_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[4.7148]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.14.self_attn.v_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[13.4219]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.14.self_attn.o_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[3.5957]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.14.mlp.up_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[4.6641]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.14.mlp.down_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[36.7188]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.15.self_attn.v_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[8.0312]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.15.self_attn.o_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[3.5938]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.15.mlp.up_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[9.0078]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.15.mlp.down_proj.input:\n", + "\tchannels_dim : -1\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : ({'min': None, 'max': tensor([[[[151.1250]]]]), 'ratio': None},)\n", + "\trange_bound : None\n", + "\tquant_range : None\n" + ] + } + ], + "source": [ + "for name, data in acts_pt.items():\n", + " print(f\"{name}:\")\n", + " for key, value in data.items():\n", + " print(f\"\\t{key} : {value}\") \n" + ] + }, + { + "cell_type": "markdown", + "id": "b38bd5a7-0450-4ac8-bcfd-a856d8b15fe0", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "# MODEL" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "b565a179-2c69-460e-b1f8-99b997114fcd", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-09T05:47:15.725801Z", + "iopub.status.busy": "2026-03-09T05:47:15.725573Z", + "iopub.status.idle": "2026-03-09T05:47:15.729324Z", + "shell.execute_reply": "2026-03-09T05:47:15.728722Z", + "shell.execute_reply.started": "2026-03-09T05:47:15.725784Z" + }, + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "odict_keys(['model.embed_tokens.weight', 'model.layers.0.self_attn.q_proj.weight', 'model.layers.0.self_attn.k_proj.weight', 'model.layers.0.self_attn.v_proj.weight', 'model.layers.0.self_attn.o_proj.weight', 'model.layers.0.mlp.gate_proj.weight', 'model.layers.0.mlp.up_proj.weight', 'model.layers.0.mlp.down_proj.weight', 'model.layers.0.input_layernorm.weight', 'model.layers.0.post_attention_layernorm.weight', 'model.layers.1.self_attn.q_proj.weight', 'model.layers.1.self_attn.k_proj.weight', 'model.layers.1.self_attn.v_proj.weight', 'model.layers.1.self_attn.o_proj.weight', 'model.layers.1.mlp.gate_proj.weight', 'model.layers.1.mlp.up_proj.weight', 'model.layers.1.mlp.down_proj.weight', 'model.layers.1.input_layernorm.weight', 'model.layers.1.post_attention_layernorm.weight', 'model.layers.2.self_attn.q_proj.weight', 'model.layers.2.self_attn.k_proj.weight', 'model.layers.2.self_attn.v_proj.weight', 'model.layers.2.self_attn.o_proj.weight', 'model.layers.2.mlp.gate_proj.weight', 'model.layers.2.mlp.up_proj.weight', 'model.layers.2.mlp.down_proj.weight', 'model.layers.2.input_layernorm.weight', 'model.layers.2.post_attention_layernorm.weight', 'model.layers.3.self_attn.q_proj.weight', 'model.layers.3.self_attn.k_proj.weight', 'model.layers.3.self_attn.v_proj.weight', 'model.layers.3.self_attn.o_proj.weight', 'model.layers.3.mlp.gate_proj.weight', 'model.layers.3.mlp.up_proj.weight', 'model.layers.3.mlp.down_proj.weight', 'model.layers.3.input_layernorm.weight', 'model.layers.3.post_attention_layernorm.weight', 'model.layers.4.self_attn.q_proj.weight', 'model.layers.4.self_attn.k_proj.weight', 'model.layers.4.self_attn.v_proj.weight', 'model.layers.4.self_attn.o_proj.weight', 'model.layers.4.mlp.gate_proj.weight', 'model.layers.4.mlp.up_proj.weight', 'model.layers.4.mlp.down_proj.weight', 'model.layers.4.input_layernorm.weight', 'model.layers.4.post_attention_layernorm.weight', 'model.layers.5.self_attn.q_proj.weight', 'model.layers.5.self_attn.k_proj.weight', 'model.layers.5.self_attn.v_proj.weight', 'model.layers.5.self_attn.o_proj.weight', 'model.layers.5.mlp.gate_proj.weight', 'model.layers.5.mlp.up_proj.weight', 'model.layers.5.mlp.down_proj.weight', 'model.layers.5.input_layernorm.weight', 'model.layers.5.post_attention_layernorm.weight', 'model.layers.6.self_attn.q_proj.weight', 'model.layers.6.self_attn.k_proj.weight', 'model.layers.6.self_attn.v_proj.weight', 'model.layers.6.self_attn.o_proj.weight', 'model.layers.6.mlp.gate_proj.weight', 'model.layers.6.mlp.up_proj.weight', 'model.layers.6.mlp.down_proj.weight', 'model.layers.6.input_layernorm.weight', 'model.layers.6.post_attention_layernorm.weight', 'model.layers.7.self_attn.q_proj.weight', 'model.layers.7.self_attn.k_proj.weight', 'model.layers.7.self_attn.v_proj.weight', 'model.layers.7.self_attn.o_proj.weight', 'model.layers.7.mlp.gate_proj.weight', 'model.layers.7.mlp.up_proj.weight', 'model.layers.7.mlp.down_proj.weight', 'model.layers.7.input_layernorm.weight', 'model.layers.7.post_attention_layernorm.weight', 'model.layers.8.self_attn.q_proj.weight', 'model.layers.8.self_attn.k_proj.weight', 'model.layers.8.self_attn.v_proj.weight', 'model.layers.8.self_attn.o_proj.weight', 'model.layers.8.mlp.gate_proj.weight', 'model.layers.8.mlp.up_proj.weight', 'model.layers.8.mlp.down_proj.weight', 'model.layers.8.input_layernorm.weight', 'model.layers.8.post_attention_layernorm.weight', 'model.layers.9.self_attn.q_proj.weight', 'model.layers.9.self_attn.k_proj.weight', 'model.layers.9.self_attn.v_proj.weight', 'model.layers.9.self_attn.o_proj.weight', 'model.layers.9.mlp.gate_proj.weight', 'model.layers.9.mlp.up_proj.weight', 'model.layers.9.mlp.down_proj.weight', 'model.layers.9.input_layernorm.weight', 'model.layers.9.post_attention_layernorm.weight', 'model.layers.10.self_attn.q_proj.weight', 'model.layers.10.self_attn.k_proj.weight', 'model.layers.10.self_attn.v_proj.weight', 'model.layers.10.self_attn.o_proj.weight', 'model.layers.10.mlp.gate_proj.weight', 'model.layers.10.mlp.up_proj.weight', 'model.layers.10.mlp.down_proj.weight', 'model.layers.10.input_layernorm.weight', 'model.layers.10.post_attention_layernorm.weight', 'model.layers.11.self_attn.q_proj.weight', 'model.layers.11.self_attn.k_proj.weight', 'model.layers.11.self_attn.v_proj.weight', 'model.layers.11.self_attn.o_proj.weight', 'model.layers.11.mlp.gate_proj.weight', 'model.layers.11.mlp.up_proj.weight', 'model.layers.11.mlp.down_proj.weight', 'model.layers.11.input_layernorm.weight', 'model.layers.11.post_attention_layernorm.weight', 'model.layers.12.self_attn.q_proj.weight', 'model.layers.12.self_attn.k_proj.weight', 'model.layers.12.self_attn.v_proj.weight', 'model.layers.12.self_attn.o_proj.weight', 'model.layers.12.mlp.gate_proj.weight', 'model.layers.12.mlp.up_proj.weight', 'model.layers.12.mlp.down_proj.weight', 'model.layers.12.input_layernorm.weight', 'model.layers.12.post_attention_layernorm.weight', 'model.layers.13.self_attn.q_proj.weight', 'model.layers.13.self_attn.k_proj.weight', 'model.layers.13.self_attn.v_proj.weight', 'model.layers.13.self_attn.o_proj.weight', 'model.layers.13.mlp.gate_proj.weight', 'model.layers.13.mlp.up_proj.weight', 'model.layers.13.mlp.down_proj.weight', 'model.layers.13.input_layernorm.weight', 'model.layers.13.post_attention_layernorm.weight', 'model.layers.14.self_attn.q_proj.weight', 'model.layers.14.self_attn.k_proj.weight', 'model.layers.14.self_attn.v_proj.weight', 'model.layers.14.self_attn.o_proj.weight', 'model.layers.14.mlp.gate_proj.weight', 'model.layers.14.mlp.up_proj.weight', 'model.layers.14.mlp.down_proj.weight', 'model.layers.14.input_layernorm.weight', 'model.layers.14.post_attention_layernorm.weight', 'model.layers.15.self_attn.q_proj.weight', 'model.layers.15.self_attn.k_proj.weight', 'model.layers.15.self_attn.v_proj.weight', 'model.layers.15.self_attn.o_proj.weight', 'model.layers.15.mlp.gate_proj.weight', 'model.layers.15.mlp.up_proj.weight', 'model.layers.15.mlp.down_proj.weight', 'model.layers.15.input_layernorm.weight', 'model.layers.15.post_attention_layernorm.weight', 'model.norm.weight', 'lm_head.weight'])" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_pt.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "b2b93d90-3364-41ac-9bb1-2be496d476c5", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-09T05:47:36.705138Z", + "iopub.status.busy": "2026-03-09T05:47:36.704893Z", + "iopub.status.idle": "2026-03-09T05:47:36.708559Z", + "shell.execute_reply": "2026-03-09T05:47:36.708047Z", + "shell.execute_reply.started": "2026-03-09T05:47:36.705118Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "147" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(model_pt.keys())" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "801c3a98-612b-4e14-9c9a-57150da7fef1", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-09T05:50:38.340142Z", + "iopub.status.busy": "2026-03-09T05:50:38.339930Z", + "iopub.status.idle": "2026-03-09T05:50:38.345306Z", + "shell.execute_reply": "2026-03-09T05:50:38.344564Z", + "shell.execute_reply.started": "2026-03-09T05:50:38.340127Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "model.embed_tokens.weight:\n", + "\ttorch.Size([128256, 2048])\n", + "model.layers.0.self_attn.q_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.0.self_attn.k_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.0.self_attn.v_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.0.self_attn.o_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.0.mlp.gate_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.0.mlp.up_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.0.mlp.down_proj.weight:\n", + "\ttorch.Size([2048, 8192])\n", + "model.layers.0.input_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.0.post_attention_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.1.self_attn.q_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.1.self_attn.k_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.1.self_attn.v_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.1.self_attn.o_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.1.mlp.gate_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.1.mlp.up_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.1.mlp.down_proj.weight:\n", + "\ttorch.Size([2048, 8192])\n", + "model.layers.1.input_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.1.post_attention_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.2.self_attn.q_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.2.self_attn.k_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.2.self_attn.v_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.2.self_attn.o_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.2.mlp.gate_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.2.mlp.up_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.2.mlp.down_proj.weight:\n", + "\ttorch.Size([2048, 8192])\n", + "model.layers.2.input_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.2.post_attention_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.3.self_attn.q_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.3.self_attn.k_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.3.self_attn.v_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.3.self_attn.o_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.3.mlp.gate_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.3.mlp.up_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.3.mlp.down_proj.weight:\n", + "\ttorch.Size([2048, 8192])\n", + "model.layers.3.input_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.3.post_attention_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.4.self_attn.q_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.4.self_attn.k_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.4.self_attn.v_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.4.self_attn.o_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.4.mlp.gate_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.4.mlp.up_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.4.mlp.down_proj.weight:\n", + "\ttorch.Size([2048, 8192])\n", + "model.layers.4.input_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.4.post_attention_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.5.self_attn.q_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.5.self_attn.k_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.5.self_attn.v_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.5.self_attn.o_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.5.mlp.gate_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.5.mlp.up_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.5.mlp.down_proj.weight:\n", + "\ttorch.Size([2048, 8192])\n", + "model.layers.5.input_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.5.post_attention_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.6.self_attn.q_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.6.self_attn.k_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.6.self_attn.v_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.6.self_attn.o_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.6.mlp.gate_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.6.mlp.up_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.6.mlp.down_proj.weight:\n", + "\ttorch.Size([2048, 8192])\n", + "model.layers.6.input_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.6.post_attention_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.7.self_attn.q_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.7.self_attn.k_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.7.self_attn.v_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.7.self_attn.o_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.7.mlp.gate_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.7.mlp.up_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.7.mlp.down_proj.weight:\n", + "\ttorch.Size([2048, 8192])\n", + "model.layers.7.input_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.7.post_attention_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.8.self_attn.q_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.8.self_attn.k_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.8.self_attn.v_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.8.self_attn.o_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.8.mlp.gate_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.8.mlp.up_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.8.mlp.down_proj.weight:\n", + "\ttorch.Size([2048, 8192])\n", + "model.layers.8.input_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.8.post_attention_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.9.self_attn.q_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.9.self_attn.k_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.9.self_attn.v_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.9.self_attn.o_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.9.mlp.gate_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.9.mlp.up_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.9.mlp.down_proj.weight:\n", + "\ttorch.Size([2048, 8192])\n", + "model.layers.9.input_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.9.post_attention_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.10.self_attn.q_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.10.self_attn.k_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.10.self_attn.v_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.10.self_attn.o_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.10.mlp.gate_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.10.mlp.up_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.10.mlp.down_proj.weight:\n", + "\ttorch.Size([2048, 8192])\n", + "model.layers.10.input_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.10.post_attention_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.11.self_attn.q_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.11.self_attn.k_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.11.self_attn.v_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.11.self_attn.o_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.11.mlp.gate_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.11.mlp.up_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.11.mlp.down_proj.weight:\n", + "\ttorch.Size([2048, 8192])\n", + "model.layers.11.input_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.11.post_attention_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.12.self_attn.q_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.12.self_attn.k_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.12.self_attn.v_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.12.self_attn.o_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.12.mlp.gate_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.12.mlp.up_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.12.mlp.down_proj.weight:\n", + "\ttorch.Size([2048, 8192])\n", + "model.layers.12.input_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.12.post_attention_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.13.self_attn.q_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.13.self_attn.k_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.13.self_attn.v_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.13.self_attn.o_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.13.mlp.gate_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.13.mlp.up_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.13.mlp.down_proj.weight:\n", + "\ttorch.Size([2048, 8192])\n", + "model.layers.13.input_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.13.post_attention_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.14.self_attn.q_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.14.self_attn.k_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.14.self_attn.v_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.14.self_attn.o_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.14.mlp.gate_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.14.mlp.up_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.14.mlp.down_proj.weight:\n", + "\ttorch.Size([2048, 8192])\n", + "model.layers.14.input_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.14.post_attention_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.15.self_attn.q_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.15.self_attn.k_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.15.self_attn.v_proj.weight:\n", + "\ttorch.Size([512, 2048])\n", + "model.layers.15.self_attn.o_proj.weight:\n", + "\ttorch.Size([2048, 2048])\n", + "model.layers.15.mlp.gate_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.15.mlp.up_proj.weight:\n", + "\ttorch.Size([8192, 2048])\n", + "model.layers.15.mlp.down_proj.weight:\n", + "\ttorch.Size([2048, 8192])\n", + "model.layers.15.input_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.layers.15.post_attention_layernorm.weight:\n", + "\ttorch.Size([2048])\n", + "model.norm.weight:\n", + "\ttorch.Size([2048])\n", + "lm_head.weight:\n", + "\ttorch.Size([128256, 2048])\n" + ] + } + ], + "source": [ + "for name, data in model_pt.items():\n", + " print(f\"{name}:\")\n", + " print(f\"\\t{data.shape}\") \n" + ] + }, + { + "cell_type": "markdown", + "id": "67ae1f03-2fd5-4e65-8076-157e747cc6bb", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "# SCALE" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "d9a045f2-5f0b-46ef-9063-c2bc4b1fb1d7", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-09T05:51:28.177054Z", + "iopub.status.busy": "2026-03-09T05:51:28.176813Z", + "iopub.status.idle": "2026-03-09T05:51:28.180641Z", + "shell.execute_reply": "2026-03-09T05:51:28.180032Z", + "shell.execute_reply.started": "2026-03-09T05:51:28.177035Z" + }, + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['model.layers.0.self_attn.q_proj.weight.scale.0', 'model.layers.0.self_attn.q_proj.weight.zero', 'model.layers.0.self_attn.k_proj.weight.scale.0', 'model.layers.0.self_attn.k_proj.weight.zero', 'model.layers.0.self_attn.v_proj.weight.scale.0', 'model.layers.0.self_attn.v_proj.weight.zero', 'model.layers.0.self_attn.o_proj.weight.scale.0', 'model.layers.0.self_attn.o_proj.weight.zero', 'model.layers.0.mlp.up_proj.weight.scale.0', 'model.layers.0.mlp.up_proj.weight.zero', 'model.layers.0.mlp.gate_proj.weight.scale.0', 'model.layers.0.mlp.gate_proj.weight.zero', 'model.layers.0.mlp.down_proj.weight.scale.0', 'model.layers.0.mlp.down_proj.weight.zero', 'model.layers.1.self_attn.q_proj.weight.scale.0', 'model.layers.1.self_attn.q_proj.weight.zero', 'model.layers.1.self_attn.k_proj.weight.scale.0', 'model.layers.1.self_attn.k_proj.weight.zero', 'model.layers.1.self_attn.v_proj.weight.scale.0', 'model.layers.1.self_attn.v_proj.weight.zero', 'model.layers.1.self_attn.o_proj.weight.scale.0', 'model.layers.1.self_attn.o_proj.weight.zero', 'model.layers.1.mlp.up_proj.weight.scale.0', 'model.layers.1.mlp.up_proj.weight.zero', 'model.layers.1.mlp.gate_proj.weight.scale.0', 'model.layers.1.mlp.gate_proj.weight.zero', 'model.layers.1.mlp.down_proj.weight.scale.0', 'model.layers.1.mlp.down_proj.weight.zero', 'model.layers.2.self_attn.q_proj.weight.scale.0', 'model.layers.2.self_attn.q_proj.weight.zero', 'model.layers.2.self_attn.k_proj.weight.scale.0', 'model.layers.2.self_attn.k_proj.weight.zero', 'model.layers.2.self_attn.v_proj.weight.scale.0', 'model.layers.2.self_attn.v_proj.weight.zero', 'model.layers.2.self_attn.o_proj.weight.scale.0', 'model.layers.2.self_attn.o_proj.weight.zero', 'model.layers.2.mlp.up_proj.weight.scale.0', 'model.layers.2.mlp.up_proj.weight.zero', 'model.layers.2.mlp.gate_proj.weight.scale.0', 'model.layers.2.mlp.gate_proj.weight.zero', 'model.layers.2.mlp.down_proj.weight.scale.0', 'model.layers.2.mlp.down_proj.weight.zero', 'model.layers.3.self_attn.q_proj.weight.scale.0', 'model.layers.3.self_attn.q_proj.weight.zero', 'model.layers.3.self_attn.k_proj.weight.scale.0', 'model.layers.3.self_attn.k_proj.weight.zero', 'model.layers.3.self_attn.v_proj.weight.scale.0', 'model.layers.3.self_attn.v_proj.weight.zero', 'model.layers.3.self_attn.o_proj.weight.scale.0', 'model.layers.3.self_attn.o_proj.weight.zero', 'model.layers.3.mlp.up_proj.weight.scale.0', 'model.layers.3.mlp.up_proj.weight.zero', 'model.layers.3.mlp.gate_proj.weight.scale.0', 'model.layers.3.mlp.gate_proj.weight.zero', 'model.layers.3.mlp.down_proj.weight.scale.0', 'model.layers.3.mlp.down_proj.weight.zero', 'model.layers.4.self_attn.q_proj.weight.scale.0', 'model.layers.4.self_attn.q_proj.weight.zero', 'model.layers.4.self_attn.k_proj.weight.scale.0', 'model.layers.4.self_attn.k_proj.weight.zero', 'model.layers.4.self_attn.v_proj.weight.scale.0', 'model.layers.4.self_attn.v_proj.weight.zero', 'model.layers.4.self_attn.o_proj.weight.scale.0', 'model.layers.4.self_attn.o_proj.weight.zero', 'model.layers.4.mlp.up_proj.weight.scale.0', 'model.layers.4.mlp.up_proj.weight.zero', 'model.layers.4.mlp.gate_proj.weight.scale.0', 'model.layers.4.mlp.gate_proj.weight.zero', 'model.layers.4.mlp.down_proj.weight.scale.0', 'model.layers.4.mlp.down_proj.weight.zero', 'model.layers.5.self_attn.q_proj.weight.scale.0', 'model.layers.5.self_attn.q_proj.weight.zero', 'model.layers.5.self_attn.k_proj.weight.scale.0', 'model.layers.5.self_attn.k_proj.weight.zero', 'model.layers.5.self_attn.v_proj.weight.scale.0', 'model.layers.5.self_attn.v_proj.weight.zero', 'model.layers.5.self_attn.o_proj.weight.scale.0', 'model.layers.5.self_attn.o_proj.weight.zero', 'model.layers.5.mlp.up_proj.weight.scale.0', 'model.layers.5.mlp.up_proj.weight.zero', 'model.layers.5.mlp.gate_proj.weight.scale.0', 'model.layers.5.mlp.gate_proj.weight.zero', 'model.layers.5.mlp.down_proj.weight.scale.0', 'model.layers.5.mlp.down_proj.weight.zero', 'model.layers.6.self_attn.q_proj.weight.scale.0', 'model.layers.6.self_attn.q_proj.weight.zero', 'model.layers.6.self_attn.k_proj.weight.scale.0', 'model.layers.6.self_attn.k_proj.weight.zero', 'model.layers.6.self_attn.v_proj.weight.scale.0', 'model.layers.6.self_attn.v_proj.weight.zero', 'model.layers.6.self_attn.o_proj.weight.scale.0', 'model.layers.6.self_attn.o_proj.weight.zero', 'model.layers.6.mlp.up_proj.weight.scale.0', 'model.layers.6.mlp.up_proj.weight.zero', 'model.layers.6.mlp.gate_proj.weight.scale.0', 'model.layers.6.mlp.gate_proj.weight.zero', 'model.layers.6.mlp.down_proj.weight.scale.0', 'model.layers.6.mlp.down_proj.weight.zero', 'model.layers.7.self_attn.q_proj.weight.scale.0', 'model.layers.7.self_attn.q_proj.weight.zero', 'model.layers.7.self_attn.k_proj.weight.scale.0', 'model.layers.7.self_attn.k_proj.weight.zero', 'model.layers.7.self_attn.v_proj.weight.scale.0', 'model.layers.7.self_attn.v_proj.weight.zero', 'model.layers.7.self_attn.o_proj.weight.scale.0', 'model.layers.7.self_attn.o_proj.weight.zero', 'model.layers.7.mlp.up_proj.weight.scale.0', 'model.layers.7.mlp.up_proj.weight.zero', 'model.layers.7.mlp.gate_proj.weight.scale.0', 'model.layers.7.mlp.gate_proj.weight.zero', 'model.layers.7.mlp.down_proj.weight.scale.0', 'model.layers.7.mlp.down_proj.weight.zero', 'model.layers.8.self_attn.q_proj.weight.scale.0', 'model.layers.8.self_attn.q_proj.weight.zero', 'model.layers.8.self_attn.k_proj.weight.scale.0', 'model.layers.8.self_attn.k_proj.weight.zero', 'model.layers.8.self_attn.v_proj.weight.scale.0', 'model.layers.8.self_attn.v_proj.weight.zero', 'model.layers.8.self_attn.o_proj.weight.scale.0', 'model.layers.8.self_attn.o_proj.weight.zero', 'model.layers.8.mlp.up_proj.weight.scale.0', 'model.layers.8.mlp.up_proj.weight.zero', 'model.layers.8.mlp.gate_proj.weight.scale.0', 'model.layers.8.mlp.gate_proj.weight.zero', 'model.layers.8.mlp.down_proj.weight.scale.0', 'model.layers.8.mlp.down_proj.weight.zero', 'model.layers.9.self_attn.q_proj.weight.scale.0', 'model.layers.9.self_attn.q_proj.weight.zero', 'model.layers.9.self_attn.k_proj.weight.scale.0', 'model.layers.9.self_attn.k_proj.weight.zero', 'model.layers.9.self_attn.v_proj.weight.scale.0', 'model.layers.9.self_attn.v_proj.weight.zero', 'model.layers.9.self_attn.o_proj.weight.scale.0', 'model.layers.9.self_attn.o_proj.weight.zero', 'model.layers.9.mlp.up_proj.weight.scale.0', 'model.layers.9.mlp.up_proj.weight.zero', 'model.layers.9.mlp.gate_proj.weight.scale.0', 'model.layers.9.mlp.gate_proj.weight.zero', 'model.layers.9.mlp.down_proj.weight.scale.0', 'model.layers.9.mlp.down_proj.weight.zero', 'model.layers.10.self_attn.q_proj.weight.scale.0', 'model.layers.10.self_attn.q_proj.weight.zero', 'model.layers.10.self_attn.k_proj.weight.scale.0', 'model.layers.10.self_attn.k_proj.weight.zero', 'model.layers.10.self_attn.v_proj.weight.scale.0', 'model.layers.10.self_attn.v_proj.weight.zero', 'model.layers.10.self_attn.o_proj.weight.scale.0', 'model.layers.10.self_attn.o_proj.weight.zero', 'model.layers.10.mlp.up_proj.weight.scale.0', 'model.layers.10.mlp.up_proj.weight.zero', 'model.layers.10.mlp.gate_proj.weight.scale.0', 'model.layers.10.mlp.gate_proj.weight.zero', 'model.layers.10.mlp.down_proj.weight.scale.0', 'model.layers.10.mlp.down_proj.weight.zero', 'model.layers.11.self_attn.q_proj.weight.scale.0', 'model.layers.11.self_attn.q_proj.weight.zero', 'model.layers.11.self_attn.k_proj.weight.scale.0', 'model.layers.11.self_attn.k_proj.weight.zero', 'model.layers.11.self_attn.v_proj.weight.scale.0', 'model.layers.11.self_attn.v_proj.weight.zero', 'model.layers.11.self_attn.o_proj.weight.scale.0', 'model.layers.11.self_attn.o_proj.weight.zero', 'model.layers.11.mlp.up_proj.weight.scale.0', 'model.layers.11.mlp.up_proj.weight.zero', 'model.layers.11.mlp.gate_proj.weight.scale.0', 'model.layers.11.mlp.gate_proj.weight.zero', 'model.layers.11.mlp.down_proj.weight.scale.0', 'model.layers.11.mlp.down_proj.weight.zero', 'model.layers.12.self_attn.q_proj.weight.scale.0', 'model.layers.12.self_attn.q_proj.weight.zero', 'model.layers.12.self_attn.k_proj.weight.scale.0', 'model.layers.12.self_attn.k_proj.weight.zero', 'model.layers.12.self_attn.v_proj.weight.scale.0', 'model.layers.12.self_attn.v_proj.weight.zero', 'model.layers.12.self_attn.o_proj.weight.scale.0', 'model.layers.12.self_attn.o_proj.weight.zero', 'model.layers.12.mlp.up_proj.weight.scale.0', 'model.layers.12.mlp.up_proj.weight.zero', 'model.layers.12.mlp.gate_proj.weight.scale.0', 'model.layers.12.mlp.gate_proj.weight.zero', 'model.layers.12.mlp.down_proj.weight.scale.0', 'model.layers.12.mlp.down_proj.weight.zero', 'model.layers.13.self_attn.q_proj.weight.scale.0', 'model.layers.13.self_attn.q_proj.weight.zero', 'model.layers.13.self_attn.k_proj.weight.scale.0', 'model.layers.13.self_attn.k_proj.weight.zero', 'model.layers.13.self_attn.v_proj.weight.scale.0', 'model.layers.13.self_attn.v_proj.weight.zero', 'model.layers.13.self_attn.o_proj.weight.scale.0', 'model.layers.13.self_attn.o_proj.weight.zero', 'model.layers.13.mlp.up_proj.weight.scale.0', 'model.layers.13.mlp.up_proj.weight.zero', 'model.layers.13.mlp.gate_proj.weight.scale.0', 'model.layers.13.mlp.gate_proj.weight.zero', 'model.layers.13.mlp.down_proj.weight.scale.0', 'model.layers.13.mlp.down_proj.weight.zero', 'model.layers.14.self_attn.q_proj.weight.scale.0', 'model.layers.14.self_attn.q_proj.weight.zero', 'model.layers.14.self_attn.k_proj.weight.scale.0', 'model.layers.14.self_attn.k_proj.weight.zero', 'model.layers.14.self_attn.v_proj.weight.scale.0', 'model.layers.14.self_attn.v_proj.weight.zero', 'model.layers.14.self_attn.o_proj.weight.scale.0', 'model.layers.14.self_attn.o_proj.weight.zero', 'model.layers.14.mlp.up_proj.weight.scale.0', 'model.layers.14.mlp.up_proj.weight.zero', 'model.layers.14.mlp.gate_proj.weight.scale.0', 'model.layers.14.mlp.gate_proj.weight.zero', 'model.layers.14.mlp.down_proj.weight.scale.0', 'model.layers.14.mlp.down_proj.weight.zero', 'model.layers.15.self_attn.q_proj.weight.scale.0', 'model.layers.15.self_attn.q_proj.weight.zero', 'model.layers.15.self_attn.k_proj.weight.scale.0', 'model.layers.15.self_attn.k_proj.weight.zero', 'model.layers.15.self_attn.v_proj.weight.scale.0', 'model.layers.15.self_attn.v_proj.weight.zero', 'model.layers.15.self_attn.o_proj.weight.scale.0', 'model.layers.15.self_attn.o_proj.weight.zero', 'model.layers.15.mlp.up_proj.weight.scale.0', 'model.layers.15.mlp.up_proj.weight.zero', 'model.layers.15.mlp.gate_proj.weight.scale.0', 'model.layers.15.mlp.gate_proj.weight.zero', 'model.layers.15.mlp.down_proj.weight.scale.0', 'model.layers.15.mlp.down_proj.weight.zero'])" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "scale_pt.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "366a18d2-a7a8-4cb7-87f2-be5640cea0f8", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-09T05:51:36.026092Z", + "iopub.status.busy": "2026-03-09T05:51:36.025876Z", + "iopub.status.idle": "2026-03-09T05:51:36.029556Z", + "shell.execute_reply": "2026-03-09T05:51:36.028789Z", + "shell.execute_reply.started": "2026-03-09T05:51:36.026078Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "224" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(scale_pt.keys())" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "9d51465f-16ce-4fbf-b6cc-9d06421580bc", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-09T05:52:05.925258Z", + "iopub.status.busy": "2026-03-09T05:52:05.924766Z", + "iopub.status.idle": "2026-03-09T05:52:05.931048Z", + "shell.execute_reply": "2026-03-09T05:52:05.930469Z", + "shell.execute_reply.started": "2026-03-09T05:52:05.925238Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "model.layers.0.self_attn.q_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.0.self_attn.q_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.0.self_attn.k_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.0.self_attn.k_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.0.self_attn.v_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.0.self_attn.v_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.0.self_attn.o_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.0.self_attn.o_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.0.mlp.up_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.0.mlp.up_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.0.mlp.gate_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.0.mlp.gate_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.0.mlp.down_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.0.mlp.down_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.1.self_attn.q_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.1.self_attn.q_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.1.self_attn.k_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.1.self_attn.k_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.1.self_attn.v_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.1.self_attn.v_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.1.self_attn.o_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.1.self_attn.o_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.1.mlp.up_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.1.mlp.up_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.1.mlp.gate_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.1.mlp.gate_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.1.mlp.down_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.1.mlp.down_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.2.self_attn.q_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.2.self_attn.q_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.2.self_attn.k_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.2.self_attn.k_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.2.self_attn.v_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.2.self_attn.v_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.2.self_attn.o_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.2.self_attn.o_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.2.mlp.up_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.2.mlp.up_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.2.mlp.gate_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.2.mlp.gate_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.2.mlp.down_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.2.mlp.down_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.3.self_attn.q_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.3.self_attn.q_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.3.self_attn.k_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.3.self_attn.k_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.3.self_attn.v_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.3.self_attn.v_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.3.self_attn.o_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.3.self_attn.o_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.3.mlp.up_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.3.mlp.up_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.3.mlp.gate_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.3.mlp.gate_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.3.mlp.down_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.3.mlp.down_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.4.self_attn.q_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.4.self_attn.q_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.4.self_attn.k_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.4.self_attn.k_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.4.self_attn.v_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.4.self_attn.v_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.4.self_attn.o_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.4.self_attn.o_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.4.mlp.up_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.4.mlp.up_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.4.mlp.gate_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.4.mlp.gate_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.4.mlp.down_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.4.mlp.down_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.5.self_attn.q_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.5.self_attn.q_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.5.self_attn.k_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.5.self_attn.k_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.5.self_attn.v_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.5.self_attn.v_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.5.self_attn.o_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.5.self_attn.o_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.5.mlp.up_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.5.mlp.up_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.5.mlp.gate_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.5.mlp.gate_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.5.mlp.down_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.5.mlp.down_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.6.self_attn.q_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.6.self_attn.q_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.6.self_attn.k_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.6.self_attn.k_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.6.self_attn.v_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.6.self_attn.v_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.6.self_attn.o_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.6.self_attn.o_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.6.mlp.up_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.6.mlp.up_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.6.mlp.gate_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.6.mlp.gate_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.6.mlp.down_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.6.mlp.down_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.7.self_attn.q_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.7.self_attn.q_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.7.self_attn.k_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.7.self_attn.k_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.7.self_attn.v_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.7.self_attn.v_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.7.self_attn.o_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.7.self_attn.o_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.7.mlp.up_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.7.mlp.up_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.7.mlp.gate_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.7.mlp.gate_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.7.mlp.down_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.7.mlp.down_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.8.self_attn.q_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.8.self_attn.q_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.8.self_attn.k_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.8.self_attn.k_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.8.self_attn.v_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.8.self_attn.v_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.8.self_attn.o_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.8.self_attn.o_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.8.mlp.up_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.8.mlp.up_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.8.mlp.gate_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.8.mlp.gate_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.8.mlp.down_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.8.mlp.down_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.9.self_attn.q_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.9.self_attn.q_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.9.self_attn.k_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.9.self_attn.k_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.9.self_attn.v_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.9.self_attn.v_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.9.self_attn.o_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.9.self_attn.o_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.9.mlp.up_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.9.mlp.up_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.9.mlp.gate_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.9.mlp.gate_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.9.mlp.down_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.9.mlp.down_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.10.self_attn.q_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.10.self_attn.q_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.10.self_attn.k_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.10.self_attn.k_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.10.self_attn.v_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.10.self_attn.v_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.10.self_attn.o_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.10.self_attn.o_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.10.mlp.up_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.10.mlp.up_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.10.mlp.gate_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.10.mlp.gate_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.10.mlp.down_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.10.mlp.down_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.11.self_attn.q_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.11.self_attn.q_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.11.self_attn.k_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.11.self_attn.k_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.11.self_attn.v_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.11.self_attn.v_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.11.self_attn.o_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.11.self_attn.o_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.11.mlp.up_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.11.mlp.up_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.11.mlp.gate_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.11.mlp.gate_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.11.mlp.down_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.11.mlp.down_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.12.self_attn.q_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.12.self_attn.q_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.12.self_attn.k_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.12.self_attn.k_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.12.self_attn.v_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.12.self_attn.v_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.12.self_attn.o_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.12.self_attn.o_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.12.mlp.up_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.12.mlp.up_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.12.mlp.gate_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.12.mlp.gate_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.12.mlp.down_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.12.mlp.down_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.13.self_attn.q_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.13.self_attn.q_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.13.self_attn.k_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.13.self_attn.k_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.13.self_attn.v_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.13.self_attn.v_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.13.self_attn.o_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.13.self_attn.o_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.13.mlp.up_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.13.mlp.up_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.13.mlp.gate_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.13.mlp.gate_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.13.mlp.down_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.13.mlp.down_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.14.self_attn.q_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.14.self_attn.q_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.14.self_attn.k_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.14.self_attn.k_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.14.self_attn.v_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.14.self_attn.v_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.14.self_attn.o_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.14.self_attn.o_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.14.mlp.up_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.14.mlp.up_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.14.mlp.gate_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.14.mlp.gate_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.14.mlp.down_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.14.mlp.down_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.15.self_attn.q_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.15.self_attn.q_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.15.self_attn.k_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.15.self_attn.k_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.15.self_attn.v_proj.weight.scale.0:\n", + "\ttorch.Size([512, 1, 1, 1])\n", + "model.layers.15.self_attn.v_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.15.self_attn.o_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.15.self_attn.o_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.15.mlp.up_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.15.mlp.up_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.15.mlp.gate_proj.weight.scale.0:\n", + "\ttorch.Size([8192, 1, 1, 1])\n", + "model.layers.15.mlp.gate_proj.weight.zero:\n", + "\ttorch.Size([])\n", + "model.layers.15.mlp.down_proj.weight.scale.0:\n", + "\ttorch.Size([2048, 1, 1, 1])\n", + "model.layers.15.mlp.down_proj.weight.zero:\n", + "\ttorch.Size([])\n" + ] + } + ], + "source": [ + "for name, data in scale_pt.items():\n", + " print(f\"{name}:\")\n", + " print(f\"\\t{data.shape}\") \n" + ] + }, + { + "cell_type": "markdown", + "id": "cc90682f-1272-4cc0-9c93-cbdf659ccd58", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-09T05:53:19.570302Z", + "iopub.status.busy": "2026-03-09T05:53:19.570002Z", + "iopub.status.idle": "2026-03-09T05:53:19.572786Z", + "shell.execute_reply": "2026-03-09T05:53:19.572116Z", + "shell.execute_reply.started": "2026-03-09T05:53:19.570286Z" + }, + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "# SMOOTH" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "2ea77800-d47a-4823-a205-32f5ed40036d", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-09T05:53:26.852073Z", + "iopub.status.busy": "2026-03-09T05:53:26.851843Z", + "iopub.status.idle": "2026-03-09T05:53:26.855415Z", + "shell.execute_reply": "2026-03-09T05:53:26.854825Z", + "shell.execute_reply.started": "2026-03-09T05:53:26.852055Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['model.layers.0.self_attn.v_proj', 'model.layers.0.self_attn.o_proj', 'model.layers.0.mlp', 'model.layers.0.mlp.down_proj', 'model.layers.1.self_attn.v_proj', 'model.layers.1.self_attn.o_proj', 'model.layers.1.mlp', 'model.layers.1.mlp.down_proj', 'model.layers.2.self_attn.v_proj', 'model.layers.2.self_attn.o_proj', 'model.layers.2.mlp', 'model.layers.2.mlp.down_proj', 'model.layers.3.self_attn.v_proj', 'model.layers.3.self_attn.o_proj', 'model.layers.3.mlp', 'model.layers.3.mlp.down_proj', 'model.layers.4.self_attn.v_proj', 'model.layers.4.self_attn.o_proj', 'model.layers.4.mlp', 'model.layers.4.mlp.down_proj', 'model.layers.5.self_attn.v_proj', 'model.layers.5.self_attn.o_proj', 'model.layers.5.mlp', 'model.layers.5.mlp.down_proj', 'model.layers.6.self_attn.v_proj', 'model.layers.6.self_attn.o_proj', 'model.layers.6.mlp', 'model.layers.6.mlp.down_proj', 'model.layers.7.self_attn.v_proj', 'model.layers.7.self_attn.o_proj', 'model.layers.7.mlp', 'model.layers.7.mlp.down_proj', 'model.layers.8.self_attn.v_proj', 'model.layers.8.self_attn.o_proj', 'model.layers.8.mlp', 'model.layers.8.mlp.down_proj', 'model.layers.9.self_attn.v_proj', 'model.layers.9.self_attn.o_proj', 'model.layers.9.mlp', 'model.layers.9.mlp.down_proj', 'model.layers.10.self_attn.v_proj', 'model.layers.10.self_attn.o_proj', 'model.layers.10.mlp', 'model.layers.10.mlp.down_proj', 'model.layers.11.self_attn.v_proj', 'model.layers.11.self_attn.o_proj', 'model.layers.11.mlp', 'model.layers.11.mlp.down_proj', 'model.layers.12.self_attn.v_proj', 'model.layers.12.self_attn.o_proj', 'model.layers.12.mlp', 'model.layers.12.mlp.down_proj', 'model.layers.13.self_attn.v_proj', 'model.layers.13.self_attn.o_proj', 'model.layers.13.mlp', 'model.layers.13.mlp.down_proj', 'model.layers.14.self_attn.v_proj', 'model.layers.14.self_attn.o_proj', 'model.layers.14.mlp', 'model.layers.14.mlp.down_proj', 'model.layers.15.self_attn.v_proj', 'model.layers.15.self_attn.o_proj', 'model.layers.15.mlp', 'model.layers.15.mlp.down_proj'])" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "smooth_pt.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "dd554ff6-a83c-4b53-8494-7164370b740f", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-09T05:53:33.987119Z", + "iopub.status.busy": "2026-03-09T05:53:33.986875Z", + "iopub.status.idle": "2026-03-09T05:53:33.990691Z", + "shell.execute_reply": "2026-03-09T05:53:33.989994Z", + "shell.execute_reply.started": "2026-03-09T05:53:33.987101Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "64" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(smooth_pt.keys())" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "31648435-f194-4bc5-aa47-a118a5287b1e", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-09T05:53:39.162115Z", + "iopub.status.busy": "2026-03-09T05:53:39.161651Z", + "iopub.status.idle": "2026-03-09T05:53:39.166122Z", + "shell.execute_reply": "2026-03-09T05:53:39.165513Z", + "shell.execute_reply.started": "2026-03-09T05:53:39.162074Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "model.layers.0.self_attn.v_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.0.self_attn.o_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.0.mlp:\n", + "\ttorch.Size([2048])\n", + "model.layers.0.mlp.down_proj:\n", + "\ttorch.Size([8192])\n", + "model.layers.1.self_attn.v_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.1.self_attn.o_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.1.mlp:\n", + "\ttorch.Size([2048])\n", + "model.layers.1.mlp.down_proj:\n", + "\ttorch.Size([8192])\n", + "model.layers.2.self_attn.v_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.2.self_attn.o_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.2.mlp:\n", + "\ttorch.Size([2048])\n", + "model.layers.2.mlp.down_proj:\n", + "\ttorch.Size([8192])\n", + "model.layers.3.self_attn.v_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.3.self_attn.o_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.3.mlp:\n", + "\ttorch.Size([2048])\n", + "model.layers.3.mlp.down_proj:\n", + "\ttorch.Size([8192])\n", + "model.layers.4.self_attn.v_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.4.self_attn.o_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.4.mlp:\n", + "\ttorch.Size([2048])\n", + "model.layers.4.mlp.down_proj:\n", + "\ttorch.Size([8192])\n", + "model.layers.5.self_attn.v_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.5.self_attn.o_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.5.mlp:\n", + "\ttorch.Size([2048])\n", + "model.layers.5.mlp.down_proj:\n", + "\ttorch.Size([8192])\n", + "model.layers.6.self_attn.v_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.6.self_attn.o_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.6.mlp:\n", + "\ttorch.Size([2048])\n", + "model.layers.6.mlp.down_proj:\n", + "\ttorch.Size([8192])\n", + "model.layers.7.self_attn.v_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.7.self_attn.o_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.7.mlp:\n", + "\ttorch.Size([2048])\n", + "model.layers.7.mlp.down_proj:\n", + "\ttorch.Size([8192])\n", + "model.layers.8.self_attn.v_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.8.self_attn.o_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.8.mlp:\n", + "\ttorch.Size([2048])\n", + "model.layers.8.mlp.down_proj:\n", + "\ttorch.Size([8192])\n", + "model.layers.9.self_attn.v_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.9.self_attn.o_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.9.mlp:\n", + "\ttorch.Size([2048])\n", + "model.layers.9.mlp.down_proj:\n", + "\ttorch.Size([8192])\n", + "model.layers.10.self_attn.v_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.10.self_attn.o_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.10.mlp:\n", + "\ttorch.Size([2048])\n", + "model.layers.10.mlp.down_proj:\n", + "\ttorch.Size([8192])\n", + "model.layers.11.self_attn.v_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.11.self_attn.o_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.11.mlp:\n", + "\ttorch.Size([2048])\n", + "model.layers.11.mlp.down_proj:\n", + "\ttorch.Size([8192])\n", + "model.layers.12.self_attn.v_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.12.self_attn.o_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.12.mlp:\n", + "\ttorch.Size([2048])\n", + "model.layers.12.mlp.down_proj:\n", + "\ttorch.Size([8192])\n", + "model.layers.13.self_attn.v_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.13.self_attn.o_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.13.mlp:\n", + "\ttorch.Size([2048])\n", + "model.layers.13.mlp.down_proj:\n", + "\ttorch.Size([8192])\n", + "model.layers.14.self_attn.v_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.14.self_attn.o_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.14.mlp:\n", + "\ttorch.Size([2048])\n", + "model.layers.14.mlp.down_proj:\n", + "\ttorch.Size([8192])\n", + "model.layers.15.self_attn.v_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.15.self_attn.o_proj:\n", + "\ttorch.Size([2048])\n", + "model.layers.15.mlp:\n", + "\ttorch.Size([2048])\n", + "model.layers.15.mlp.down_proj:\n", + "\ttorch.Size([8192])\n" + ] + } + ], + "source": [ + "for name, data in smooth_pt.items():\n", + " print(f\"{name}:\")\n", + " print(f\"\\t{data.shape}\") \n" + ] + }, + { + "cell_type": "markdown", + "id": "a5af6801-9cb2-4a25-b53a-a13805fabfca", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "# WGTS" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "4dcb4c84-e9d8-4c49-906b-b6d47d5dccae", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-09T05:54:15.980981Z", + "iopub.status.busy": "2026-03-09T05:54:15.980657Z", + "iopub.status.idle": "2026-03-09T05:54:15.984513Z", + "shell.execute_reply": "2026-03-09T05:54:15.983899Z", + "shell.execute_reply.started": "2026-03-09T05:54:15.980963Z" + }, + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['model.layers.0.self_attn.q_proj', 'model.layers.0.self_attn.k_proj', 'model.layers.0.self_attn.v_proj', 'model.layers.0.self_attn.o_proj', 'model.layers.0.mlp.up_proj', 'model.layers.0.mlp.gate_proj', 'model.layers.0.mlp.down_proj', 'model.layers.1.self_attn.q_proj', 'model.layers.1.self_attn.k_proj', 'model.layers.1.self_attn.v_proj', 'model.layers.1.self_attn.o_proj', 'model.layers.1.mlp.up_proj', 'model.layers.1.mlp.gate_proj', 'model.layers.1.mlp.down_proj', 'model.layers.2.self_attn.q_proj', 'model.layers.2.self_attn.k_proj', 'model.layers.2.self_attn.v_proj', 'model.layers.2.self_attn.o_proj', 'model.layers.2.mlp.up_proj', 'model.layers.2.mlp.gate_proj', 'model.layers.2.mlp.down_proj', 'model.layers.3.self_attn.q_proj', 'model.layers.3.self_attn.k_proj', 'model.layers.3.self_attn.v_proj', 'model.layers.3.self_attn.o_proj', 'model.layers.3.mlp.up_proj', 'model.layers.3.mlp.gate_proj', 'model.layers.3.mlp.down_proj', 'model.layers.4.self_attn.q_proj', 'model.layers.4.self_attn.k_proj', 'model.layers.4.self_attn.v_proj', 'model.layers.4.self_attn.o_proj', 'model.layers.4.mlp.up_proj', 'model.layers.4.mlp.gate_proj', 'model.layers.4.mlp.down_proj', 'model.layers.5.self_attn.q_proj', 'model.layers.5.self_attn.k_proj', 'model.layers.5.self_attn.v_proj', 'model.layers.5.self_attn.o_proj', 'model.layers.5.mlp.up_proj', 'model.layers.5.mlp.gate_proj', 'model.layers.5.mlp.down_proj', 'model.layers.6.self_attn.q_proj', 'model.layers.6.self_attn.k_proj', 'model.layers.6.self_attn.v_proj', 'model.layers.6.self_attn.o_proj', 'model.layers.6.mlp.up_proj', 'model.layers.6.mlp.gate_proj', 'model.layers.6.mlp.down_proj', 'model.layers.7.self_attn.q_proj', 'model.layers.7.self_attn.k_proj', 'model.layers.7.self_attn.v_proj', 'model.layers.7.self_attn.o_proj', 'model.layers.7.mlp.up_proj', 'model.layers.7.mlp.gate_proj', 'model.layers.7.mlp.down_proj', 'model.layers.8.self_attn.q_proj', 'model.layers.8.self_attn.k_proj', 'model.layers.8.self_attn.v_proj', 'model.layers.8.self_attn.o_proj', 'model.layers.8.mlp.up_proj', 'model.layers.8.mlp.gate_proj', 'model.layers.8.mlp.down_proj', 'model.layers.9.self_attn.q_proj', 'model.layers.9.self_attn.k_proj', 'model.layers.9.self_attn.v_proj', 'model.layers.9.self_attn.o_proj', 'model.layers.9.mlp.up_proj', 'model.layers.9.mlp.gate_proj', 'model.layers.9.mlp.down_proj', 'model.layers.10.self_attn.q_proj', 'model.layers.10.self_attn.k_proj', 'model.layers.10.self_attn.v_proj', 'model.layers.10.self_attn.o_proj', 'model.layers.10.mlp.up_proj', 'model.layers.10.mlp.gate_proj', 'model.layers.10.mlp.down_proj', 'model.layers.11.self_attn.q_proj', 'model.layers.11.self_attn.k_proj', 'model.layers.11.self_attn.v_proj', 'model.layers.11.self_attn.o_proj', 'model.layers.11.mlp.up_proj', 'model.layers.11.mlp.gate_proj', 'model.layers.11.mlp.down_proj', 'model.layers.12.self_attn.q_proj', 'model.layers.12.self_attn.k_proj', 'model.layers.12.self_attn.v_proj', 'model.layers.12.self_attn.o_proj', 'model.layers.12.mlp.up_proj', 'model.layers.12.mlp.gate_proj', 'model.layers.12.mlp.down_proj', 'model.layers.13.self_attn.q_proj', 'model.layers.13.self_attn.k_proj', 'model.layers.13.self_attn.v_proj', 'model.layers.13.self_attn.o_proj', 'model.layers.13.mlp.up_proj', 'model.layers.13.mlp.gate_proj', 'model.layers.13.mlp.down_proj', 'model.layers.14.self_attn.q_proj', 'model.layers.14.self_attn.k_proj', 'model.layers.14.self_attn.v_proj', 'model.layers.14.self_attn.o_proj', 'model.layers.14.mlp.up_proj', 'model.layers.14.mlp.gate_proj', 'model.layers.14.mlp.down_proj', 'model.layers.15.self_attn.q_proj', 'model.layers.15.self_attn.k_proj', 'model.layers.15.self_attn.v_proj', 'model.layers.15.self_attn.o_proj', 'model.layers.15.mlp.up_proj', 'model.layers.15.mlp.gate_proj', 'model.layers.15.mlp.down_proj'])" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wgts_pt.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "2e0c33ea-bc93-4f8e-8779-aeb9aced5c1a", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-09T05:54:29.947054Z", + "iopub.status.busy": "2026-03-09T05:54:29.946817Z", + "iopub.status.idle": "2026-03-09T05:54:29.950480Z", + "shell.execute_reply": "2026-03-09T05:54:29.949882Z", + "shell.execute_reply.started": "2026-03-09T05:54:29.947037Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "112" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(wgts_pt.keys())" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "dea1b317-58ef-4b42-a04f-ec64d2a2e1dc", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-09T05:59:29.229677Z", + "iopub.status.busy": "2026-03-09T05:59:29.229437Z", + "iopub.status.idle": "2026-03-09T05:59:29.240418Z", + "shell.execute_reply": "2026-03-09T05:59:29.239728Z", + "shell.execute_reply.started": "2026-03-09T05:59:29.229659Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "model.layers.0.self_attn.q_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.0.self_attn.k_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.0.self_attn.v_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.0.self_attn.o_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.0.mlp.up_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.0.mlp.gate_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.0.mlp.down_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.1.self_attn.q_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.1.self_attn.k_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.1.self_attn.v_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.1.self_attn.o_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.1.mlp.up_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.1.mlp.gate_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.1.mlp.down_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.2.self_attn.q_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.2.self_attn.k_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.2.self_attn.v_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.2.self_attn.o_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.2.mlp.up_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.2.mlp.gate_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.2.mlp.down_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.3.self_attn.q_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.3.self_attn.k_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.3.self_attn.v_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.3.self_attn.o_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.3.mlp.up_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.3.mlp.gate_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.3.mlp.down_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.4.self_attn.q_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.4.self_attn.k_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.4.self_attn.v_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.4.self_attn.o_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.4.mlp.up_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.4.mlp.gate_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.4.mlp.down_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.5.self_attn.q_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.5.self_attn.k_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.5.self_attn.v_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.5.self_attn.o_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.5.mlp.up_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.5.mlp.gate_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.5.mlp.down_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.6.self_attn.q_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.6.self_attn.k_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.6.self_attn.v_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.6.self_attn.o_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.6.mlp.up_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.6.mlp.gate_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.6.mlp.down_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.7.self_attn.q_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.7.self_attn.k_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.7.self_attn.v_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.7.self_attn.o_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.7.mlp.up_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.7.mlp.gate_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.7.mlp.down_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.8.self_attn.q_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.8.self_attn.k_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.8.self_attn.v_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.8.self_attn.o_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.8.mlp.up_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.8.mlp.gate_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.8.mlp.down_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.9.self_attn.q_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.9.self_attn.k_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.9.self_attn.v_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.9.self_attn.o_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.9.mlp.up_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.9.mlp.gate_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.9.mlp.down_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.10.self_attn.q_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.10.self_attn.k_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.10.self_attn.v_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.10.self_attn.o_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.10.mlp.up_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.10.mlp.gate_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.10.mlp.down_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.11.self_attn.q_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.11.self_attn.k_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.11.self_attn.v_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.11.self_attn.o_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.11.mlp.up_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.11.mlp.gate_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.11.mlp.down_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.12.self_attn.q_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.12.self_attn.k_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.12.self_attn.v_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.12.self_attn.o_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.12.mlp.up_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.12.mlp.gate_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.12.mlp.down_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.13.self_attn.q_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.13.self_attn.k_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.13.self_attn.v_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.13.self_attn.o_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.13.mlp.up_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.13.mlp.gate_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.13.mlp.down_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.14.self_attn.q_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.14.self_attn.k_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.14.self_attn.v_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.14.self_attn.o_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.14.mlp.up_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.14.mlp.gate_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.14.mlp.down_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.15.self_attn.q_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.15.self_attn.k_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.15.self_attn.v_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([512, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.15.self_attn.o_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.15.mlp.up_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.15.mlp.gate_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([8192, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n", + "model.layers.15.mlp.down_proj:\n", + "\tchannels_dim : None\n", + "\tscale : None\n", + "\tzero : None\n", + "\tdynamic_range : [dict_keys(['min', 'max', 'ratio'])]\n", + "\t\tmin : None\n", + "\t\tmax : torch.Size([2048, 1, 1, 1])\n", + "\t\tratio : None\n", + "\trange_bound : None\n", + "\tquant_range : None\n" + ] + } + ], + "source": [ + "for name, data in wgts_pt.items():\n", + " print(f\"{name}:\")\n", + " for key, value in data.items():\n", + " print(f\"\\t{key} : {value if not isinstance(value, tuple) else [value[0].keys()]}\") \n", + " if isinstance(value, tuple):\n", + " for k, v in value[0].items():\n", + " print(f\"\\t\\t{k} : {v.shape if isinstance(v, torch.Tensor) else v}\") \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05601616-f6be-465f-b519-ab3e60378e75", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}