diff --git "a/model_analysis.json" "b/model_analysis.json" new file mode 100644--- /dev/null +++ "b/model_analysis.json" @@ -0,0 +1,4683 @@ +{ + "layer_types": { + "transformer": 391 + }, + "parameter_counts": { + "transformer.time_embed.time_mlp.0.weight": 262144, + "transformer.time_embed.time_mlp.0.bias": 1024, + "transformer.time_embed.time_mlp.2.weight": 1048576, + "transformer.time_embed.time_mlp.2.bias": 1024, + "transformer.text_embed.text_embed.weight": 254600, + "transformer.input_embed.proj.weight": 307200, + "transformer.input_embed.proj.bias": 1024, + "transformer.input_embed.conv_pos_embed.conv1d.0.weight": 2031616, + "transformer.input_embed.conv_pos_embed.conv1d.0.bias": 1024, + "transformer.input_embed.conv_pos_embed.conv1d.2.weight": 2031616, + "transformer.input_embed.conv_pos_embed.conv1d.2.bias": 1024, + "transformer.layers.0.1.g": 1024, + "transformer.layers.0.2.to_q.weight": 1048576, + "transformer.layers.0.2.to_q.bias": 1024, + "transformer.layers.0.2.to_k.weight": 1048576, + "transformer.layers.0.2.to_k.bias": 1024, + "transformer.layers.0.2.to_v.weight": 1048576, + "transformer.layers.0.2.to_v.bias": 1024, + "transformer.layers.0.2.to_out.0.weight": 1048576, + "transformer.layers.0.2.to_out.0.bias": 1024, + "transformer.layers.0.3.g": 1024, + "transformer.layers.0.4.ff.0.0.weight": 4194304, + "transformer.layers.0.4.ff.0.0.bias": 4096, + "transformer.layers.0.4.ff.2.weight": 4194304, + "transformer.layers.0.4.ff.2.bias": 1024, + "transformer.layers.1.1.g": 1024, + "transformer.layers.1.2.to_q.weight": 1048576, + "transformer.layers.1.2.to_q.bias": 1024, + "transformer.layers.1.2.to_k.weight": 1048576, + "transformer.layers.1.2.to_k.bias": 1024, + "transformer.layers.1.2.to_v.weight": 1048576, + "transformer.layers.1.2.to_v.bias": 1024, + "transformer.layers.1.2.to_out.0.weight": 1048576, + "transformer.layers.1.2.to_out.0.bias": 1024, + "transformer.layers.1.3.g": 1024, + "transformer.layers.1.4.ff.0.0.weight": 4194304, + "transformer.layers.1.4.ff.0.0.bias": 4096, + "transformer.layers.1.4.ff.2.weight": 4194304, + "transformer.layers.1.4.ff.2.bias": 1024, + "transformer.layers.2.1.g": 1024, + "transformer.layers.2.2.to_q.weight": 1048576, + "transformer.layers.2.2.to_q.bias": 1024, + "transformer.layers.2.2.to_k.weight": 1048576, + "transformer.layers.2.2.to_k.bias": 1024, + "transformer.layers.2.2.to_v.weight": 1048576, + "transformer.layers.2.2.to_v.bias": 1024, + "transformer.layers.2.2.to_out.0.weight": 1048576, + "transformer.layers.2.2.to_out.0.bias": 1024, + "transformer.layers.2.3.g": 1024, + "transformer.layers.2.4.ff.0.0.weight": 4194304, + "transformer.layers.2.4.ff.0.0.bias": 4096, + "transformer.layers.2.4.ff.2.weight": 4194304, + "transformer.layers.2.4.ff.2.bias": 1024, + "transformer.layers.3.1.g": 1024, + "transformer.layers.3.2.to_q.weight": 1048576, + "transformer.layers.3.2.to_q.bias": 1024, + "transformer.layers.3.2.to_k.weight": 1048576, + "transformer.layers.3.2.to_k.bias": 1024, + "transformer.layers.3.2.to_v.weight": 1048576, + "transformer.layers.3.2.to_v.bias": 1024, + "transformer.layers.3.2.to_out.0.weight": 1048576, + "transformer.layers.3.2.to_out.0.bias": 1024, + "transformer.layers.3.3.g": 1024, + "transformer.layers.3.4.ff.0.0.weight": 4194304, + "transformer.layers.3.4.ff.0.0.bias": 4096, + "transformer.layers.3.4.ff.2.weight": 4194304, + "transformer.layers.3.4.ff.2.bias": 1024, + "transformer.layers.4.1.g": 1024, + "transformer.layers.4.2.to_q.weight": 1048576, + "transformer.layers.4.2.to_q.bias": 1024, + "transformer.layers.4.2.to_k.weight": 1048576, + "transformer.layers.4.2.to_k.bias": 1024, + "transformer.layers.4.2.to_v.weight": 1048576, + "transformer.layers.4.2.to_v.bias": 1024, + "transformer.layers.4.2.to_out.0.weight": 1048576, + "transformer.layers.4.2.to_out.0.bias": 1024, + "transformer.layers.4.3.g": 1024, + "transformer.layers.4.4.ff.0.0.weight": 4194304, + "transformer.layers.4.4.ff.0.0.bias": 4096, + "transformer.layers.4.4.ff.2.weight": 4194304, + "transformer.layers.4.4.ff.2.bias": 1024, + "transformer.layers.5.1.g": 1024, + "transformer.layers.5.2.to_q.weight": 1048576, + "transformer.layers.5.2.to_q.bias": 1024, + "transformer.layers.5.2.to_k.weight": 1048576, + "transformer.layers.5.2.to_k.bias": 1024, + "transformer.layers.5.2.to_v.weight": 1048576, + "transformer.layers.5.2.to_v.bias": 1024, + "transformer.layers.5.2.to_out.0.weight": 1048576, + "transformer.layers.5.2.to_out.0.bias": 1024, + "transformer.layers.5.3.g": 1024, + "transformer.layers.5.4.ff.0.0.weight": 4194304, + "transformer.layers.5.4.ff.0.0.bias": 4096, + "transformer.layers.5.4.ff.2.weight": 4194304, + "transformer.layers.5.4.ff.2.bias": 1024, + "transformer.layers.6.1.g": 1024, + "transformer.layers.6.2.to_q.weight": 1048576, + "transformer.layers.6.2.to_q.bias": 1024, + "transformer.layers.6.2.to_k.weight": 1048576, + "transformer.layers.6.2.to_k.bias": 1024, + "transformer.layers.6.2.to_v.weight": 1048576, + "transformer.layers.6.2.to_v.bias": 1024, + "transformer.layers.6.2.to_out.0.weight": 1048576, + "transformer.layers.6.2.to_out.0.bias": 1024, + "transformer.layers.6.3.g": 1024, + "transformer.layers.6.4.ff.0.0.weight": 4194304, + "transformer.layers.6.4.ff.0.0.bias": 4096, + "transformer.layers.6.4.ff.2.weight": 4194304, + "transformer.layers.6.4.ff.2.bias": 1024, + "transformer.layers.7.1.g": 1024, + "transformer.layers.7.2.to_q.weight": 1048576, + "transformer.layers.7.2.to_q.bias": 1024, + "transformer.layers.7.2.to_k.weight": 1048576, + "transformer.layers.7.2.to_k.bias": 1024, + "transformer.layers.7.2.to_v.weight": 1048576, + "transformer.layers.7.2.to_v.bias": 1024, + "transformer.layers.7.2.to_out.0.weight": 1048576, + "transformer.layers.7.2.to_out.0.bias": 1024, + "transformer.layers.7.3.g": 1024, + "transformer.layers.7.4.ff.0.0.weight": 4194304, + "transformer.layers.7.4.ff.0.0.bias": 4096, + "transformer.layers.7.4.ff.2.weight": 4194304, + "transformer.layers.7.4.ff.2.bias": 1024, + "transformer.layers.8.1.g": 1024, + "transformer.layers.8.2.to_q.weight": 1048576, + "transformer.layers.8.2.to_q.bias": 1024, + "transformer.layers.8.2.to_k.weight": 1048576, + "transformer.layers.8.2.to_k.bias": 1024, + "transformer.layers.8.2.to_v.weight": 1048576, + "transformer.layers.8.2.to_v.bias": 1024, + "transformer.layers.8.2.to_out.0.weight": 1048576, + "transformer.layers.8.2.to_out.0.bias": 1024, + "transformer.layers.8.3.g": 1024, + "transformer.layers.8.4.ff.0.0.weight": 4194304, + "transformer.layers.8.4.ff.0.0.bias": 4096, + "transformer.layers.8.4.ff.2.weight": 4194304, + "transformer.layers.8.4.ff.2.bias": 1024, + "transformer.layers.9.1.g": 1024, + "transformer.layers.9.2.to_q.weight": 1048576, + "transformer.layers.9.2.to_q.bias": 1024, + "transformer.layers.9.2.to_k.weight": 1048576, + "transformer.layers.9.2.to_k.bias": 1024, + "transformer.layers.9.2.to_v.weight": 1048576, + "transformer.layers.9.2.to_v.bias": 1024, + "transformer.layers.9.2.to_out.0.weight": 1048576, + "transformer.layers.9.2.to_out.0.bias": 1024, + "transformer.layers.9.3.g": 1024, + "transformer.layers.9.4.ff.0.0.weight": 4194304, + "transformer.layers.9.4.ff.0.0.bias": 4096, + "transformer.layers.9.4.ff.2.weight": 4194304, + "transformer.layers.9.4.ff.2.bias": 1024, + "transformer.layers.10.1.g": 1024, + "transformer.layers.10.2.to_q.weight": 1048576, + "transformer.layers.10.2.to_q.bias": 1024, + "transformer.layers.10.2.to_k.weight": 1048576, + "transformer.layers.10.2.to_k.bias": 1024, + "transformer.layers.10.2.to_v.weight": 1048576, + "transformer.layers.10.2.to_v.bias": 1024, + "transformer.layers.10.2.to_out.0.weight": 1048576, + "transformer.layers.10.2.to_out.0.bias": 1024, + "transformer.layers.10.3.g": 1024, + "transformer.layers.10.4.ff.0.0.weight": 4194304, + "transformer.layers.10.4.ff.0.0.bias": 4096, + "transformer.layers.10.4.ff.2.weight": 4194304, + "transformer.layers.10.4.ff.2.bias": 1024, + "transformer.layers.11.1.g": 1024, + "transformer.layers.11.2.to_q.weight": 1048576, + "transformer.layers.11.2.to_q.bias": 1024, + "transformer.layers.11.2.to_k.weight": 1048576, + "transformer.layers.11.2.to_k.bias": 1024, + "transformer.layers.11.2.to_v.weight": 1048576, + "transformer.layers.11.2.to_v.bias": 1024, + "transformer.layers.11.2.to_out.0.weight": 1048576, + "transformer.layers.11.2.to_out.0.bias": 1024, + "transformer.layers.11.3.g": 1024, + "transformer.layers.11.4.ff.0.0.weight": 4194304, + "transformer.layers.11.4.ff.0.0.bias": 4096, + "transformer.layers.11.4.ff.2.weight": 4194304, + "transformer.layers.11.4.ff.2.bias": 1024, + "transformer.layers.12.1.g": 1024, + "transformer.layers.12.2.to_q.weight": 1048576, + "transformer.layers.12.2.to_q.bias": 1024, + "transformer.layers.12.2.to_k.weight": 1048576, + "transformer.layers.12.2.to_k.bias": 1024, + "transformer.layers.12.2.to_v.weight": 1048576, + "transformer.layers.12.2.to_v.bias": 1024, + "transformer.layers.12.2.to_out.0.weight": 1048576, + "transformer.layers.12.2.to_out.0.bias": 1024, + "transformer.layers.12.3.g": 1024, + "transformer.layers.12.4.ff.0.0.weight": 4194304, + "transformer.layers.12.4.ff.0.0.bias": 4096, + "transformer.layers.12.4.ff.2.weight": 4194304, + "transformer.layers.12.4.ff.2.bias": 1024, + "transformer.layers.13.0.weight": 2097152, + "transformer.layers.13.1.g": 1024, + "transformer.layers.13.2.to_q.weight": 1048576, + "transformer.layers.13.2.to_q.bias": 1024, + "transformer.layers.13.2.to_k.weight": 1048576, + "transformer.layers.13.2.to_k.bias": 1024, + "transformer.layers.13.2.to_v.weight": 1048576, + "transformer.layers.13.2.to_v.bias": 1024, + "transformer.layers.13.2.to_out.0.weight": 1048576, + "transformer.layers.13.2.to_out.0.bias": 1024, + "transformer.layers.13.3.g": 1024, + "transformer.layers.13.4.ff.0.0.weight": 4194304, + "transformer.layers.13.4.ff.0.0.bias": 4096, + "transformer.layers.13.4.ff.2.weight": 4194304, + "transformer.layers.13.4.ff.2.bias": 1024, + "transformer.layers.14.0.weight": 2097152, + "transformer.layers.14.1.g": 1024, + "transformer.layers.14.2.to_q.weight": 1048576, + "transformer.layers.14.2.to_q.bias": 1024, + "transformer.layers.14.2.to_k.weight": 1048576, + "transformer.layers.14.2.to_k.bias": 1024, + "transformer.layers.14.2.to_v.weight": 1048576, + "transformer.layers.14.2.to_v.bias": 1024, + "transformer.layers.14.2.to_out.0.weight": 1048576, + "transformer.layers.14.2.to_out.0.bias": 1024, + "transformer.layers.14.3.g": 1024, + "transformer.layers.14.4.ff.0.0.weight": 4194304, + "transformer.layers.14.4.ff.0.0.bias": 4096, + "transformer.layers.14.4.ff.2.weight": 4194304, + "transformer.layers.14.4.ff.2.bias": 1024, + "transformer.layers.15.0.weight": 2097152, + "transformer.layers.15.1.g": 1024, + "transformer.layers.15.2.to_q.weight": 1048576, + "transformer.layers.15.2.to_q.bias": 1024, + "transformer.layers.15.2.to_k.weight": 1048576, + "transformer.layers.15.2.to_k.bias": 1024, + "transformer.layers.15.2.to_v.weight": 1048576, + "transformer.layers.15.2.to_v.bias": 1024, + "transformer.layers.15.2.to_out.0.weight": 1048576, + "transformer.layers.15.2.to_out.0.bias": 1024, + "transformer.layers.15.3.g": 1024, + "transformer.layers.15.4.ff.0.0.weight": 4194304, + "transformer.layers.15.4.ff.0.0.bias": 4096, + "transformer.layers.15.4.ff.2.weight": 4194304, + "transformer.layers.15.4.ff.2.bias": 1024, + "transformer.layers.16.0.weight": 2097152, + "transformer.layers.16.1.g": 1024, + "transformer.layers.16.2.to_q.weight": 1048576, + "transformer.layers.16.2.to_q.bias": 1024, + "transformer.layers.16.2.to_k.weight": 1048576, + "transformer.layers.16.2.to_k.bias": 1024, + "transformer.layers.16.2.to_v.weight": 1048576, + "transformer.layers.16.2.to_v.bias": 1024, + "transformer.layers.16.2.to_out.0.weight": 1048576, + "transformer.layers.16.2.to_out.0.bias": 1024, + "transformer.layers.16.3.g": 1024, + "transformer.layers.16.4.ff.0.0.weight": 4194304, + "transformer.layers.16.4.ff.0.0.bias": 4096, + "transformer.layers.16.4.ff.2.weight": 4194304, + "transformer.layers.16.4.ff.2.bias": 1024, + "transformer.layers.17.0.weight": 2097152, + "transformer.layers.17.1.g": 1024, + "transformer.layers.17.2.to_q.weight": 1048576, + "transformer.layers.17.2.to_q.bias": 1024, + "transformer.layers.17.2.to_k.weight": 1048576, + "transformer.layers.17.2.to_k.bias": 1024, + "transformer.layers.17.2.to_v.weight": 1048576, + "transformer.layers.17.2.to_v.bias": 1024, + "transformer.layers.17.2.to_out.0.weight": 1048576, + "transformer.layers.17.2.to_out.0.bias": 1024, + "transformer.layers.17.3.g": 1024, + "transformer.layers.17.4.ff.0.0.weight": 4194304, + "transformer.layers.17.4.ff.0.0.bias": 4096, + "transformer.layers.17.4.ff.2.weight": 4194304, + "transformer.layers.17.4.ff.2.bias": 1024, + "transformer.layers.18.0.weight": 2097152, + "transformer.layers.18.1.g": 1024, + "transformer.layers.18.2.to_q.weight": 1048576, + "transformer.layers.18.2.to_q.bias": 1024, + "transformer.layers.18.2.to_k.weight": 1048576, + "transformer.layers.18.2.to_k.bias": 1024, + "transformer.layers.18.2.to_v.weight": 1048576, + "transformer.layers.18.2.to_v.bias": 1024, + "transformer.layers.18.2.to_out.0.weight": 1048576, + "transformer.layers.18.2.to_out.0.bias": 1024, + "transformer.layers.18.3.g": 1024, + "transformer.layers.18.4.ff.0.0.weight": 4194304, + "transformer.layers.18.4.ff.0.0.bias": 4096, + "transformer.layers.18.4.ff.2.weight": 4194304, + "transformer.layers.18.4.ff.2.bias": 1024, + "transformer.layers.19.0.weight": 2097152, + "transformer.layers.19.1.g": 1024, + "transformer.layers.19.2.to_q.weight": 1048576, + "transformer.layers.19.2.to_q.bias": 1024, + "transformer.layers.19.2.to_k.weight": 1048576, + "transformer.layers.19.2.to_k.bias": 1024, + "transformer.layers.19.2.to_v.weight": 1048576, + "transformer.layers.19.2.to_v.bias": 1024, + "transformer.layers.19.2.to_out.0.weight": 1048576, + "transformer.layers.19.2.to_out.0.bias": 1024, + "transformer.layers.19.3.g": 1024, + "transformer.layers.19.4.ff.0.0.weight": 4194304, + "transformer.layers.19.4.ff.0.0.bias": 4096, + "transformer.layers.19.4.ff.2.weight": 4194304, + "transformer.layers.19.4.ff.2.bias": 1024, + "transformer.layers.20.0.weight": 2097152, + "transformer.layers.20.1.g": 1024, + "transformer.layers.20.2.to_q.weight": 1048576, + "transformer.layers.20.2.to_q.bias": 1024, + "transformer.layers.20.2.to_k.weight": 1048576, + "transformer.layers.20.2.to_k.bias": 1024, + "transformer.layers.20.2.to_v.weight": 1048576, + "transformer.layers.20.2.to_v.bias": 1024, + "transformer.layers.20.2.to_out.0.weight": 1048576, + "transformer.layers.20.2.to_out.0.bias": 1024, + "transformer.layers.20.3.g": 1024, + "transformer.layers.20.4.ff.0.0.weight": 4194304, + "transformer.layers.20.4.ff.0.0.bias": 4096, + "transformer.layers.20.4.ff.2.weight": 4194304, + "transformer.layers.20.4.ff.2.bias": 1024, + "transformer.layers.21.0.weight": 2097152, + "transformer.layers.21.1.g": 1024, + "transformer.layers.21.2.to_q.weight": 1048576, + "transformer.layers.21.2.to_q.bias": 1024, + "transformer.layers.21.2.to_k.weight": 1048576, + "transformer.layers.21.2.to_k.bias": 1024, + "transformer.layers.21.2.to_v.weight": 1048576, + "transformer.layers.21.2.to_v.bias": 1024, + "transformer.layers.21.2.to_out.0.weight": 1048576, + "transformer.layers.21.2.to_out.0.bias": 1024, + "transformer.layers.21.3.g": 1024, + "transformer.layers.21.4.ff.0.0.weight": 4194304, + "transformer.layers.21.4.ff.0.0.bias": 4096, + "transformer.layers.21.4.ff.2.weight": 4194304, + "transformer.layers.21.4.ff.2.bias": 1024, + "transformer.layers.22.0.weight": 2097152, + "transformer.layers.22.1.g": 1024, + "transformer.layers.22.2.to_q.weight": 1048576, + "transformer.layers.22.2.to_q.bias": 1024, + "transformer.layers.22.2.to_k.weight": 1048576, + "transformer.layers.22.2.to_k.bias": 1024, + "transformer.layers.22.2.to_v.weight": 1048576, + "transformer.layers.22.2.to_v.bias": 1024, + "transformer.layers.22.2.to_out.0.weight": 1048576, + "transformer.layers.22.2.to_out.0.bias": 1024, + "transformer.layers.22.3.g": 1024, + "transformer.layers.22.4.ff.0.0.weight": 4194304, + "transformer.layers.22.4.ff.0.0.bias": 4096, + "transformer.layers.22.4.ff.2.weight": 4194304, + "transformer.layers.22.4.ff.2.bias": 1024, + "transformer.layers.23.0.weight": 2097152, + "transformer.layers.23.1.g": 1024, + "transformer.layers.23.2.to_q.weight": 1048576, + "transformer.layers.23.2.to_q.bias": 1024, + "transformer.layers.23.2.to_k.weight": 1048576, + "transformer.layers.23.2.to_k.bias": 1024, + "transformer.layers.23.2.to_v.weight": 1048576, + "transformer.layers.23.2.to_v.bias": 1024, + "transformer.layers.23.2.to_out.0.weight": 1048576, + "transformer.layers.23.2.to_out.0.bias": 1024, + "transformer.layers.23.3.g": 1024, + "transformer.layers.23.4.ff.0.0.weight": 4194304, + "transformer.layers.23.4.ff.0.0.bias": 4096, + "transformer.layers.23.4.ff.2.weight": 4194304, + "transformer.layers.23.4.ff.2.bias": 1024, + "transformer.layers.24.0.weight": 2097152, + "transformer.layers.24.1.g": 1024, + "transformer.layers.24.2.to_q.weight": 1048576, + "transformer.layers.24.2.to_q.bias": 1024, + "transformer.layers.24.2.to_k.weight": 1048576, + "transformer.layers.24.2.to_k.bias": 1024, + "transformer.layers.24.2.to_v.weight": 1048576, + "transformer.layers.24.2.to_v.bias": 1024, + "transformer.layers.24.2.to_out.0.weight": 1048576, + "transformer.layers.24.2.to_out.0.bias": 1024, + "transformer.layers.24.3.g": 1024, + "transformer.layers.24.4.ff.0.0.weight": 4194304, + "transformer.layers.24.4.ff.0.0.bias": 4096, + "transformer.layers.24.4.ff.2.weight": 4194304, + "transformer.layers.24.4.ff.2.bias": 1024, + "transformer.layers.25.0.weight": 2097152, + "transformer.layers.25.1.g": 1024, + "transformer.layers.25.2.to_q.weight": 1048576, + "transformer.layers.25.2.to_q.bias": 1024, + "transformer.layers.25.2.to_k.weight": 1048576, + "transformer.layers.25.2.to_k.bias": 1024, + "transformer.layers.25.2.to_v.weight": 1048576, + "transformer.layers.25.2.to_v.bias": 1024, + "transformer.layers.25.2.to_out.0.weight": 1048576, + "transformer.layers.25.2.to_out.0.bias": 1024, + "transformer.layers.25.3.g": 1024, + "transformer.layers.25.4.ff.0.0.weight": 4194304, + "transformer.layers.25.4.ff.0.0.bias": 4096, + "transformer.layers.25.4.ff.2.weight": 4194304, + "transformer.layers.25.4.ff.2.bias": 1024, + "transformer.norm_out.g": 1024, + "transformer.proj_out.weight": 102400, + "transformer.proj_out.bias": 100 + }, + "important_layers": [ + "transformer.time_embed.time_mlp.0.weight", + "transformer.time_embed.time_mlp.2.weight", + "transformer.text_embed.text_embed.weight", + "transformer.input_embed.proj.weight", + "transformer.input_embed.conv_pos_embed.conv1d.0.weight", + "transformer.input_embed.conv_pos_embed.conv1d.2.weight", + "transformer.layers.0.2.to_q.weight", + "transformer.layers.0.2.to_k.weight", + "transformer.layers.0.2.to_v.weight", + "transformer.layers.0.2.to_out.0.weight", + "transformer.layers.0.4.ff.0.0.weight", + "transformer.layers.0.4.ff.2.weight", + "transformer.layers.1.2.to_q.weight", + "transformer.layers.1.2.to_k.weight", + "transformer.layers.1.2.to_v.weight", + "transformer.layers.1.2.to_out.0.weight", + "transformer.layers.1.4.ff.0.0.weight", + "transformer.layers.1.4.ff.2.weight", + "transformer.layers.2.2.to_q.weight", + "transformer.layers.2.2.to_k.weight", + "transformer.layers.2.2.to_v.weight", + "transformer.layers.2.2.to_out.0.weight", + "transformer.layers.2.4.ff.0.0.weight", + "transformer.layers.2.4.ff.2.weight", + "transformer.layers.3.2.to_q.weight", + "transformer.layers.3.2.to_k.weight", + "transformer.layers.3.2.to_v.weight", + "transformer.layers.3.2.to_out.0.weight", + "transformer.layers.3.4.ff.0.0.weight", + "transformer.layers.3.4.ff.2.weight", + "transformer.layers.4.2.to_q.weight", + "transformer.layers.4.2.to_k.weight", + "transformer.layers.4.2.to_v.weight", + "transformer.layers.4.2.to_out.0.weight", + "transformer.layers.4.4.ff.0.0.weight", + "transformer.layers.4.4.ff.2.weight", + "transformer.layers.5.2.to_q.weight", + "transformer.layers.5.2.to_k.weight", + "transformer.layers.5.2.to_v.weight", + "transformer.layers.5.2.to_out.0.weight", + "transformer.layers.5.4.ff.0.0.weight", + "transformer.layers.5.4.ff.2.weight", + "transformer.layers.6.2.to_q.weight", + "transformer.layers.6.2.to_k.weight", + "transformer.layers.6.2.to_v.weight", + "transformer.layers.6.2.to_out.0.weight", + "transformer.layers.6.4.ff.0.0.weight", + "transformer.layers.6.4.ff.2.weight", + "transformer.layers.7.2.to_q.weight", + "transformer.layers.7.2.to_k.weight", + "transformer.layers.7.2.to_v.weight", + "transformer.layers.7.2.to_out.0.weight", + "transformer.layers.7.4.ff.0.0.weight", + "transformer.layers.7.4.ff.2.weight", + "transformer.layers.8.4.ff.0.0.weight", + "transformer.layers.8.4.ff.2.weight", + "transformer.layers.9.4.ff.0.0.weight", + "transformer.layers.9.4.ff.2.weight", + "transformer.layers.10.4.ff.0.0.weight", + "transformer.layers.10.4.ff.2.weight", + "transformer.layers.11.4.ff.0.0.weight", + "transformer.layers.11.4.ff.2.weight", + "transformer.layers.12.4.ff.0.0.weight", + "transformer.layers.12.4.ff.2.weight", + "transformer.layers.13.0.weight", + "transformer.layers.13.4.ff.0.0.weight", + "transformer.layers.13.4.ff.2.weight", + "transformer.layers.14.0.weight", + "transformer.layers.14.4.ff.0.0.weight", + "transformer.layers.14.4.ff.2.weight", + "transformer.layers.15.0.weight", + "transformer.layers.15.4.ff.0.0.weight", + "transformer.layers.15.4.ff.2.weight", + "transformer.layers.16.4.ff.0.0.weight", + "transformer.layers.16.4.ff.2.weight", + "transformer.layers.17.4.ff.0.0.weight", + "transformer.layers.17.4.ff.2.weight", + "transformer.layers.18.4.ff.0.0.weight", + "transformer.layers.18.4.ff.2.weight", + "transformer.layers.19.4.ff.0.0.weight", + "transformer.layers.19.4.ff.2.weight", + "transformer.layers.20.4.ff.0.0.weight", + "transformer.layers.20.4.ff.2.weight", + "transformer.layers.21.4.ff.0.0.weight", + "transformer.layers.21.4.ff.2.weight", + "transformer.layers.22.4.ff.0.0.weight", + "transformer.layers.22.4.ff.2.weight", + "transformer.layers.23.4.ff.0.0.weight", + "transformer.layers.23.4.ff.2.weight", + "transformer.layers.24.4.ff.0.0.weight", + "transformer.layers.24.4.ff.2.weight", + "transformer.layers.25.4.ff.0.0.weight", + "transformer.layers.25.4.ff.2.weight" + ], + "bottleneck_layers": [], + "recommendations": { + "focus_layers": [ + "transformer.time_embed.time_mlp.0.weight", + "transformer.time_embed.time_mlp.2.weight", + "transformer.text_embed.text_embed.weight", + "transformer.input_embed.proj.weight", + "transformer.input_embed.conv_pos_embed.conv1d.0.weight", + "transformer.input_embed.conv_pos_embed.conv1d.2.weight", + "transformer.layers.0.2.to_q.weight", + "transformer.layers.0.2.to_k.weight", + "transformer.layers.0.2.to_v.weight", + "transformer.layers.0.2.to_out.0.weight", + "transformer.layers.0.4.ff.0.0.weight", + "transformer.layers.0.4.ff.2.weight", + "transformer.layers.1.2.to_q.weight", + "transformer.layers.1.2.to_k.weight", + "transformer.layers.1.2.to_v.weight", + "transformer.layers.1.2.to_out.0.weight", + "transformer.layers.1.4.ff.0.0.weight", + "transformer.layers.1.4.ff.2.weight", + "transformer.layers.2.2.to_q.weight", + "transformer.layers.2.2.to_k.weight", + "transformer.layers.2.2.to_v.weight", + "transformer.layers.2.2.to_out.0.weight", + "transformer.layers.2.4.ff.0.0.weight", + "transformer.layers.2.4.ff.2.weight", + "transformer.layers.3.2.to_q.weight", + "transformer.layers.3.2.to_k.weight", + "transformer.layers.3.2.to_v.weight", + "transformer.layers.3.2.to_out.0.weight", + "transformer.layers.3.4.ff.0.0.weight", + "transformer.layers.3.4.ff.2.weight", + "transformer.layers.4.2.to_q.weight", + "transformer.layers.4.2.to_k.weight", + "transformer.layers.4.2.to_v.weight", + "transformer.layers.4.2.to_out.0.weight", + "transformer.layers.4.4.ff.0.0.weight", + "transformer.layers.4.4.ff.2.weight", + "transformer.layers.5.2.to_q.weight", + "transformer.layers.5.2.to_k.weight", + "transformer.layers.5.2.to_v.weight", + "transformer.layers.5.2.to_out.0.weight", + "transformer.layers.5.4.ff.0.0.weight", + "transformer.layers.5.4.ff.2.weight", + "transformer.layers.6.2.to_q.weight", + "transformer.layers.6.2.to_k.weight", + "transformer.layers.6.2.to_v.weight", + "transformer.layers.6.2.to_out.0.weight", + "transformer.layers.6.4.ff.0.0.weight", + "transformer.layers.6.4.ff.2.weight", + "transformer.layers.7.2.to_q.weight", + "transformer.layers.7.2.to_k.weight", + "transformer.layers.7.2.to_v.weight", + "transformer.layers.7.2.to_out.0.weight", + "transformer.layers.7.4.ff.0.0.weight", + "transformer.layers.7.4.ff.2.weight", + "transformer.layers.8.4.ff.0.0.weight", + "transformer.layers.8.4.ff.2.weight", + "transformer.layers.9.4.ff.0.0.weight", + "transformer.layers.9.4.ff.2.weight", + "transformer.layers.10.4.ff.0.0.weight", + "transformer.layers.10.4.ff.2.weight", + "transformer.layers.11.4.ff.0.0.weight", + "transformer.layers.11.4.ff.2.weight", + "transformer.layers.12.4.ff.0.0.weight", + "transformer.layers.12.4.ff.2.weight", + "transformer.layers.13.0.weight", + "transformer.layers.13.4.ff.0.0.weight", + "transformer.layers.13.4.ff.2.weight", + "transformer.layers.14.0.weight", + "transformer.layers.14.4.ff.0.0.weight", + "transformer.layers.14.4.ff.2.weight", + "transformer.layers.15.0.weight", + "transformer.layers.15.4.ff.0.0.weight", + "transformer.layers.15.4.ff.2.weight", + "transformer.layers.16.4.ff.0.0.weight", + "transformer.layers.16.4.ff.2.weight", + "transformer.layers.17.4.ff.0.0.weight", + "transformer.layers.17.4.ff.2.weight", + "transformer.layers.18.4.ff.0.0.weight", + "transformer.layers.18.4.ff.2.weight", + "transformer.layers.19.4.ff.0.0.weight", + "transformer.layers.19.4.ff.2.weight", + "transformer.layers.20.4.ff.0.0.weight", + "transformer.layers.20.4.ff.2.weight", + "transformer.layers.21.4.ff.0.0.weight", + "transformer.layers.21.4.ff.2.weight", + "transformer.layers.22.4.ff.0.0.weight", + "transformer.layers.22.4.ff.2.weight", + "transformer.layers.23.4.ff.0.0.weight", + "transformer.layers.23.4.ff.2.weight", + "transformer.layers.24.4.ff.0.0.weight", + "transformer.layers.24.4.ff.2.weight", + "transformer.layers.25.4.ff.0.0.weight", + "transformer.layers.25.4.ff.2.weight" + ] + }, + "total_parameters": 391, + "total_elements": 360755948, + "param_ranges": { + "transformer.time_embed.time_mlp.0.weight": { + "min": -0.43111443519592285, + "max": 0.2988463342189789, + "mean": -0.0025462331250309944, + "std": 0.04255734384059906, + "sparsity": 0.0, + "shape": [ + 1024, + 256 + ] + }, + "transformer.time_embed.time_mlp.0.bias": { + "min": -0.06311740726232529, + "max": 0.10821832716464996, + "mean": 0.0006233985768631101, + "std": 0.03409506380558014, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.time_embed.time_mlp.2.weight": { + "min": -0.41270628571510315, + "max": 0.8365904092788696, + "mean": -0.0002062078274320811, + "std": 0.024108584970235825, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.time_embed.time_mlp.2.bias": { + "min": -0.11594842374324799, + "max": 0.323304146528244, + "mean": -0.0009396584937348962, + "std": 0.019620178267359734, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.text_embed.text_embed.weight": { + "min": -2.8046321868896484, + "max": 2.8845088481903076, + "mean": -0.00036305765388533473, + "std": 0.615403413772583, + "sparsity": 0.0, + "shape": [ + 2546, + 100 + ] + }, + "transformer.input_embed.proj.weight": { + "min": -0.2803097069263458, + "max": 0.3821697235107422, + "mean": 0.0004250165948178619, + "std": 0.042748384177684784, + "sparsity": 0.0, + "shape": [ + 1024, + 300 + ] + }, + "transformer.input_embed.proj.bias": { + "min": -0.22351907193660736, + "max": 0.21069680154323578, + "mean": -0.004498748108744621, + "std": 0.04097301885485649, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.input_embed.conv_pos_embed.conv1d.0.weight": { + "min": -0.4281409978866577, + "max": 0.47565823793411255, + "mean": 3.041478066734271e-06, + "std": 0.024508286267518997, + "sparsity": 0.0, + "shape": [ + 1024, + 64, + 31 + ] + }, + "transformer.input_embed.conv_pos_embed.conv1d.0.bias": { + "min": -0.32690364122390747, + "max": 0.15677706897258759, + "mean": -0.04671286791563034, + "std": 0.05161474645137787, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.input_embed.conv_pos_embed.conv1d.2.weight": { + "min": -0.41106897592544556, + "max": 0.3550392687320709, + "mean": -0.00012950549717061222, + "std": 0.023600473999977112, + "sparsity": 0.0, + "shape": [ + 1024, + 64, + 31 + ] + }, + "transformer.input_embed.conv_pos_embed.conv1d.2.bias": { + "min": -0.23076964914798737, + "max": 0.2638300061225891, + "mean": -0.029151970520615578, + "std": 0.049401458352804184, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.0.1.g": { + "min": 0.25456827878952026, + "max": 0.8219638466835022, + "mean": 0.525442898273468, + "std": 0.08086482435464859, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.0.2.to_q.weight": { + "min": -0.2974269390106201, + "max": 0.26618602871894836, + "mean": -0.0004250289057381451, + "std": 0.0321008674800396, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.0.2.to_q.bias": { + "min": -0.09282378107309341, + "max": 0.12510952353477478, + "mean": 0.0006503364420495927, + "std": 0.025732681155204773, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.0.2.to_k.weight": { + "min": -0.29088306427001953, + "max": 0.28188201785087585, + "mean": -7.563710096292198e-05, + "std": 0.030931729823350906, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.0.2.to_k.bias": { + "min": -5.909866809844971, + "max": 5.824496746063232, + "mean": -0.009385589510202408, + "std": 1.2966406345367432, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.0.2.to_v.weight": { + "min": -0.4253852665424347, + "max": 0.34430131316185, + "mean": 9.75119328359142e-05, + "std": 0.02995217591524124, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.0.2.to_v.bias": { + "min": -0.028903231024742126, + "max": 0.027659673243761063, + "mean": -0.00031527443206869066, + "std": 0.012571859173476696, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.0.2.to_out.0.weight": { + "min": -0.45454347133636475, + "max": 0.44891107082366943, + "mean": 2.3480326490243897e-05, + "std": 0.023853568360209465, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.0.2.to_out.0.bias": { + "min": -0.08878406882286072, + "max": 0.09124661237001419, + "mean": 0.002279076725244522, + "std": 0.019516194239258766, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.0.3.g": { + "min": 0.2667350471019745, + "max": 1.0590577125549316, + "mean": 0.5311722159385681, + "std": 0.10455667227506638, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.0.4.ff.0.0.weight": { + "min": -0.5753205418586731, + "max": 0.6092038154602051, + "mean": -0.0004317538405302912, + "std": 0.038596246391534805, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.0.4.ff.0.0.bias": { + "min": -0.18242540955543518, + "max": 0.04575135558843613, + "mean": -0.02945941686630249, + "std": 0.04261056333780289, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.0.4.ff.2.weight": { + "min": -1.167878270149231, + "max": 1.6351370811462402, + "mean": 0.00032057490898296237, + "std": 0.02769383229315281, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.0.4.ff.2.bias": { + "min": -0.1625949591398239, + "max": 0.2059435099363327, + "mean": -0.02112039364874363, + "std": 0.027941575273871422, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.1.1.g": { + "min": 0.22422762215137482, + "max": 0.8458681702613831, + "mean": 0.4875890910625458, + "std": 0.07528901100158691, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.1.2.to_q.weight": { + "min": -0.2560153305530548, + "max": 0.3063727021217346, + "mean": -8.626433555036783e-06, + "std": 0.033470120280981064, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.1.2.to_q.bias": { + "min": -0.09546571969985962, + "max": 0.11066073924303055, + "mean": 5.8840945712290704e-05, + "std": 0.026972563937306404, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.1.2.to_k.weight": { + "min": -0.2978975474834442, + "max": 0.29693126678466797, + "mean": 5.199259248911403e-05, + "std": 0.03254008665680885, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.1.2.to_k.bias": { + "min": -5.169106960296631, + "max": 5.089260578155518, + "mean": -0.014622640796005726, + "std": 1.1580101251602173, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.1.2.to_v.weight": { + "min": -0.3452591896057129, + "max": 0.3437287509441376, + "mean": 7.87251628935337e-05, + "std": 0.030058259144425392, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.1.2.to_v.bias": { + "min": -0.03609376400709152, + "max": 0.03314271569252014, + "mean": -0.00014089577598497272, + "std": 0.013021372258663177, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.1.2.to_out.0.weight": { + "min": -0.3159167468547821, + "max": 0.37570273876190186, + "mean": -2.126370236510411e-05, + "std": 0.024055330082774162, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.1.2.to_out.0.bias": { + "min": -0.10549593716859818, + "max": 0.1221165731549263, + "mean": -0.0019639446400105953, + "std": 0.028849009424448013, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.1.3.g": { + "min": 0.3116210103034973, + "max": 1.1235315799713135, + "mean": 0.6662613153457642, + "std": 0.09780054539442062, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.1.4.ff.0.0.weight": { + "min": -0.872847855091095, + "max": 0.6278241872787476, + "mean": 0.0016755674732849002, + "std": 0.047437313944101334, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.1.4.ff.0.0.bias": { + "min": -0.2716394066810608, + "max": 0.03413696587085724, + "mean": -0.0466003455221653, + "std": 0.04061445966362953, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.1.4.ff.2.weight": { + "min": -0.9222021102905273, + "max": 0.9650114178657532, + "mean": 0.0010224997531622648, + "std": 0.04070303216576576, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.1.4.ff.2.bias": { + "min": -0.14480018615722656, + "max": 0.07504245638847351, + "mean": -0.00909046083688736, + "std": 0.025704393163323402, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.2.1.g": { + "min": 0.23979389667510986, + "max": 0.7145018577575684, + "mean": 0.4472465217113495, + "std": 0.059433478862047195, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.2.2.to_q.weight": { + "min": -0.2733098268508911, + "max": 0.2983761131763458, + "mean": 9.066419806913473e-06, + "std": 0.03547072410583496, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.2.2.to_q.bias": { + "min": -0.11928554624319077, + "max": 0.11867407709360123, + "mean": 0.0007565614068880677, + "std": 0.02763325348496437, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.2.2.to_k.weight": { + "min": -0.28173530101776123, + "max": 0.2804112136363983, + "mean": -7.68975296523422e-05, + "std": 0.03510041534900665, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.2.2.to_k.bias": { + "min": -2.51193904876709, + "max": 2.5239455699920654, + "mean": 0.026779357343912125, + "std": 0.5869050621986389, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.2.2.to_v.weight": { + "min": -0.2215055674314499, + "max": 0.2721182703971863, + "mean": 2.8998874768149108e-06, + "std": 0.030730824917554855, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.2.2.to_v.bias": { + "min": -0.03334304690361023, + "max": 0.031320393085479736, + "mean": 0.00011074724898207933, + "std": 0.012403324246406555, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.2.2.to_out.0.weight": { + "min": -0.23567309975624084, + "max": 0.2320062220096588, + "mean": 5.707715899916366e-05, + "std": 0.025695981457829475, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.2.2.to_out.0.bias": { + "min": -0.13582320511341095, + "max": 0.1279149055480957, + "mean": -0.005496869329363108, + "std": 0.03996486961841583, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.2.3.g": { + "min": 0.3545507788658142, + "max": 1.1755321025848389, + "mean": 0.7105286121368408, + "std": 0.10380106419324875, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.2.4.ff.0.0.weight": { + "min": -0.618323802947998, + "max": 0.5557036995887756, + "mean": 0.0011603902094066143, + "std": 0.046115029603242874, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.2.4.ff.0.0.bias": { + "min": -0.18935386836528778, + "max": 0.024935415014624596, + "mean": -0.03484790399670601, + "std": 0.028624996542930603, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.2.4.ff.2.weight": { + "min": -1.1329621076583862, + "max": 0.9724080562591553, + "mean": 0.00035803488572128117, + "std": 0.042342979460954666, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.2.4.ff.2.bias": { + "min": -0.5985916256904602, + "max": 0.06294681131839752, + "mean": -0.0048767137341201305, + "std": 0.028625035658478737, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.3.1.g": { + "min": 0.37523797154426575, + "max": 0.9426477551460266, + "mean": 0.5925332903862, + "std": 0.06714636832475662, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.3.2.to_q.weight": { + "min": -0.3922964930534363, + "max": 0.37001147866249084, + "mean": 7.055637979647145e-05, + "std": 0.03718561306595802, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.3.2.to_q.bias": { + "min": -0.11894690245389938, + "max": 0.13649211823940277, + "mean": 0.0009205802925862372, + "std": 0.029216548427939415, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.3.2.to_k.weight": { + "min": -0.6203529834747314, + "max": 0.509852409362793, + "mean": 1.5258530766004696e-05, + "std": 0.03643907234072685, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.3.2.to_k.bias": { + "min": -8.195601463317871, + "max": 8.798324584960938, + "mean": -0.10935366153717041, + "std": 1.6999714374542236, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.3.2.to_v.weight": { + "min": -0.27709993720054626, + "max": 0.24029740691184998, + "mean": 5.252830669633113e-05, + "std": 0.032612841576337814, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.3.2.to_v.bias": { + "min": -0.05198528617620468, + "max": 0.03960206359624863, + "mean": 8.789013372734189e-05, + "std": 0.012959298677742481, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.3.2.to_out.0.weight": { + "min": -0.23129259049892426, + "max": 0.23536467552185059, + "mean": -2.1845989977009594e-05, + "std": 0.029389241710305214, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.3.2.to_out.0.bias": { + "min": -0.2045416533946991, + "max": 0.10547658056020737, + "mean": -0.004024041350930929, + "std": 0.03263028338551521, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.3.3.g": { + "min": 0.33950191736221313, + "max": 1.0151382684707642, + "mean": 0.7007080316543579, + "std": 0.09671688079833984, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.3.4.ff.0.0.weight": { + "min": -0.5657932162284851, + "max": 0.8349727988243103, + "mean": 0.00041512559982948005, + "std": 0.04229608178138733, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.3.4.ff.0.0.bias": { + "min": -0.21222105622291565, + "max": 0.030380746349692345, + "mean": -0.03218400478363037, + "std": 0.026512378826737404, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.3.4.ff.2.weight": { + "min": -0.7566999793052673, + "max": 0.7205860018730164, + "mean": -1.3569264410762116e-05, + "std": 0.036836523562669754, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.3.4.ff.2.bias": { + "min": -0.2636493444442749, + "max": 0.10622138530015945, + "mean": -0.0030191433615982533, + "std": 0.0288657546043396, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.4.1.g": { + "min": 0.284244179725647, + "max": 0.6968931555747986, + "mean": 0.49943026900291443, + "std": 0.046561453491449356, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.4.2.to_q.weight": { + "min": -0.27927035093307495, + "max": 0.23469851911067963, + "mean": -0.00011116769746877253, + "std": 0.038758207112550735, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.4.2.to_q.bias": { + "min": -0.15420791506767273, + "max": 0.12671181559562683, + "mean": -0.002232905477285385, + "std": 0.03338504582643509, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.4.2.to_k.weight": { + "min": -0.41528424620628357, + "max": 0.6604220271110535, + "mean": -1.9215509382775053e-05, + "std": 0.03909698873758316, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.4.2.to_k.bias": { + "min": -4.243428707122803, + "max": 4.728596210479736, + "mean": -0.020457647740840912, + "std": 1.0080652236938477, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.4.2.to_v.weight": { + "min": -0.24574802815914154, + "max": 0.20800377428531647, + "mean": 4.4111799070378765e-05, + "std": 0.0339629240334034, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.4.2.to_v.bias": { + "min": -0.03446226194500923, + "max": 0.04489393159747124, + "mean": -1.5458615962415934e-05, + "std": 0.012629742734134197, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.4.2.to_out.0.weight": { + "min": -0.2015937864780426, + "max": 0.20673099160194397, + "mean": -2.9244030884001404e-05, + "std": 0.03102072887122631, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.4.2.to_out.0.bias": { + "min": -0.20010024309158325, + "max": 0.11358015239238739, + "mean": -0.0029013892635703087, + "std": 0.03451463207602501, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.4.3.g": { + "min": 0.36685705184936523, + "max": 1.0600172281265259, + "mean": 0.6705178022384644, + "std": 0.06640052795410156, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.4.4.ff.0.0.weight": { + "min": -0.39914920926094055, + "max": 0.5031230449676514, + "mean": -3.865663893520832e-05, + "std": 0.04113178327679634, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.4.4.ff.0.0.bias": { + "min": -0.12865233421325684, + "max": 0.026885882019996643, + "mean": -0.030540671199560165, + "std": 0.02188955619931221, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.4.4.ff.2.weight": { + "min": -0.4503399133682251, + "max": 0.4341718554496765, + "mean": 7.837524026399478e-05, + "std": 0.03489154577255249, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.4.4.ff.2.bias": { + "min": -0.2677534520626068, + "max": 0.07295451313257217, + "mean": -0.0010977284982800484, + "std": 0.023126663640141487, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.5.1.g": { + "min": 0.28732216358184814, + "max": 0.687613844871521, + "mean": 0.5245327353477478, + "std": 0.047577910125255585, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.5.2.to_q.weight": { + "min": -0.22290916740894318, + "max": 0.22416770458221436, + "mean": 1.5896670447546057e-05, + "std": 0.03894934430718422, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.5.2.to_q.bias": { + "min": -0.13659609854221344, + "max": 0.10938586294651031, + "mean": 0.0002443990088067949, + "std": 0.029240434989333153, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.5.2.to_k.weight": { + "min": -0.37579256296157837, + "max": 0.43812817335128784, + "mean": -9.537441655993462e-06, + "std": 0.03928641602396965, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.5.2.to_k.bias": { + "min": -3.8499395847320557, + "max": 5.004647254943848, + "mean": 0.009758757427334785, + "std": 0.8455180525779724, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.5.2.to_v.weight": { + "min": -0.2236318439245224, + "max": 0.22071507573127747, + "mean": -4.0232407627627254e-07, + "std": 0.034410055726766586, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.5.2.to_v.bias": { + "min": -0.04383794590830803, + "max": 0.03584868088364601, + "mean": -0.00026072480250149965, + "std": 0.012076611630618572, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.5.2.to_out.0.weight": { + "min": -0.21360361576080322, + "max": 0.1891404688358307, + "mean": -1.7133981600636616e-05, + "std": 0.03153670206665993, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.5.2.to_out.0.bias": { + "min": -0.18102218210697174, + "max": 0.12101027369499207, + "mean": -0.002398766577243805, + "std": 0.04126044735312462, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.5.3.g": { + "min": 0.422617107629776, + "max": 0.9454182982444763, + "mean": 0.6626853942871094, + "std": 0.05683305859565735, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.5.4.ff.0.0.weight": { + "min": -0.3716322183609009, + "max": 0.47696027159690857, + "mean": -8.185259503079578e-05, + "std": 0.040890805423259735, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.5.4.ff.0.0.bias": { + "min": -0.2088262289762497, + "max": 0.027207661420106888, + "mean": -0.03023664839565754, + "std": 0.021368583664298058, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.5.4.ff.2.weight": { + "min": -0.3415319621562958, + "max": 0.735925555229187, + "mean": 8.314158185385168e-05, + "std": 0.034767184406518936, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.5.4.ff.2.bias": { + "min": -0.24044273793697357, + "max": 0.05069386586546898, + "mean": -0.0011902841506525874, + "std": 0.020465629175305367, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.6.1.g": { + "min": 0.30604928731918335, + "max": 0.6555026769638062, + "mean": 0.5250788331031799, + "std": 0.04609908536076546, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.6.2.to_q.weight": { + "min": -0.3050762414932251, + "max": 0.21783104538917542, + "mean": 6.997165473876521e-05, + "std": 0.039496470242738724, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.6.2.to_q.bias": { + "min": -0.14947636425495148, + "max": 0.13131970167160034, + "mean": 0.00033609665115363896, + "std": 0.03047223575413227, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.6.2.to_k.weight": { + "min": -0.2578710615634918, + "max": 0.20255950093269348, + "mean": 3.1238341762218624e-05, + "std": 0.03948673978447914, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.6.2.to_k.bias": { + "min": -2.339573621749878, + "max": 2.379251480102539, + "mean": -0.02625335566699505, + "std": 0.4500052034854889, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.6.2.to_v.weight": { + "min": -0.1892782300710678, + "max": 0.21099112927913666, + "mean": 3.7314141081878915e-05, + "std": 0.03479423746466637, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.6.2.to_v.bias": { + "min": -0.03169188275933266, + "max": 0.03571836277842522, + "mean": -0.00019686334417201579, + "std": 0.012292133644223213, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.6.2.to_out.0.weight": { + "min": -0.1888986974954605, + "max": 0.17091436684131622, + "mean": -6.82127574691549e-05, + "std": 0.032170820981264114, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.6.2.to_out.0.bias": { + "min": -0.13952063024044037, + "max": 0.13709284365177155, + "mean": -0.0025128263514488935, + "std": 0.0512898713350296, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.6.3.g": { + "min": 0.4670536518096924, + "max": 0.9585899710655212, + "mean": 0.6689007878303528, + "std": 0.05285040661692619, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.6.4.ff.0.0.weight": { + "min": -0.3248884379863739, + "max": 0.3098326325416565, + "mean": -1.0356043276260607e-06, + "std": 0.04094681516289711, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.6.4.ff.0.0.bias": { + "min": -0.12497521936893463, + "max": 0.02554607018828392, + "mean": -0.030699055641889572, + "std": 0.019824611023068428, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.6.4.ff.2.weight": { + "min": -0.4409962594509125, + "max": 0.44632241129875183, + "mean": 9.430450154468417e-05, + "std": 0.03512001410126686, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.6.4.ff.2.bias": { + "min": -0.22476668655872345, + "max": 0.051897041499614716, + "mean": -0.0011790284188464284, + "std": 0.018472088500857353, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.7.1.g": { + "min": 0.3393557369709015, + "max": 0.7416696548461914, + "mean": 0.5586937069892883, + "std": 0.04142747446894646, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.7.2.to_q.weight": { + "min": -0.2734062075614929, + "max": 0.2793632745742798, + "mean": 2.0294006390031427e-05, + "std": 0.04105808213353157, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.7.2.to_q.bias": { + "min": -0.13707204163074493, + "max": 0.14009879529476166, + "mean": 0.0004904167726635933, + "std": 0.02664206363260746, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.7.2.to_k.weight": { + "min": -0.49139103293418884, + "max": 0.35644298791885376, + "mean": 8.893347694538534e-05, + "std": 0.04069600626826286, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.7.2.to_k.bias": { + "min": -2.2994801998138428, + "max": 1.7469841241836548, + "mean": -0.021084124222397804, + "std": 0.500186562538147, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.7.2.to_v.weight": { + "min": -0.2184700220823288, + "max": 0.1981830596923828, + "mean": -4.060107676195912e-05, + "std": 0.03423382714390755, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.7.2.to_v.bias": { + "min": -0.04127173125743866, + "max": 0.03881501033902168, + "mean": -0.00013771075464319438, + "std": 0.012880227528512478, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.7.2.to_out.0.weight": { + "min": -0.17825232446193695, + "max": 0.18374156951904297, + "mean": 4.785084456671029e-05, + "std": 0.031557004898786545, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.7.2.to_out.0.bias": { + "min": -0.18023589253425598, + "max": 0.18417657911777496, + "mean": -0.002215688582509756, + "std": 0.05483615770936012, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.7.3.g": { + "min": 0.4742925763130188, + "max": 1.0284452438354492, + "mean": 0.6453101634979248, + "std": 0.05053440108895302, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.7.4.ff.0.0.weight": { + "min": -0.27223968505859375, + "max": 0.30990350246429443, + "mean": 0.00011251836258452386, + "std": 0.04068317264318466, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.7.4.ff.0.0.bias": { + "min": -0.10583628714084625, + "max": 0.02672600746154785, + "mean": -0.02951621636748314, + "std": 0.01793462224304676, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.7.4.ff.2.weight": { + "min": -0.339975506067276, + "max": 0.3303821086883545, + "mean": 5.460641114041209e-05, + "std": 0.034413956105709076, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.7.4.ff.2.bias": { + "min": -0.1819038987159729, + "max": 0.0424266941845417, + "mean": -0.0010654201032593846, + "std": 0.01721329055726528, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.8.1.g": { + "min": 0.3252944052219391, + "max": 0.688383936882019, + "mean": 0.5112100839614868, + "std": 0.036942265927791595, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.8.2.to_q.weight": { + "min": -0.2345394641160965, + "max": 0.22607795894145966, + "mean": -3.624632518040016e-05, + "std": 0.039177343249320984, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.8.2.to_q.bias": { + "min": -0.11556069552898407, + "max": 0.13209758698940277, + "mean": 0.00015118884039111435, + "std": 0.029196659103035927, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.8.2.to_k.weight": { + "min": -0.3532617390155792, + "max": 0.2856779992580414, + "mean": 7.000558980507776e-06, + "std": 0.0392458438873291, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.8.2.to_k.bias": { + "min": -4.136237621307373, + "max": 3.547076940536499, + "mean": -0.011597944423556328, + "std": 0.6828959584236145, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.8.2.to_v.weight": { + "min": -0.21137045323848724, + "max": 0.20969942212104797, + "mean": 3.464317342150025e-05, + "std": 0.03448577970266342, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.8.2.to_v.bias": { + "min": -0.03584721311926842, + "max": 0.048106979578733444, + "mean": 0.0007941541844047606, + "std": 0.012865344993770123, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.8.2.to_out.0.weight": { + "min": -0.2109234631061554, + "max": 0.19350647926330566, + "mean": -1.076167109204107e-06, + "std": 0.03169678896665573, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.8.2.to_out.0.bias": { + "min": -0.18694967031478882, + "max": 0.17746947705745697, + "mean": -0.002843617694452405, + "std": 0.0586174838244915, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.8.3.g": { + "min": 0.474641889333725, + "max": 1.0443058013916016, + "mean": 0.6514294147491455, + "std": 0.0498916432261467, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.8.4.ff.0.0.weight": { + "min": -0.24857543408870697, + "max": 0.3296365737915039, + "mean": 0.00018093036487698555, + "std": 0.040571410208940506, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.8.4.ff.0.0.bias": { + "min": -0.12483743578195572, + "max": 0.024654541164636612, + "mean": -0.030496058985590935, + "std": 0.01760769635438919, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.8.4.ff.2.weight": { + "min": -0.4221171438694, + "max": 0.4831203818321228, + "mean": 1.3900153135182336e-06, + "std": 0.03539836406707764, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.8.4.ff.2.bias": { + "min": -0.15169401466846466, + "max": 0.043601393699645996, + "mean": 4.186587466392666e-05, + "std": 0.014870981685817242, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.9.1.g": { + "min": 0.31570297479629517, + "max": 0.6836181879043579, + "mean": 0.5528991222381592, + "std": 0.04067207872867584, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.9.2.to_q.weight": { + "min": -0.20685237646102905, + "max": 0.22020350396633148, + "mean": 3.1496565497945994e-05, + "std": 0.038300175219774246, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.9.2.to_q.bias": { + "min": -0.13801881670951843, + "max": 0.1128397211432457, + "mean": 1.9543484086170793e-05, + "std": 0.02582789771258831, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.9.2.to_k.weight": { + "min": -0.4035792350769043, + "max": 0.37189632654190063, + "mean": 2.57877072726842e-05, + "std": 0.03818116337060928, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.9.2.to_k.bias": { + "min": -3.776683807373047, + "max": 2.873103380203247, + "mean": 0.0011591403745114803, + "std": 0.5172097086906433, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.9.2.to_v.weight": { + "min": -0.20364898443222046, + "max": 0.19804270565509796, + "mean": 2.963895894936286e-05, + "std": 0.03429786115884781, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.9.2.to_v.bias": { + "min": -0.05086854100227356, + "max": 0.03999151289463043, + "mean": -0.00042562291491776705, + "std": 0.01342119462788105, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.9.2.to_out.0.weight": { + "min": -0.19656670093536377, + "max": 0.20230703055858612, + "mean": -1.2472472008084878e-05, + "std": 0.031806014478206635, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.9.2.to_out.0.bias": { + "min": -0.19329077005386353, + "max": 0.1953459531068802, + "mean": -0.002963340375572443, + "std": 0.06254669278860092, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.9.3.g": { + "min": 0.3491152226924896, + "max": 1.0867162942886353, + "mean": 0.6672079563140869, + "std": 0.055482182651758194, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.9.4.ff.0.0.weight": { + "min": -0.22604526579380035, + "max": 0.25199154019355774, + "mean": 0.00035888003185391426, + "std": 0.04076085984706879, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.9.4.ff.0.0.bias": { + "min": -0.09107685089111328, + "max": 0.043750207871198654, + "mean": -0.030080880969762802, + "std": 0.017612501978874207, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.9.4.ff.2.weight": { + "min": -0.354022353887558, + "max": 0.3047710955142975, + "mean": -4.505186007008888e-05, + "std": 0.03712347894906998, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.9.4.ff.2.bias": { + "min": -0.16208632290363312, + "max": 0.06347470730543137, + "mean": -7.683466537855566e-05, + "std": 0.01941368170082569, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.10.1.g": { + "min": 0.34881117939949036, + "max": 0.7244766354560852, + "mean": 0.5423683524131775, + "std": 0.039119552820920944, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.10.2.to_q.weight": { + "min": -0.21985284984111786, + "max": 0.22366879880428314, + "mean": -1.1181864465470426e-05, + "std": 0.03923165425658226, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.10.2.to_q.bias": { + "min": -0.11856226623058319, + "max": 0.17077098786830902, + "mean": 0.0002904185967054218, + "std": 0.025113951414823532, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.10.2.to_k.weight": { + "min": -0.24732813239097595, + "max": 0.30149152874946594, + "mean": -3.663568713818677e-05, + "std": 0.03893101587891579, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.10.2.to_k.bias": { + "min": -3.509943962097168, + "max": 3.719674825668335, + "mean": 0.015853645280003548, + "std": 0.7831405401229858, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.10.2.to_v.weight": { + "min": -0.21940433979034424, + "max": 0.2380109429359436, + "mean": -1.3181561371311545e-05, + "std": 0.036304209381341934, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.10.2.to_v.bias": { + "min": -0.04728918895125389, + "max": 0.05147355794906616, + "mean": 0.00047950932639651, + "std": 0.01351844146847725, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.10.2.to_out.0.weight": { + "min": -0.21457946300506592, + "max": 0.21772831678390503, + "mean": 5.6543191021773964e-05, + "std": 0.03361648693680763, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.10.2.to_out.0.bias": { + "min": -0.21175915002822876, + "max": 0.2316361367702484, + "mean": -0.005104508716613054, + "std": 0.06187352165579796, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.10.3.g": { + "min": 0.36198312044143677, + "max": 1.1043850183486938, + "mean": 0.6993494629859924, + "std": 0.0538649819791317, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.10.4.ff.0.0.weight": { + "min": -0.23541490733623505, + "max": 0.24545514583587646, + "mean": 0.0004635048389900476, + "std": 0.0412699356675148, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.10.4.ff.0.0.bias": { + "min": -0.09819761663675308, + "max": 0.06812109053134918, + "mean": -0.03143283352255821, + "std": 0.018124457448720932, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.10.4.ff.2.weight": { + "min": -0.302616149187088, + "max": 0.3526079058647156, + "mean": -8.239349699579179e-05, + "std": 0.04027572274208069, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.10.4.ff.2.bias": { + "min": -0.1525425761938095, + "max": 0.14988082647323608, + "mean": 0.00025950101553462446, + "std": 0.02303888648748398, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.11.1.g": { + "min": 0.9994731545448303, + "max": 1.0051331520080566, + "mean": 1.0006828308105469, + "std": 0.0018997839652001858, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.11.2.to_q.weight": { + "min": -0.031253598630428314, + "max": 0.03125074878334999, + "mean": -1.9291795979370363e-05, + "std": 0.018041806295514107, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.11.2.to_q.bias": { + "min": -0.031226053833961487, + "max": 0.030990969389677048, + "mean": -0.0010842140763998032, + "std": 0.01795150525867939, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.11.2.to_k.weight": { + "min": -0.03125230595469475, + "max": 0.031255852431058884, + "mean": 3.5468428905005567e-06, + "std": 0.01804220862686634, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.11.2.to_k.bias": { + "min": -0.031155752018094063, + "max": 0.031177222728729248, + "mean": 0.0003338717215228826, + "std": 0.018063681200146675, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.11.2.to_v.weight": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.11.2.to_v.bias": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.11.2.to_out.0.weight": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.11.2.to_out.0.bias": { + "min": -0.00039401825051754713, + "max": 0.00042413949267938733, + "mean": 2.811485501297284e-06, + "std": 0.00013175072672311217, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.11.3.g": { + "min": 0.9984285831451416, + "max": 1.0057381391525269, + "mean": 1.0001252889633179, + "std": 0.0012227989500388503, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.11.4.ff.0.0.weight": { + "min": -0.03248310461640358, + "max": 0.03276699408888817, + "mean": -6.534818567160983e-06, + "std": 0.01804283820092678, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.11.4.ff.0.0.bias": { + "min": -0.03245115652680397, + "max": 0.032321732491254807, + "mean": -6.833355291746557e-05, + "std": 0.017962154000997543, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.11.4.ff.2.weight": { + "min": -0.001046429155394435, + "max": 0.001021245145238936, + "mean": 1.2730889693557401e-06, + "std": 0.00019014839199371636, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.11.4.ff.2.bias": { + "min": -0.00038878852501511574, + "max": 0.0004429140826687217, + "mean": 4.41432621300919e-06, + "std": 0.00012222054647281766, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.12.1.g": { + "min": 0.3831113874912262, + "max": 0.7217056155204773, + "mean": 0.5806930065155029, + "std": 0.03891616314649582, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.12.2.to_q.weight": { + "min": -0.23930218815803528, + "max": 0.19694408774375916, + "mean": 2.6163981601712294e-05, + "std": 0.03746587410569191, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.12.2.to_q.bias": { + "min": -0.11892960965633392, + "max": 0.16658687591552734, + "mean": 0.0009876482654362917, + "std": 0.027559131383895874, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.12.2.to_k.weight": { + "min": -0.2469177097082138, + "max": 0.5011630058288574, + "mean": -5.039005191065371e-05, + "std": 0.037623330950737, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.12.2.to_k.bias": { + "min": -3.9455182552337646, + "max": 3.7725064754486084, + "mean": -0.003572634421288967, + "std": 0.6815741658210754, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.12.2.to_v.weight": { + "min": -0.2276747226715088, + "max": 0.25224873423576355, + "mean": -1.156590678874636e-05, + "std": 0.03743501380085945, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.12.2.to_v.bias": { + "min": -0.0717209130525589, + "max": 0.08072538673877716, + "mean": -0.0005185012123547494, + "std": 0.01566058024764061, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.12.2.to_out.0.weight": { + "min": -0.2281697541475296, + "max": 0.25840428471565247, + "mean": -2.8510152333183214e-05, + "std": 0.03542180359363556, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.12.2.to_out.0.bias": { + "min": -0.2006748765707016, + "max": 0.21532072126865387, + "mean": -0.005526356864720583, + "std": 0.06832510232925415, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.12.3.g": { + "min": 0.4052578806877136, + "max": 1.1931043863296509, + "mean": 0.7380141019821167, + "std": 0.05553331598639488, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.12.4.ff.0.0.weight": { + "min": -0.2216469943523407, + "max": 0.24624952673912048, + "mean": 0.0005209938390180469, + "std": 0.04133738949894905, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.12.4.ff.0.0.bias": { + "min": -0.10345429182052612, + "max": 0.024157993495464325, + "mean": -0.03266732394695282, + "std": 0.018895410001277924, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.12.4.ff.2.weight": { + "min": -0.4506717622280121, + "max": 0.4234609603881836, + "mean": -0.00043505526264198124, + "std": 0.04689793288707733, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.12.4.ff.2.bias": { + "min": -0.2517058551311493, + "max": 0.4705328345298767, + "mean": 0.0032054544426500797, + "std": 0.044538334012031555, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.13.0.weight": { + "min": -0.31723225116729736, + "max": 0.3334876596927643, + "mean": -2.5067403839784674e-05, + "std": 0.021288011223077774, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.13.1.g": { + "min": 0.32461482286453247, + "max": 0.6871254444122314, + "mean": 0.5709946155548096, + "std": 0.044712185859680176, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.13.2.to_q.weight": { + "min": -0.16488447785377502, + "max": 0.174674391746521, + "mean": -4.878301842836663e-05, + "std": 0.033181823790073395, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.13.2.to_q.bias": { + "min": -0.18708285689353943, + "max": 0.14329394698143005, + "mean": 4.1025952668860555e-05, + "std": 0.02970319241285324, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.13.2.to_k.weight": { + "min": -0.3814561367034912, + "max": 0.2463892698287964, + "mean": -9.789278919924982e-06, + "std": 0.03276311233639717, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.13.2.to_k.bias": { + "min": -3.6606388092041016, + "max": 3.2944271564483643, + "mean": -0.01427321694791317, + "std": 0.9851539731025696, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.13.2.to_v.weight": { + "min": -0.23539957404136658, + "max": 0.2480521947145462, + "mean": -1.7979342374019325e-05, + "std": 0.04169878736138344, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.13.2.to_v.bias": { + "min": -0.07279200851917267, + "max": 0.15470217168331146, + "mean": 0.0006656068144366145, + "std": 0.02517576329410076, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.13.2.to_out.0.weight": { + "min": -0.2668735086917877, + "max": 0.2486240267753601, + "mean": -1.5421055650222115e-05, + "std": 0.04013972356915474, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.13.2.to_out.0.bias": { + "min": -0.18993628025054932, + "max": 0.19500213861465454, + "mean": -0.0012349991593509912, + "std": 0.06668674200773239, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.13.3.g": { + "min": 0.32912659645080566, + "max": 1.003253698348999, + "mean": 0.7192496061325073, + "std": 0.052594345062971115, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.13.4.ff.0.0.weight": { + "min": -0.2322535365819931, + "max": 0.24589639902114868, + "mean": 0.00018273374007549137, + "std": 0.0409013107419014, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.13.4.ff.0.0.bias": { + "min": -0.11447025835514069, + "max": 0.018959810957312584, + "mean": -0.04247897118330002, + "std": 0.018857870250940323, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.13.4.ff.2.weight": { + "min": -0.39094480872154236, + "max": 0.4085846245288849, + "mean": -2.156081063731108e-05, + "std": 0.0485350526869297, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.13.4.ff.2.bias": { + "min": -0.6941088438034058, + "max": 0.413074254989624, + "mean": 0.0008494330104440451, + "std": 0.060315798968076706, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.14.0.weight": { + "min": -0.0010608690790832043, + "max": 1.0004838705062866, + "mean": 0.0004881545901298523, + "std": 0.0220896415412426, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.14.1.g": { + "min": 0.9995023608207703, + "max": 1.004894495010376, + "mean": 1.0006191730499268, + "std": 0.0017806595424190164, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.14.2.to_q.weight": { + "min": -0.031253229826688766, + "max": 0.0312533862888813, + "mean": -2.1022129658376798e-05, + "std": 0.018033137544989586, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.14.2.to_q.bias": { + "min": -0.03121466003358364, + "max": 0.031230736523866653, + "mean": -0.0006770135369151831, + "std": 0.017827749252319336, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.14.2.to_k.weight": { + "min": -0.031253378838300705, + "max": 0.03125477209687233, + "mean": -8.833090760163032e-06, + "std": 0.018032172694802284, + "sparsity": 9.5367431640625e-07, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.14.2.to_k.bias": { + "min": -0.031231535598635674, + "max": 0.031244806945323944, + "mean": -0.0007297678967006505, + "std": 0.01794254779815674, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.14.2.to_v.weight": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.14.2.to_v.bias": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.14.2.to_out.0.weight": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.14.2.to_out.0.bias": { + "min": -0.00039897009264677763, + "max": 0.00031239030067808926, + "mean": -2.7656624297378585e-06, + "std": 0.00010500323696760461, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.14.3.g": { + "min": 0.9984675645828247, + "max": 1.005997896194458, + "mean": 0.9998568296432495, + "std": 0.0012546924408525229, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.14.4.ff.0.0.weight": { + "min": -0.032396964728832245, + "max": 0.032092805951833725, + "mean": -3.513969204504974e-08, + "std": 0.018030446022748947, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.14.4.ff.0.0.bias": { + "min": -0.03191046044230461, + "max": 0.03107621893286705, + "mean": -0.00026303951744921505, + "std": 0.018048185855150223, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.14.4.ff.2.weight": { + "min": -0.0011175514664500952, + "max": 0.0010112477466464043, + "mean": -6.1762216319039e-07, + "std": 0.0001866686943685636, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.14.4.ff.2.bias": { + "min": -0.0003427659103181213, + "max": 0.00032113981433212757, + "mean": -2.040310619122465e-06, + "std": 9.538298763800412e-05, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.15.0.weight": { + "min": -0.23462186753749847, + "max": 0.27271148562431335, + "mean": 6.776777354389196e-06, + "std": 0.018810205161571503, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.15.1.g": { + "min": 0.32134121656417847, + "max": 0.696171224117279, + "mean": 0.5816354155540466, + "std": 0.045965857803821564, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.15.2.to_q.weight": { + "min": -0.18210144340991974, + "max": 0.19822537899017334, + "mean": -1.1569689377211034e-05, + "std": 0.03318428248167038, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.15.2.to_q.bias": { + "min": -0.16075287759304047, + "max": 0.1296185702085495, + "mean": -0.0010708055924624205, + "std": 0.03414905443787575, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.15.2.to_k.weight": { + "min": -0.33257541060447693, + "max": 0.31164395809173584, + "mean": -1.0188834494329058e-05, + "std": 0.03223486989736557, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.15.2.to_k.bias": { + "min": -7.815314769744873, + "max": 8.776156425476074, + "mean": 0.09355179965496063, + "std": 1.6212124824523926, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.15.2.to_v.weight": { + "min": -0.2341691255569458, + "max": 0.2423291653394699, + "mean": 4.1637467802502215e-05, + "std": 0.040857378393411636, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.15.2.to_v.bias": { + "min": -0.0760289877653122, + "max": 0.065830759704113, + "mean": 0.00048469315515831113, + "std": 0.019415758550167084, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.15.2.to_out.0.weight": { + "min": -0.24639879167079926, + "max": 0.23466575145721436, + "mean": -3.0853516364004463e-06, + "std": 0.03943203389644623, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.15.2.to_out.0.bias": { + "min": -0.16285879909992218, + "max": 0.16076169908046722, + "mean": 0.0016295814421027899, + "std": 0.0652732104063034, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.15.3.g": { + "min": 0.5568758845329285, + "max": 0.9466937184333801, + "mean": 0.7129064202308655, + "std": 0.0403011329472065, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.15.4.ff.0.0.weight": { + "min": -0.22882379591464996, + "max": 0.25551655888557434, + "mean": -4.5426822907757014e-05, + "std": 0.0405760332942009, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.15.4.ff.0.0.bias": { + "min": -0.1351136714220047, + "max": 0.022313008084893227, + "mean": -0.04135293886065483, + "std": 0.01838735118508339, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.15.4.ff.2.weight": { + "min": -0.4227588474750519, + "max": 0.3930455446243286, + "mean": -4.085732143721543e-06, + "std": 0.047785546630620956, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.15.4.ff.2.bias": { + "min": -0.6080650687217712, + "max": 0.6521760821342468, + "mean": 0.0015855736564844847, + "std": 0.05685455724596977, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.16.0.weight": { + "min": -0.2519088387489319, + "max": 0.3208920359611511, + "mean": -6.068687071092427e-06, + "std": 0.01961320824921131, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.16.1.g": { + "min": 0.3596932888031006, + "max": 0.6842364072799683, + "mean": 0.5706857442855835, + "std": 0.042946916073560715, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.16.2.to_q.weight": { + "min": -0.22081606090068817, + "max": 0.1773088276386261, + "mean": -3.454893158050254e-05, + "std": 0.03429890051484108, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.16.2.to_q.bias": { + "min": -0.1636391431093216, + "max": 0.23335042595863342, + "mean": 0.00035607549943961203, + "std": 0.032843589782714844, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.16.2.to_k.weight": { + "min": -0.26433637738227844, + "max": 0.24021653831005096, + "mean": -5.268204404274002e-05, + "std": 0.033897630870342255, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.16.2.to_k.bias": { + "min": -4.859472751617432, + "max": 5.095940113067627, + "mean": 0.043871667236089706, + "std": 1.2294032573699951, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.16.2.to_v.weight": { + "min": -0.24689450860023499, + "max": 0.2507416307926178, + "mean": 7.20950702088885e-05, + "std": 0.04398806765675545, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.16.2.to_v.bias": { + "min": -0.062653087079525, + "max": 0.05465509742498398, + "mean": 0.0006480686133727431, + "std": 0.01719220168888569, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.16.2.to_out.0.weight": { + "min": -0.287101686000824, + "max": 0.27245277166366577, + "mean": -5.0120852392865345e-05, + "std": 0.04298638179898262, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.16.2.to_out.0.bias": { + "min": -0.16084662079811096, + "max": 0.17058779299259186, + "mean": -0.002887619426473975, + "std": 0.05928964540362358, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.16.3.g": { + "min": 0.5198022723197937, + "max": 0.9352366328239441, + "mean": 0.7134757041931152, + "std": 0.03851567581295967, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.16.4.ff.0.0.weight": { + "min": -0.23833467066287994, + "max": 0.24947485327720642, + "mean": 0.0004647623864002526, + "std": 0.040455412119627, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.16.4.ff.0.0.bias": { + "min": -0.1449345052242279, + "max": 0.041161470115184784, + "mean": -0.039693716913461685, + "std": 0.020549351349473, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.16.4.ff.2.weight": { + "min": -0.5341992378234863, + "max": 0.584149181842804, + "mean": 5.933919965173118e-06, + "std": 0.048861313611269, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.16.4.ff.2.bias": { + "min": -0.5195870399475098, + "max": 0.4941606819629669, + "mean": 0.0023631826043128967, + "std": 0.05346201732754707, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.17.0.weight": { + "min": -0.27384015917778015, + "max": 0.3156191408634186, + "mean": 1.960434929060284e-06, + "std": 0.020050089806318283, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.17.1.g": { + "min": 0.3661290407180786, + "max": 0.7137707471847534, + "mean": 0.5931426286697388, + "std": 0.045923035591840744, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.17.2.to_q.weight": { + "min": -0.21142390370368958, + "max": 0.1996057629585266, + "mean": 3.067640500376001e-05, + "std": 0.034866977483034134, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.17.2.to_q.bias": { + "min": -0.18729116022586823, + "max": 0.20393171906471252, + "mean": 0.0009568152017891407, + "std": 0.031525619328022, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.17.2.to_k.weight": { + "min": -0.29027533531188965, + "max": 0.34051838517189026, + "mean": -4.7230056225089356e-05, + "std": 0.03458789736032486, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.17.2.to_k.bias": { + "min": -3.881865978240967, + "max": 3.3913497924804688, + "mean": 0.014454022981226444, + "std": 0.8585575819015503, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.17.2.to_v.weight": { + "min": -0.22494949400424957, + "max": 0.25041675567626953, + "mean": -3.845839273708407e-06, + "std": 0.0422312431037426, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.17.2.to_v.bias": { + "min": -0.055274393409490585, + "max": 0.04683299362659454, + "mean": -1.701708242762834e-05, + "std": 0.015851490199565887, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.17.2.to_out.0.weight": { + "min": -0.29334571957588196, + "max": 0.2907007336616516, + "mean": -7.57977295506862e-06, + "std": 0.04194618761539459, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.17.2.to_out.0.bias": { + "min": -0.1247822642326355, + "max": 0.2594626247882843, + "mean": -0.0032404293306171894, + "std": 0.0531664676964283, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.17.3.g": { + "min": 0.4562881588935852, + "max": 0.8474717736244202, + "mean": 0.7055672407150269, + "std": 0.035394009202718735, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.17.4.ff.0.0.weight": { + "min": -0.5121109485626221, + "max": 0.34823864698410034, + "mean": 0.0003428200143389404, + "std": 0.04020027443766594, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.17.4.ff.0.0.bias": { + "min": -0.1863405406475067, + "max": 0.039554521441459656, + "mean": -0.03938986361026764, + "std": 0.02135385014116764, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.17.4.ff.2.weight": { + "min": -0.5456476807594299, + "max": 0.5576444864273071, + "mean": -7.10671374690719e-05, + "std": 0.050736188888549805, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.17.4.ff.2.bias": { + "min": -0.5122882723808289, + "max": 0.6650155782699585, + "mean": 0.0024437594693154097, + "std": 0.049542441964149475, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.18.0.weight": { + "min": -0.3326261341571808, + "max": 0.26606664061546326, + "mean": 3.3996070669672918e-06, + "std": 0.01938733644783497, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.18.1.g": { + "min": 0.32209691405296326, + "max": 0.7689979672431946, + "mean": 0.651018500328064, + "std": 0.045278150588274, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.18.2.to_q.weight": { + "min": -0.25021034479141235, + "max": 0.22022569179534912, + "mean": -2.263453097839374e-06, + "std": 0.0365014486014843, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.18.2.to_q.bias": { + "min": -0.32728204131126404, + "max": 0.28722772002220154, + "mean": -0.0006871280493214726, + "std": 0.038576990365982056, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.18.2.to_k.weight": { + "min": -0.3110663890838623, + "max": 0.37101635336875916, + "mean": 6.483237666543573e-05, + "std": 0.03624214604496956, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.18.2.to_k.bias": { + "min": -4.721696376800537, + "max": 5.813023090362549, + "mean": 0.037980761379003525, + "std": 1.4134187698364258, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.18.2.to_v.weight": { + "min": -0.22233453392982483, + "max": 0.20630262792110443, + "mean": -7.52985361032188e-05, + "std": 0.0424862764775753, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.18.2.to_v.bias": { + "min": -0.07775042951107025, + "max": 0.051466166973114014, + "mean": -0.0009254277683794498, + "std": 0.0164100993424654, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.18.2.to_out.0.weight": { + "min": -0.3309888541698456, + "max": 0.3296257257461548, + "mean": -4.630289367923979e-06, + "std": 0.04279271885752678, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.18.2.to_out.0.bias": { + "min": -0.2851186692714691, + "max": 0.11168244481086731, + "mean": -0.0012053586542606354, + "std": 0.04700839892029762, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.18.3.g": { + "min": 0.4862569272518158, + "max": 0.8893836140632629, + "mean": 0.7374457716941833, + "std": 0.03831757605075836, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.18.4.ff.0.0.weight": { + "min": -0.3624440133571625, + "max": 0.27509352564811707, + "mean": 5.130700083100237e-05, + "std": 0.040646348148584366, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.18.4.ff.0.0.bias": { + "min": -0.24782374501228333, + "max": 0.04648653045296669, + "mean": -0.0392659492790699, + "std": 0.023277943953871727, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.18.4.ff.2.weight": { + "min": -0.6279041171073914, + "max": 0.5983599424362183, + "mean": -6.208260310813785e-05, + "std": 0.05311836674809456, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.18.4.ff.2.bias": { + "min": -0.7105586528778076, + "max": 0.266210675239563, + "mean": 0.0009207880357280374, + "std": 0.05124485120177269, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.19.0.weight": { + "min": -0.3435235619544983, + "max": 0.30372199416160583, + "mean": 2.971426056319615e-07, + "std": 0.019135644659399986, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.19.1.g": { + "min": 0.34978553652763367, + "max": 0.7852374911308289, + "mean": 0.6388005018234253, + "std": 0.04921075701713562, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.19.2.to_q.weight": { + "min": -0.20607401430606842, + "max": 0.20750851929187775, + "mean": -5.96779900661204e-05, + "std": 0.037695422768592834, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.19.2.to_q.bias": { + "min": -0.2588743567466736, + "max": 0.2684256136417389, + "mean": -0.00040556711610406637, + "std": 0.04462844133377075, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.19.2.to_k.weight": { + "min": -0.3547278344631195, + "max": 0.32300710678100586, + "mean": -6.988519089645706e-06, + "std": 0.03720381483435631, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.19.2.to_k.bias": { + "min": -5.265876293182373, + "max": 4.207967281341553, + "mean": -0.026429325342178345, + "std": 1.0068732500076294, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.19.2.to_v.weight": { + "min": -0.2394271194934845, + "max": 0.24428503215312958, + "mean": -2.5281191483372822e-05, + "std": 0.04321092739701271, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.19.2.to_v.bias": { + "min": -0.06252460181713104, + "max": 0.056893154978752136, + "mean": 0.000347302237059921, + "std": 0.014152363874018192, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.19.2.to_out.0.weight": { + "min": -0.4372415244579315, + "max": 0.3737826347351074, + "mean": 1.467342644900782e-05, + "std": 0.04412253573536873, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.19.2.to_out.0.bias": { + "min": -0.09628994017839432, + "max": 0.17628277838230133, + "mean": -0.0006604281952604651, + "std": 0.03514600917696953, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.19.3.g": { + "min": 0.4217767119407654, + "max": 1.0722668170928955, + "mean": 0.7484005689620972, + "std": 0.04209807515144348, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.19.4.ff.0.0.weight": { + "min": -0.2667092978954315, + "max": 0.2975556254386902, + "mean": -7.937644113553688e-05, + "std": 0.04080634191632271, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.19.4.ff.0.0.bias": { + "min": -0.1854698657989502, + "max": 0.04349794238805771, + "mean": -0.03681644797325134, + "std": 0.02560725063085556, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.19.4.ff.2.weight": { + "min": -0.4579220414161682, + "max": 0.48784998059272766, + "mean": 4.282052395865321e-05, + "std": 0.05421200394630432, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.19.4.ff.2.bias": { + "min": -0.2866349518299103, + "max": 0.5520289540290833, + "mean": -0.0008793525630608201, + "std": 0.04783879220485687, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.20.0.weight": { + "min": -0.29281285405158997, + "max": 0.32289794087409973, + "mean": 6.245412805583328e-06, + "std": 0.019969133660197258, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.20.1.g": { + "min": 0.29108351469039917, + "max": 0.7621498107910156, + "mean": 0.6508013010025024, + "std": 0.05207887664437294, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.20.2.to_q.weight": { + "min": -0.2440386265516281, + "max": 0.2621654272079468, + "mean": -5.880815479031298e-06, + "std": 0.03961231932044029, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.20.2.to_q.bias": { + "min": -0.2678271532058716, + "max": 0.2002498358488083, + "mean": -0.0008784097735770047, + "std": 0.05178229510784149, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.20.2.to_k.weight": { + "min": -0.27257686853408813, + "max": 0.2541964650154114, + "mean": 4.526807060756255e-06, + "std": 0.038709431886672974, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.20.2.to_k.bias": { + "min": -12.982023239135742, + "max": 15.968067169189453, + "mean": 0.03324813023209572, + "std": 1.9908379316329956, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.20.2.to_v.weight": { + "min": -0.2077104151248932, + "max": 0.22651426494121552, + "mean": -7.221860869321972e-05, + "std": 0.040554750710725784, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.20.2.to_v.bias": { + "min": -0.06946562975645065, + "max": 0.06337178498506546, + "mean": 0.00015520014858338982, + "std": 0.01475033164024353, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.20.2.to_out.0.weight": { + "min": -0.46565988659858704, + "max": 0.3208334743976593, + "mean": 1.9561422959668562e-05, + "std": 0.040589939802885056, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.20.2.to_out.0.bias": { + "min": -0.064049631357193, + "max": 0.11550958454608917, + "mean": 0.0011937393574044108, + "std": 0.02470548450946808, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.20.3.g": { + "min": 0.3747756779193878, + "max": 0.9347750544548035, + "mean": 0.7509442567825317, + "std": 0.04021797329187393, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.20.4.ff.0.0.weight": { + "min": -0.2801269292831421, + "max": 0.27387121319770813, + "mean": -0.00016841593605931848, + "std": 0.040997058153152466, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.20.4.ff.0.0.bias": { + "min": -0.19878797233104706, + "max": 0.05111948773264885, + "mean": -0.032027605921030045, + "std": 0.025102604180574417, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.20.4.ff.2.weight": { + "min": -0.6596145033836365, + "max": 0.537032425403595, + "mean": -4.937778794555925e-05, + "std": 0.05284846946597099, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.20.4.ff.2.bias": { + "min": -0.1930496245622635, + "max": 0.5826522707939148, + "mean": -0.0005124770104885101, + "std": 0.04108353331685066, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.21.0.weight": { + "min": -0.41787075996398926, + "max": 0.37214192748069763, + "mean": 6.244237738428637e-06, + "std": 0.021621638908982277, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.21.1.g": { + "min": 0.21441777050495148, + "max": 0.7472008466720581, + "mean": 0.6494799852371216, + "std": 0.05431411787867546, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.21.2.to_q.weight": { + "min": -0.20989972352981567, + "max": 0.19592680037021637, + "mean": 4.0151899156626314e-05, + "std": 0.039461154490709305, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.21.2.to_q.bias": { + "min": -0.329771488904953, + "max": 0.25982508063316345, + "mean": -0.003228080226108432, + "std": 0.056280527263879776, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.21.2.to_k.weight": { + "min": -0.2062487006187439, + "max": 0.2551846504211426, + "mean": 5.400779264164157e-05, + "std": 0.038563843816518784, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.21.2.to_k.bias": { + "min": -6.2493767738342285, + "max": 6.938913345336914, + "mean": 0.04840244725346565, + "std": 1.3855851888656616, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.21.2.to_v.weight": { + "min": -0.21009960770606995, + "max": 0.23065192997455597, + "mean": -5.2159043661959e-06, + "std": 0.041313353925943375, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.21.2.to_v.bias": { + "min": -0.0439465157687664, + "max": 0.03601067140698433, + "mean": -2.0584266167134047e-06, + "std": 0.012799846939742565, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.21.2.to_out.0.weight": { + "min": -0.39804428815841675, + "max": 0.34499886631965637, + "mean": -5.5499749578302726e-05, + "std": 0.04238968715071678, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.21.2.to_out.0.bias": { + "min": -0.055174216628074646, + "max": 0.06293413788080215, + "mean": 0.00036305427784100175, + "std": 0.01867016963660717, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.21.3.g": { + "min": 0.3503042459487915, + "max": 1.0480320453643799, + "mean": 0.7894532084465027, + "std": 0.048786185681819916, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.21.4.ff.0.0.weight": { + "min": -0.3337661623954773, + "max": 0.3864375650882721, + "mean": -0.00016956219042185694, + "std": 0.04148184508085251, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.21.4.ff.0.0.bias": { + "min": -0.15768638253211975, + "max": 0.05907022953033447, + "mean": -0.031832221895456314, + "std": 0.0251291636377573, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.21.4.ff.2.weight": { + "min": -0.6973653435707092, + "max": 0.47017383575439453, + "mean": -8.81649466464296e-05, + "std": 0.051795393228530884, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.21.4.ff.2.bias": { + "min": -0.24848268926143646, + "max": 0.32916560769081116, + "mean": -0.0002544308081269264, + "std": 0.041454534977674484, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.22.0.weight": { + "min": -0.2872900664806366, + "max": 0.3505076766014099, + "mean": -2.3586867428093683e-06, + "std": 0.024236176162958145, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.22.1.g": { + "min": 0.19670914113521576, + "max": 0.7788708806037903, + "mean": 0.6702359914779663, + "std": 0.05864134803414345, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.22.2.to_q.weight": { + "min": -0.2293103188276291, + "max": 0.23172836005687714, + "mean": -2.0263662008801475e-05, + "std": 0.04043755307793617, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.22.2.to_q.bias": { + "min": -0.2201755940914154, + "max": 0.2412194311618805, + "mean": 0.0007778588915243745, + "std": 0.05583813413977623, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.22.2.to_k.weight": { + "min": -0.21645531058311462, + "max": 0.2269156575202942, + "mean": -7.186527363955975e-05, + "std": 0.03937343880534172, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.22.2.to_k.bias": { + "min": -8.91368579864502, + "max": 9.076720237731934, + "mean": -0.0012592850252985954, + "std": 1.8490537405014038, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.22.2.to_v.weight": { + "min": -0.2699006199836731, + "max": 0.2594479024410248, + "mean": 4.3596926843747497e-05, + "std": 0.03840681165456772, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.22.2.to_v.bias": { + "min": -0.05783012881875038, + "max": 0.057821568101644516, + "mean": 0.0003521823091432452, + "std": 0.014716818928718567, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.22.2.to_out.0.weight": { + "min": -0.26518943905830383, + "max": 0.2887333035469055, + "mean": -6.169862172100693e-05, + "std": 0.03907295688986778, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.22.2.to_out.0.bias": { + "min": -0.04396004229784012, + "max": 0.037220947444438934, + "mean": -9.395174856763333e-05, + "std": 0.013354334980249405, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.22.3.g": { + "min": 0.33940210938453674, + "max": 1.0958820581436157, + "mean": 0.8637964129447937, + "std": 0.06389264762401581, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.22.4.ff.0.0.weight": { + "min": -0.4235352873802185, + "max": 0.41927266120910645, + "mean": 0.000313018070301041, + "std": 0.04350249841809273, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.22.4.ff.0.0.bias": { + "min": -0.21509824693202972, + "max": 0.17092689871788025, + "mean": -0.0294746495783329, + "std": 0.03193298354744911, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.22.4.ff.2.weight": { + "min": -0.6005915999412537, + "max": 0.5609812140464783, + "mean": -0.00015016092220321298, + "std": 0.05344870314002037, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.22.4.ff.2.bias": { + "min": -0.17891772091388702, + "max": 0.3774968683719635, + "mean": 0.0013590974267572165, + "std": 0.03732309862971306, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.23.0.weight": { + "min": -0.39461401104927063, + "max": 0.36924391984939575, + "mean": 3.7040204915683717e-05, + "std": 0.028616365045309067, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.23.1.g": { + "min": 0.29045382142066956, + "max": 0.8264784812927246, + "mean": 0.7055213451385498, + "std": 0.0678410679101944, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.23.2.to_q.weight": { + "min": -0.9263020753860474, + "max": 1.0267603397369385, + "mean": -2.6431953301653266e-05, + "std": 0.04762791842222214, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.23.2.to_q.bias": { + "min": -0.8796241879463196, + "max": 0.8164305686950684, + "mean": -0.0003041320014744997, + "std": 0.0956113338470459, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.23.2.to_k.weight": { + "min": -0.27020347118377686, + "max": 0.241440087556839, + "mean": -2.271639823447913e-05, + "std": 0.038950297981500626, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.23.2.to_k.bias": { + "min": -23.76431655883789, + "max": 22.871889114379883, + "mean": -0.09189724177122116, + "std": 4.073054313659668, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.23.2.to_v.weight": { + "min": -0.22821645438671112, + "max": 0.24578580260276794, + "mean": -2.5681954866740853e-05, + "std": 0.03863786533474922, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.23.2.to_v.bias": { + "min": -0.06044214218854904, + "max": 0.04586166515946388, + "mean": -0.00014234766422305256, + "std": 0.014693022705614567, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.23.2.to_out.0.weight": { + "min": -0.3386403024196625, + "max": 0.3753957748413086, + "mean": 7.493808880099095e-06, + "std": 0.04081406444311142, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.23.2.to_out.0.bias": { + "min": -0.04647579416632652, + "max": 0.19592434167861938, + "mean": 0.00027245082310400903, + "std": 0.01356989610940218, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.23.3.g": { + "min": 0.3743247389793396, + "max": 1.133009910583496, + "mean": 0.8900730609893799, + "std": 0.06399820744991302, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.23.4.ff.0.0.weight": { + "min": -0.44806551933288574, + "max": 0.5433648824691772, + "mean": 2.4754037440288812e-05, + "std": 0.04556819051504135, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.23.4.ff.0.0.bias": { + "min": -0.22422385215759277, + "max": 0.08793910592794418, + "mean": -0.03202162683010101, + "std": 0.03776844963431358, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.23.4.ff.2.weight": { + "min": -0.7274155616760254, + "max": 0.6907259225845337, + "mean": 3.4943295759148896e-05, + "std": 0.05178087204694748, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.23.4.ff.2.bias": { + "min": -0.17463494837284088, + "max": 0.2185920923948288, + "mean": 3.897436545230448e-05, + "std": 0.031783733516931534, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.24.0.weight": { + "min": -0.34052687883377075, + "max": 0.37423866987228394, + "mean": 4.304847971070558e-05, + "std": 0.034138280898332596, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.24.1.g": { + "min": 0.3175727128982544, + "max": 1.290410041809082, + "mean": 0.6015003323554993, + "std": 0.08363870531320572, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.24.2.to_q.weight": { + "min": -0.28354600071907043, + "max": 0.260841429233551, + "mean": -3.130652658001054e-06, + "std": 0.035979557782411575, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.24.2.to_q.bias": { + "min": -0.23592722415924072, + "max": 0.2057497352361679, + "mean": 0.00023727506049908698, + "std": 0.056021153926849365, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.24.2.to_k.weight": { + "min": -0.43595167994499207, + "max": 0.32549113035202026, + "mean": 2.434775342408102e-05, + "std": 0.034129101783037186, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.24.2.to_k.bias": { + "min": -5.553627967834473, + "max": 7.324089527130127, + "mean": -0.007399275898933411, + "std": 0.7001854181289673, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.24.2.to_v.weight": { + "min": -0.34464672207832336, + "max": 0.3639456331729889, + "mean": 0.0001033150329021737, + "std": 0.047829318791627884, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.24.2.to_v.bias": { + "min": -0.0738968476653099, + "max": 0.060446880757808685, + "mean": 0.0009350795298814774, + "std": 0.014948361553251743, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.24.2.to_out.0.weight": { + "min": -0.2562582790851593, + "max": 0.28724488615989685, + "mean": 4.657229510485195e-06, + "std": 0.0415559858083725, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.24.2.to_out.0.bias": { + "min": -0.05538095533847809, + "max": 0.06288731843233109, + "mean": 0.00013551797019317746, + "std": 0.007167231757193804, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.24.3.g": { + "min": 0.4939861297607422, + "max": 1.2202398777008057, + "mean": 1.013412356376648, + "std": 0.1173911765217781, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.24.4.ff.0.0.weight": { + "min": -1.0939209461212158, + "max": 1.0473735332489014, + "mean": -4.927456029690802e-05, + "std": 0.05241009593009949, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.24.4.ff.0.0.bias": { + "min": -0.22382217645645142, + "max": 0.1730560064315796, + "mean": -0.027248641476035118, + "std": 0.03636055067181587, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.24.4.ff.2.weight": { + "min": -0.8865154385566711, + "max": 0.9247081279754639, + "mean": -0.00014585975441150367, + "std": 0.0532848984003067, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.24.4.ff.2.bias": { + "min": -0.17122139036655426, + "max": 0.38014623522758484, + "mean": 0.0033699313644319773, + "std": 0.03990361467003822, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.25.0.weight": { + "min": -0.7786033749580383, + "max": 0.7243013381958008, + "mean": 1.8795288269757293e-05, + "std": 0.046159159392118454, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.25.1.g": { + "min": 0.3385763168334961, + "max": 1.4310884475708008, + "mean": 0.9482859969139099, + "std": 0.20665791630744934, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.25.2.to_q.weight": { + "min": -1.745840311050415, + "max": 1.7046537399291992, + "mean": 0.00022703518334310502, + "std": 0.15869012475013733, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.25.2.to_q.bias": { + "min": -1.2008079290390015, + "max": 1.1013628244400024, + "mean": -0.009554527699947357, + "std": 0.20401149988174438, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.25.2.to_k.weight": { + "min": -0.4215790033340454, + "max": 0.427647203207016, + "mean": 6.439993012463674e-05, + "std": 0.048017047345638275, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.25.2.to_k.bias": { + "min": -19.76506996154785, + "max": 19.559972763061523, + "mean": -0.24841785430908203, + "std": 4.7801384925842285, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.25.2.to_v.weight": { + "min": -0.32463034987449646, + "max": 0.4392913281917572, + "mean": -1.1934026588278357e-05, + "std": 0.046162351965904236, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.25.2.to_v.bias": { + "min": -0.03394031897187233, + "max": 0.03703805059194565, + "mean": 0.0006406006286852062, + "std": 0.012916130013763905, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.25.2.to_out.0.weight": { + "min": -0.7043119668960571, + "max": 0.6668245792388916, + "mean": 4.3251380702713504e-05, + "std": 0.05788382515311241, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.25.2.to_out.0.bias": { + "min": -0.07238046824932098, + "max": 0.06770296394824982, + "mean": -0.00013378039875533432, + "std": 0.012917297892272472, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.25.3.g": { + "min": 0.38019153475761414, + "max": 1.391236424446106, + "mean": 1.0665456056594849, + "std": 0.21965359151363373, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.25.4.ff.0.0.weight": { + "min": -0.6170499324798584, + "max": 0.718601405620575, + "mean": 0.00011217871360713616, + "std": 0.058021701872348785, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.25.4.ff.0.0.bias": { + "min": -0.21975933015346527, + "max": 0.22518815100193024, + "mean": 0.006216429639607668, + "std": 0.049728311598300934, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.25.4.ff.2.weight": { + "min": -0.6300503015518188, + "max": 0.8897712826728821, + "mean": 1.1653193723759614e-05, + "std": 0.023531364277005196, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.25.4.ff.2.bias": { + "min": -0.5075116753578186, + "max": 0.47451627254486084, + "mean": -0.0030209918040782213, + "std": 0.06935632228851318, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.norm_out.g": { + "min": 0.5379416942596436, + "max": 1.1812505722045898, + "mean": 0.7826943397521973, + "std": 0.0987553521990776, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.proj_out.weight": { + "min": -0.26785895228385925, + "max": 0.21342454850673676, + "mean": -0.0002236703730886802, + "std": 0.05399824678897858, + "sparsity": 0.0, + "shape": [ + 100, + 1024 + ] + }, + "transformer.proj_out.bias": { + "min": -0.23829060792922974, + "max": 0.014859253540635109, + "mean": -0.043948449194431305, + "std": 0.034328024834394455, + "sparsity": 0.0, + "shape": [ + 100 + ] + } + } +} \ No newline at end of file