diff --git "a/model_analysis.json" "b/model_analysis.json" new file mode 100644--- /dev/null +++ "b/model_analysis.json" @@ -0,0 +1,4683 @@ +{ + "layer_types": { + "transformer": 391 + }, + "parameter_counts": { + "transformer.time_embed.time_mlp.0.weight": 262144, + "transformer.time_embed.time_mlp.0.bias": 1024, + "transformer.time_embed.time_mlp.2.weight": 1048576, + "transformer.time_embed.time_mlp.2.bias": 1024, + "transformer.text_embed.text_embed.weight": 254600, + "transformer.input_embed.proj.weight": 307200, + "transformer.input_embed.proj.bias": 1024, + "transformer.input_embed.conv_pos_embed.conv1d.0.weight": 2031616, + "transformer.input_embed.conv_pos_embed.conv1d.0.bias": 1024, + "transformer.input_embed.conv_pos_embed.conv1d.2.weight": 2031616, + "transformer.input_embed.conv_pos_embed.conv1d.2.bias": 1024, + "transformer.layers.0.1.g": 1024, + "transformer.layers.0.2.to_q.weight": 1048576, + "transformer.layers.0.2.to_q.bias": 1024, + "transformer.layers.0.2.to_k.weight": 1048576, + "transformer.layers.0.2.to_k.bias": 1024, + "transformer.layers.0.2.to_v.weight": 1048576, + "transformer.layers.0.2.to_v.bias": 1024, + "transformer.layers.0.2.to_out.0.weight": 1048576, + "transformer.layers.0.2.to_out.0.bias": 1024, + "transformer.layers.0.3.g": 1024, + "transformer.layers.0.4.ff.0.0.weight": 4194304, + "transformer.layers.0.4.ff.0.0.bias": 4096, + "transformer.layers.0.4.ff.2.weight": 4194304, + "transformer.layers.0.4.ff.2.bias": 1024, + "transformer.layers.1.1.g": 1024, + "transformer.layers.1.2.to_q.weight": 1048576, + "transformer.layers.1.2.to_q.bias": 1024, + "transformer.layers.1.2.to_k.weight": 1048576, + "transformer.layers.1.2.to_k.bias": 1024, + "transformer.layers.1.2.to_v.weight": 1048576, + "transformer.layers.1.2.to_v.bias": 1024, + "transformer.layers.1.2.to_out.0.weight": 1048576, + "transformer.layers.1.2.to_out.0.bias": 1024, + "transformer.layers.1.3.g": 1024, + "transformer.layers.1.4.ff.0.0.weight": 4194304, + "transformer.layers.1.4.ff.0.0.bias": 4096, + "transformer.layers.1.4.ff.2.weight": 4194304, + "transformer.layers.1.4.ff.2.bias": 1024, + "transformer.layers.2.1.g": 1024, + "transformer.layers.2.2.to_q.weight": 1048576, + "transformer.layers.2.2.to_q.bias": 1024, + "transformer.layers.2.2.to_k.weight": 1048576, + "transformer.layers.2.2.to_k.bias": 1024, + "transformer.layers.2.2.to_v.weight": 1048576, + "transformer.layers.2.2.to_v.bias": 1024, + "transformer.layers.2.2.to_out.0.weight": 1048576, + "transformer.layers.2.2.to_out.0.bias": 1024, + "transformer.layers.2.3.g": 1024, + "transformer.layers.2.4.ff.0.0.weight": 4194304, + "transformer.layers.2.4.ff.0.0.bias": 4096, + "transformer.layers.2.4.ff.2.weight": 4194304, + "transformer.layers.2.4.ff.2.bias": 1024, + "transformer.layers.3.1.g": 1024, + "transformer.layers.3.2.to_q.weight": 1048576, + "transformer.layers.3.2.to_q.bias": 1024, + "transformer.layers.3.2.to_k.weight": 1048576, + "transformer.layers.3.2.to_k.bias": 1024, + "transformer.layers.3.2.to_v.weight": 1048576, + "transformer.layers.3.2.to_v.bias": 1024, + "transformer.layers.3.2.to_out.0.weight": 1048576, + "transformer.layers.3.2.to_out.0.bias": 1024, + "transformer.layers.3.3.g": 1024, + "transformer.layers.3.4.ff.0.0.weight": 4194304, + "transformer.layers.3.4.ff.0.0.bias": 4096, + "transformer.layers.3.4.ff.2.weight": 4194304, + "transformer.layers.3.4.ff.2.bias": 1024, + "transformer.layers.4.1.g": 1024, + "transformer.layers.4.2.to_q.weight": 1048576, + "transformer.layers.4.2.to_q.bias": 1024, + "transformer.layers.4.2.to_k.weight": 1048576, + "transformer.layers.4.2.to_k.bias": 1024, + "transformer.layers.4.2.to_v.weight": 1048576, + "transformer.layers.4.2.to_v.bias": 1024, + "transformer.layers.4.2.to_out.0.weight": 1048576, + "transformer.layers.4.2.to_out.0.bias": 1024, + "transformer.layers.4.3.g": 1024, + "transformer.layers.4.4.ff.0.0.weight": 4194304, + "transformer.layers.4.4.ff.0.0.bias": 4096, + "transformer.layers.4.4.ff.2.weight": 4194304, + "transformer.layers.4.4.ff.2.bias": 1024, + "transformer.layers.5.1.g": 1024, + "transformer.layers.5.2.to_q.weight": 1048576, + "transformer.layers.5.2.to_q.bias": 1024, + "transformer.layers.5.2.to_k.weight": 1048576, + "transformer.layers.5.2.to_k.bias": 1024, + "transformer.layers.5.2.to_v.weight": 1048576, + "transformer.layers.5.2.to_v.bias": 1024, + "transformer.layers.5.2.to_out.0.weight": 1048576, + "transformer.layers.5.2.to_out.0.bias": 1024, + "transformer.layers.5.3.g": 1024, + "transformer.layers.5.4.ff.0.0.weight": 4194304, + "transformer.layers.5.4.ff.0.0.bias": 4096, + "transformer.layers.5.4.ff.2.weight": 4194304, + "transformer.layers.5.4.ff.2.bias": 1024, + "transformer.layers.6.1.g": 1024, + "transformer.layers.6.2.to_q.weight": 1048576, + "transformer.layers.6.2.to_q.bias": 1024, + "transformer.layers.6.2.to_k.weight": 1048576, + "transformer.layers.6.2.to_k.bias": 1024, + "transformer.layers.6.2.to_v.weight": 1048576, + "transformer.layers.6.2.to_v.bias": 1024, + "transformer.layers.6.2.to_out.0.weight": 1048576, + "transformer.layers.6.2.to_out.0.bias": 1024, + "transformer.layers.6.3.g": 1024, + "transformer.layers.6.4.ff.0.0.weight": 4194304, + "transformer.layers.6.4.ff.0.0.bias": 4096, + "transformer.layers.6.4.ff.2.weight": 4194304, + "transformer.layers.6.4.ff.2.bias": 1024, + "transformer.layers.7.1.g": 1024, + "transformer.layers.7.2.to_q.weight": 1048576, + "transformer.layers.7.2.to_q.bias": 1024, + "transformer.layers.7.2.to_k.weight": 1048576, + "transformer.layers.7.2.to_k.bias": 1024, + "transformer.layers.7.2.to_v.weight": 1048576, + "transformer.layers.7.2.to_v.bias": 1024, + "transformer.layers.7.2.to_out.0.weight": 1048576, + "transformer.layers.7.2.to_out.0.bias": 1024, + "transformer.layers.7.3.g": 1024, + "transformer.layers.7.4.ff.0.0.weight": 4194304, + "transformer.layers.7.4.ff.0.0.bias": 4096, + "transformer.layers.7.4.ff.2.weight": 4194304, + "transformer.layers.7.4.ff.2.bias": 1024, + "transformer.layers.8.1.g": 1024, + "transformer.layers.8.2.to_q.weight": 1048576, + "transformer.layers.8.2.to_q.bias": 1024, + "transformer.layers.8.2.to_k.weight": 1048576, + "transformer.layers.8.2.to_k.bias": 1024, + "transformer.layers.8.2.to_v.weight": 1048576, + "transformer.layers.8.2.to_v.bias": 1024, + "transformer.layers.8.2.to_out.0.weight": 1048576, + "transformer.layers.8.2.to_out.0.bias": 1024, + "transformer.layers.8.3.g": 1024, + "transformer.layers.8.4.ff.0.0.weight": 4194304, + "transformer.layers.8.4.ff.0.0.bias": 4096, + "transformer.layers.8.4.ff.2.weight": 4194304, + "transformer.layers.8.4.ff.2.bias": 1024, + "transformer.layers.9.1.g": 1024, + "transformer.layers.9.2.to_q.weight": 1048576, + "transformer.layers.9.2.to_q.bias": 1024, + "transformer.layers.9.2.to_k.weight": 1048576, + "transformer.layers.9.2.to_k.bias": 1024, + "transformer.layers.9.2.to_v.weight": 1048576, + "transformer.layers.9.2.to_v.bias": 1024, + "transformer.layers.9.2.to_out.0.weight": 1048576, + "transformer.layers.9.2.to_out.0.bias": 1024, + "transformer.layers.9.3.g": 1024, + "transformer.layers.9.4.ff.0.0.weight": 4194304, + "transformer.layers.9.4.ff.0.0.bias": 4096, + "transformer.layers.9.4.ff.2.weight": 4194304, + "transformer.layers.9.4.ff.2.bias": 1024, + "transformer.layers.10.1.g": 1024, + "transformer.layers.10.2.to_q.weight": 1048576, + "transformer.layers.10.2.to_q.bias": 1024, + "transformer.layers.10.2.to_k.weight": 1048576, + "transformer.layers.10.2.to_k.bias": 1024, + "transformer.layers.10.2.to_v.weight": 1048576, + "transformer.layers.10.2.to_v.bias": 1024, + "transformer.layers.10.2.to_out.0.weight": 1048576, + "transformer.layers.10.2.to_out.0.bias": 1024, + "transformer.layers.10.3.g": 1024, + "transformer.layers.10.4.ff.0.0.weight": 4194304, + "transformer.layers.10.4.ff.0.0.bias": 4096, + "transformer.layers.10.4.ff.2.weight": 4194304, + "transformer.layers.10.4.ff.2.bias": 1024, + "transformer.layers.11.1.g": 1024, + "transformer.layers.11.2.to_q.weight": 1048576, + "transformer.layers.11.2.to_q.bias": 1024, + "transformer.layers.11.2.to_k.weight": 1048576, + "transformer.layers.11.2.to_k.bias": 1024, + "transformer.layers.11.2.to_v.weight": 1048576, + "transformer.layers.11.2.to_v.bias": 1024, + "transformer.layers.11.2.to_out.0.weight": 1048576, + "transformer.layers.11.2.to_out.0.bias": 1024, + "transformer.layers.11.3.g": 1024, + "transformer.layers.11.4.ff.0.0.weight": 4194304, + "transformer.layers.11.4.ff.0.0.bias": 4096, + "transformer.layers.11.4.ff.2.weight": 4194304, + "transformer.layers.11.4.ff.2.bias": 1024, + "transformer.layers.12.1.g": 1024, + "transformer.layers.12.2.to_q.weight": 1048576, + "transformer.layers.12.2.to_q.bias": 1024, + "transformer.layers.12.2.to_k.weight": 1048576, + "transformer.layers.12.2.to_k.bias": 1024, + "transformer.layers.12.2.to_v.weight": 1048576, + "transformer.layers.12.2.to_v.bias": 1024, + "transformer.layers.12.2.to_out.0.weight": 1048576, + "transformer.layers.12.2.to_out.0.bias": 1024, + "transformer.layers.12.3.g": 1024, + "transformer.layers.12.4.ff.0.0.weight": 4194304, + "transformer.layers.12.4.ff.0.0.bias": 4096, + "transformer.layers.12.4.ff.2.weight": 4194304, + "transformer.layers.12.4.ff.2.bias": 1024, + "transformer.layers.13.0.weight": 2097152, + "transformer.layers.13.1.g": 1024, + "transformer.layers.13.2.to_q.weight": 1048576, + "transformer.layers.13.2.to_q.bias": 1024, + "transformer.layers.13.2.to_k.weight": 1048576, + "transformer.layers.13.2.to_k.bias": 1024, + "transformer.layers.13.2.to_v.weight": 1048576, + "transformer.layers.13.2.to_v.bias": 1024, + "transformer.layers.13.2.to_out.0.weight": 1048576, + "transformer.layers.13.2.to_out.0.bias": 1024, + "transformer.layers.13.3.g": 1024, + "transformer.layers.13.4.ff.0.0.weight": 4194304, + "transformer.layers.13.4.ff.0.0.bias": 4096, + "transformer.layers.13.4.ff.2.weight": 4194304, + "transformer.layers.13.4.ff.2.bias": 1024, + "transformer.layers.14.0.weight": 2097152, + "transformer.layers.14.1.g": 1024, + "transformer.layers.14.2.to_q.weight": 1048576, + "transformer.layers.14.2.to_q.bias": 1024, + "transformer.layers.14.2.to_k.weight": 1048576, + "transformer.layers.14.2.to_k.bias": 1024, + "transformer.layers.14.2.to_v.weight": 1048576, + "transformer.layers.14.2.to_v.bias": 1024, + "transformer.layers.14.2.to_out.0.weight": 1048576, + "transformer.layers.14.2.to_out.0.bias": 1024, + "transformer.layers.14.3.g": 1024, + "transformer.layers.14.4.ff.0.0.weight": 4194304, + "transformer.layers.14.4.ff.0.0.bias": 4096, + "transformer.layers.14.4.ff.2.weight": 4194304, + "transformer.layers.14.4.ff.2.bias": 1024, + "transformer.layers.15.0.weight": 2097152, + "transformer.layers.15.1.g": 1024, + "transformer.layers.15.2.to_q.weight": 1048576, + "transformer.layers.15.2.to_q.bias": 1024, + "transformer.layers.15.2.to_k.weight": 1048576, + "transformer.layers.15.2.to_k.bias": 1024, + "transformer.layers.15.2.to_v.weight": 1048576, + "transformer.layers.15.2.to_v.bias": 1024, + "transformer.layers.15.2.to_out.0.weight": 1048576, + "transformer.layers.15.2.to_out.0.bias": 1024, + "transformer.layers.15.3.g": 1024, + "transformer.layers.15.4.ff.0.0.weight": 4194304, + "transformer.layers.15.4.ff.0.0.bias": 4096, + "transformer.layers.15.4.ff.2.weight": 4194304, + "transformer.layers.15.4.ff.2.bias": 1024, + "transformer.layers.16.0.weight": 2097152, + "transformer.layers.16.1.g": 1024, + "transformer.layers.16.2.to_q.weight": 1048576, + "transformer.layers.16.2.to_q.bias": 1024, + "transformer.layers.16.2.to_k.weight": 1048576, + "transformer.layers.16.2.to_k.bias": 1024, + "transformer.layers.16.2.to_v.weight": 1048576, + "transformer.layers.16.2.to_v.bias": 1024, + "transformer.layers.16.2.to_out.0.weight": 1048576, + "transformer.layers.16.2.to_out.0.bias": 1024, + "transformer.layers.16.3.g": 1024, + "transformer.layers.16.4.ff.0.0.weight": 4194304, + "transformer.layers.16.4.ff.0.0.bias": 4096, + "transformer.layers.16.4.ff.2.weight": 4194304, + "transformer.layers.16.4.ff.2.bias": 1024, + "transformer.layers.17.0.weight": 2097152, + "transformer.layers.17.1.g": 1024, + "transformer.layers.17.2.to_q.weight": 1048576, + "transformer.layers.17.2.to_q.bias": 1024, + "transformer.layers.17.2.to_k.weight": 1048576, + "transformer.layers.17.2.to_k.bias": 1024, + "transformer.layers.17.2.to_v.weight": 1048576, + "transformer.layers.17.2.to_v.bias": 1024, + "transformer.layers.17.2.to_out.0.weight": 1048576, + "transformer.layers.17.2.to_out.0.bias": 1024, + "transformer.layers.17.3.g": 1024, + "transformer.layers.17.4.ff.0.0.weight": 4194304, + "transformer.layers.17.4.ff.0.0.bias": 4096, + "transformer.layers.17.4.ff.2.weight": 4194304, + "transformer.layers.17.4.ff.2.bias": 1024, + "transformer.layers.18.0.weight": 2097152, + "transformer.layers.18.1.g": 1024, + "transformer.layers.18.2.to_q.weight": 1048576, + "transformer.layers.18.2.to_q.bias": 1024, + "transformer.layers.18.2.to_k.weight": 1048576, + "transformer.layers.18.2.to_k.bias": 1024, + "transformer.layers.18.2.to_v.weight": 1048576, + "transformer.layers.18.2.to_v.bias": 1024, + "transformer.layers.18.2.to_out.0.weight": 1048576, + "transformer.layers.18.2.to_out.0.bias": 1024, + "transformer.layers.18.3.g": 1024, + "transformer.layers.18.4.ff.0.0.weight": 4194304, + "transformer.layers.18.4.ff.0.0.bias": 4096, + "transformer.layers.18.4.ff.2.weight": 4194304, + "transformer.layers.18.4.ff.2.bias": 1024, + "transformer.layers.19.0.weight": 2097152, + "transformer.layers.19.1.g": 1024, + "transformer.layers.19.2.to_q.weight": 1048576, + "transformer.layers.19.2.to_q.bias": 1024, + "transformer.layers.19.2.to_k.weight": 1048576, + "transformer.layers.19.2.to_k.bias": 1024, + "transformer.layers.19.2.to_v.weight": 1048576, + "transformer.layers.19.2.to_v.bias": 1024, + "transformer.layers.19.2.to_out.0.weight": 1048576, + "transformer.layers.19.2.to_out.0.bias": 1024, + "transformer.layers.19.3.g": 1024, + "transformer.layers.19.4.ff.0.0.weight": 4194304, + "transformer.layers.19.4.ff.0.0.bias": 4096, + "transformer.layers.19.4.ff.2.weight": 4194304, + "transformer.layers.19.4.ff.2.bias": 1024, + "transformer.layers.20.0.weight": 2097152, + "transformer.layers.20.1.g": 1024, + "transformer.layers.20.2.to_q.weight": 1048576, + "transformer.layers.20.2.to_q.bias": 1024, + "transformer.layers.20.2.to_k.weight": 1048576, + "transformer.layers.20.2.to_k.bias": 1024, + "transformer.layers.20.2.to_v.weight": 1048576, + "transformer.layers.20.2.to_v.bias": 1024, + "transformer.layers.20.2.to_out.0.weight": 1048576, + "transformer.layers.20.2.to_out.0.bias": 1024, + "transformer.layers.20.3.g": 1024, + "transformer.layers.20.4.ff.0.0.weight": 4194304, + "transformer.layers.20.4.ff.0.0.bias": 4096, + "transformer.layers.20.4.ff.2.weight": 4194304, + "transformer.layers.20.4.ff.2.bias": 1024, + "transformer.layers.21.0.weight": 2097152, + "transformer.layers.21.1.g": 1024, + "transformer.layers.21.2.to_q.weight": 1048576, + "transformer.layers.21.2.to_q.bias": 1024, + "transformer.layers.21.2.to_k.weight": 1048576, + "transformer.layers.21.2.to_k.bias": 1024, + "transformer.layers.21.2.to_v.weight": 1048576, + "transformer.layers.21.2.to_v.bias": 1024, + "transformer.layers.21.2.to_out.0.weight": 1048576, + "transformer.layers.21.2.to_out.0.bias": 1024, + "transformer.layers.21.3.g": 1024, + "transformer.layers.21.4.ff.0.0.weight": 4194304, + "transformer.layers.21.4.ff.0.0.bias": 4096, + "transformer.layers.21.4.ff.2.weight": 4194304, + "transformer.layers.21.4.ff.2.bias": 1024, + "transformer.layers.22.0.weight": 2097152, + "transformer.layers.22.1.g": 1024, + "transformer.layers.22.2.to_q.weight": 1048576, + "transformer.layers.22.2.to_q.bias": 1024, + "transformer.layers.22.2.to_k.weight": 1048576, + "transformer.layers.22.2.to_k.bias": 1024, + "transformer.layers.22.2.to_v.weight": 1048576, + "transformer.layers.22.2.to_v.bias": 1024, + "transformer.layers.22.2.to_out.0.weight": 1048576, + "transformer.layers.22.2.to_out.0.bias": 1024, + "transformer.layers.22.3.g": 1024, + "transformer.layers.22.4.ff.0.0.weight": 4194304, + "transformer.layers.22.4.ff.0.0.bias": 4096, + "transformer.layers.22.4.ff.2.weight": 4194304, + "transformer.layers.22.4.ff.2.bias": 1024, + "transformer.layers.23.0.weight": 2097152, + "transformer.layers.23.1.g": 1024, + "transformer.layers.23.2.to_q.weight": 1048576, + "transformer.layers.23.2.to_q.bias": 1024, + "transformer.layers.23.2.to_k.weight": 1048576, + "transformer.layers.23.2.to_k.bias": 1024, + "transformer.layers.23.2.to_v.weight": 1048576, + "transformer.layers.23.2.to_v.bias": 1024, + "transformer.layers.23.2.to_out.0.weight": 1048576, + "transformer.layers.23.2.to_out.0.bias": 1024, + "transformer.layers.23.3.g": 1024, + "transformer.layers.23.4.ff.0.0.weight": 4194304, + "transformer.layers.23.4.ff.0.0.bias": 4096, + "transformer.layers.23.4.ff.2.weight": 4194304, + "transformer.layers.23.4.ff.2.bias": 1024, + "transformer.layers.24.0.weight": 2097152, + "transformer.layers.24.1.g": 1024, + "transformer.layers.24.2.to_q.weight": 1048576, + "transformer.layers.24.2.to_q.bias": 1024, + "transformer.layers.24.2.to_k.weight": 1048576, + "transformer.layers.24.2.to_k.bias": 1024, + "transformer.layers.24.2.to_v.weight": 1048576, + "transformer.layers.24.2.to_v.bias": 1024, + "transformer.layers.24.2.to_out.0.weight": 1048576, + "transformer.layers.24.2.to_out.0.bias": 1024, + "transformer.layers.24.3.g": 1024, + "transformer.layers.24.4.ff.0.0.weight": 4194304, + "transformer.layers.24.4.ff.0.0.bias": 4096, + "transformer.layers.24.4.ff.2.weight": 4194304, + "transformer.layers.24.4.ff.2.bias": 1024, + "transformer.layers.25.0.weight": 2097152, + "transformer.layers.25.1.g": 1024, + "transformer.layers.25.2.to_q.weight": 1048576, + "transformer.layers.25.2.to_q.bias": 1024, + "transformer.layers.25.2.to_k.weight": 1048576, + "transformer.layers.25.2.to_k.bias": 1024, + "transformer.layers.25.2.to_v.weight": 1048576, + "transformer.layers.25.2.to_v.bias": 1024, + "transformer.layers.25.2.to_out.0.weight": 1048576, + "transformer.layers.25.2.to_out.0.bias": 1024, + "transformer.layers.25.3.g": 1024, + "transformer.layers.25.4.ff.0.0.weight": 4194304, + "transformer.layers.25.4.ff.0.0.bias": 4096, + "transformer.layers.25.4.ff.2.weight": 4194304, + "transformer.layers.25.4.ff.2.bias": 1024, + "transformer.norm_out.g": 1024, + "transformer.proj_out.weight": 102400, + "transformer.proj_out.bias": 100 + }, + "important_layers": [ + "transformer.time_embed.time_mlp.0.weight", + "transformer.time_embed.time_mlp.2.weight", + "transformer.text_embed.text_embed.weight", + "transformer.input_embed.proj.weight", + "transformer.input_embed.conv_pos_embed.conv1d.0.weight", + "transformer.input_embed.conv_pos_embed.conv1d.2.weight", + "transformer.layers.0.2.to_q.weight", + "transformer.layers.0.2.to_k.weight", + "transformer.layers.0.2.to_v.weight", + "transformer.layers.0.2.to_out.0.weight", + "transformer.layers.0.4.ff.0.0.weight", + "transformer.layers.0.4.ff.2.weight", + "transformer.layers.1.2.to_q.weight", + "transformer.layers.1.2.to_k.weight", + "transformer.layers.1.2.to_v.weight", + "transformer.layers.1.2.to_out.0.weight", + "transformer.layers.1.4.ff.0.0.weight", + "transformer.layers.1.4.ff.2.weight", + "transformer.layers.2.2.to_q.weight", + "transformer.layers.2.2.to_k.weight", + "transformer.layers.2.2.to_v.weight", + "transformer.layers.2.2.to_out.0.weight", + "transformer.layers.2.4.ff.0.0.weight", + "transformer.layers.2.4.ff.2.weight", + "transformer.layers.3.2.to_q.weight", + "transformer.layers.3.2.to_k.weight", + "transformer.layers.3.2.to_v.weight", + "transformer.layers.3.2.to_out.0.weight", + "transformer.layers.3.4.ff.0.0.weight", + "transformer.layers.3.4.ff.2.weight", + "transformer.layers.4.2.to_q.weight", + "transformer.layers.4.2.to_k.weight", + "transformer.layers.4.2.to_v.weight", + "transformer.layers.4.2.to_out.0.weight", + "transformer.layers.4.4.ff.0.0.weight", + "transformer.layers.4.4.ff.2.weight", + "transformer.layers.5.2.to_q.weight", + "transformer.layers.5.2.to_k.weight", + "transformer.layers.5.2.to_v.weight", + "transformer.layers.5.2.to_out.0.weight", + "transformer.layers.5.4.ff.0.0.weight", + "transformer.layers.5.4.ff.2.weight", + "transformer.layers.6.2.to_q.weight", + "transformer.layers.6.2.to_k.weight", + "transformer.layers.6.2.to_v.weight", + "transformer.layers.6.2.to_out.0.weight", + "transformer.layers.6.4.ff.0.0.weight", + "transformer.layers.6.4.ff.2.weight", + "transformer.layers.7.2.to_q.weight", + "transformer.layers.7.2.to_k.weight", + "transformer.layers.7.2.to_v.weight", + "transformer.layers.7.2.to_out.0.weight", + "transformer.layers.7.4.ff.0.0.weight", + "transformer.layers.7.4.ff.2.weight", + "transformer.layers.8.4.ff.0.0.weight", + "transformer.layers.8.4.ff.2.weight", + "transformer.layers.9.4.ff.0.0.weight", + "transformer.layers.9.4.ff.2.weight", + "transformer.layers.10.4.ff.0.0.weight", + "transformer.layers.10.4.ff.2.weight", + "transformer.layers.11.4.ff.0.0.weight", + "transformer.layers.11.4.ff.2.weight", + "transformer.layers.12.4.ff.0.0.weight", + "transformer.layers.12.4.ff.2.weight", + "transformer.layers.13.0.weight", + "transformer.layers.13.4.ff.0.0.weight", + "transformer.layers.13.4.ff.2.weight", + "transformer.layers.14.0.weight", + "transformer.layers.14.4.ff.0.0.weight", + "transformer.layers.14.4.ff.2.weight", + "transformer.layers.15.0.weight", + "transformer.layers.15.4.ff.0.0.weight", + "transformer.layers.15.4.ff.2.weight", + "transformer.layers.16.4.ff.0.0.weight", + "transformer.layers.16.4.ff.2.weight", + "transformer.layers.17.4.ff.0.0.weight", + "transformer.layers.17.4.ff.2.weight", + "transformer.layers.18.4.ff.0.0.weight", + "transformer.layers.18.4.ff.2.weight", + "transformer.layers.19.4.ff.0.0.weight", + "transformer.layers.19.4.ff.2.weight", + "transformer.layers.20.4.ff.0.0.weight", + "transformer.layers.20.4.ff.2.weight", + "transformer.layers.21.4.ff.0.0.weight", + "transformer.layers.21.4.ff.2.weight", + "transformer.layers.22.4.ff.0.0.weight", + "transformer.layers.22.4.ff.2.weight", + "transformer.layers.23.4.ff.0.0.weight", + "transformer.layers.23.4.ff.2.weight", + "transformer.layers.24.4.ff.0.0.weight", + "transformer.layers.24.4.ff.2.weight", + "transformer.layers.25.4.ff.0.0.weight", + "transformer.layers.25.4.ff.2.weight" + ], + "bottleneck_layers": [], + "recommendations": { + "focus_layers": [ + "transformer.time_embed.time_mlp.0.weight", + "transformer.time_embed.time_mlp.2.weight", + "transformer.text_embed.text_embed.weight", + "transformer.input_embed.proj.weight", + "transformer.input_embed.conv_pos_embed.conv1d.0.weight", + "transformer.input_embed.conv_pos_embed.conv1d.2.weight", + "transformer.layers.0.2.to_q.weight", + "transformer.layers.0.2.to_k.weight", + "transformer.layers.0.2.to_v.weight", + "transformer.layers.0.2.to_out.0.weight", + "transformer.layers.0.4.ff.0.0.weight", + "transformer.layers.0.4.ff.2.weight", + "transformer.layers.1.2.to_q.weight", + "transformer.layers.1.2.to_k.weight", + "transformer.layers.1.2.to_v.weight", + "transformer.layers.1.2.to_out.0.weight", + "transformer.layers.1.4.ff.0.0.weight", + "transformer.layers.1.4.ff.2.weight", + "transformer.layers.2.2.to_q.weight", + "transformer.layers.2.2.to_k.weight", + "transformer.layers.2.2.to_v.weight", + "transformer.layers.2.2.to_out.0.weight", + "transformer.layers.2.4.ff.0.0.weight", + "transformer.layers.2.4.ff.2.weight", + "transformer.layers.3.2.to_q.weight", + "transformer.layers.3.2.to_k.weight", + "transformer.layers.3.2.to_v.weight", + "transformer.layers.3.2.to_out.0.weight", + "transformer.layers.3.4.ff.0.0.weight", + "transformer.layers.3.4.ff.2.weight", + "transformer.layers.4.2.to_q.weight", + "transformer.layers.4.2.to_k.weight", + "transformer.layers.4.2.to_v.weight", + "transformer.layers.4.2.to_out.0.weight", + "transformer.layers.4.4.ff.0.0.weight", + "transformer.layers.4.4.ff.2.weight", + "transformer.layers.5.2.to_q.weight", + "transformer.layers.5.2.to_k.weight", + "transformer.layers.5.2.to_v.weight", + "transformer.layers.5.2.to_out.0.weight", + "transformer.layers.5.4.ff.0.0.weight", + "transformer.layers.5.4.ff.2.weight", + "transformer.layers.6.2.to_q.weight", + "transformer.layers.6.2.to_k.weight", + "transformer.layers.6.2.to_v.weight", + "transformer.layers.6.2.to_out.0.weight", + "transformer.layers.6.4.ff.0.0.weight", + "transformer.layers.6.4.ff.2.weight", + "transformer.layers.7.2.to_q.weight", + "transformer.layers.7.2.to_k.weight", + "transformer.layers.7.2.to_v.weight", + "transformer.layers.7.2.to_out.0.weight", + "transformer.layers.7.4.ff.0.0.weight", + "transformer.layers.7.4.ff.2.weight", + "transformer.layers.8.4.ff.0.0.weight", + "transformer.layers.8.4.ff.2.weight", + "transformer.layers.9.4.ff.0.0.weight", + "transformer.layers.9.4.ff.2.weight", + "transformer.layers.10.4.ff.0.0.weight", + "transformer.layers.10.4.ff.2.weight", + "transformer.layers.11.4.ff.0.0.weight", + "transformer.layers.11.4.ff.2.weight", + "transformer.layers.12.4.ff.0.0.weight", + "transformer.layers.12.4.ff.2.weight", + "transformer.layers.13.0.weight", + "transformer.layers.13.4.ff.0.0.weight", + "transformer.layers.13.4.ff.2.weight", + "transformer.layers.14.0.weight", + "transformer.layers.14.4.ff.0.0.weight", + "transformer.layers.14.4.ff.2.weight", + "transformer.layers.15.0.weight", + "transformer.layers.15.4.ff.0.0.weight", + "transformer.layers.15.4.ff.2.weight", + "transformer.layers.16.4.ff.0.0.weight", + "transformer.layers.16.4.ff.2.weight", + "transformer.layers.17.4.ff.0.0.weight", + "transformer.layers.17.4.ff.2.weight", + "transformer.layers.18.4.ff.0.0.weight", + "transformer.layers.18.4.ff.2.weight", + "transformer.layers.19.4.ff.0.0.weight", + "transformer.layers.19.4.ff.2.weight", + "transformer.layers.20.4.ff.0.0.weight", + "transformer.layers.20.4.ff.2.weight", + "transformer.layers.21.4.ff.0.0.weight", + "transformer.layers.21.4.ff.2.weight", + "transformer.layers.22.4.ff.0.0.weight", + "transformer.layers.22.4.ff.2.weight", + "transformer.layers.23.4.ff.0.0.weight", + "transformer.layers.23.4.ff.2.weight", + "transformer.layers.24.4.ff.0.0.weight", + "transformer.layers.24.4.ff.2.weight", + "transformer.layers.25.4.ff.0.0.weight", + "transformer.layers.25.4.ff.2.weight" + ] + }, + "total_parameters": 391, + "total_elements": 360755948, + "param_ranges": { + "transformer.time_embed.time_mlp.0.weight": { + "min": -0.43036678433418274, + "max": 0.2982814610004425, + "mean": -0.0025639168452471495, + "std": 0.04256023094058037, + "sparsity": 0.0, + "shape": [ + 1024, + 256 + ] + }, + "transformer.time_embed.time_mlp.0.bias": { + "min": -0.06307890266180038, + "max": 0.10733882337808609, + "mean": 0.000591748976148665, + "std": 0.034078747034072876, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.time_embed.time_mlp.2.weight": { + "min": -0.41281235218048096, + "max": 0.8368205428123474, + "mean": -0.00020580022828653455, + "std": 0.02411011978983879, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.time_embed.time_mlp.2.bias": { + "min": -0.11508890986442566, + "max": 0.3209010660648346, + "mean": -0.0009312849142588675, + "std": 0.01954229176044464, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.text_embed.text_embed.weight": { + "min": -2.7886247634887695, + "max": 2.8676700592041016, + "mean": -0.0003673843457363546, + "std": 0.6154846549034119, + "sparsity": 0.0, + "shape": [ + 2546, + 100 + ] + }, + "transformer.input_embed.proj.weight": { + "min": -0.27876999974250793, + "max": 0.3816433846950531, + "mean": 0.00041971245082095265, + "std": 0.0427577942609787, + "sparsity": 0.0, + "shape": [ + 1024, + 300 + ] + }, + "transformer.input_embed.proj.bias": { + "min": -0.22179193794727325, + "max": 0.20910178124904633, + "mean": -0.00449436716735363, + "std": 0.0408766008913517, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.input_embed.conv_pos_embed.conv1d.0.weight": { + "min": -0.4284907877445221, + "max": 0.4762955904006958, + "mean": 1.3556076510212733e-06, + "std": 0.024511976167559624, + "sparsity": 0.0, + "shape": [ + 1024, + 64, + 31 + ] + }, + "transformer.input_embed.conv_pos_embed.conv1d.0.bias": { + "min": -0.32450857758522034, + "max": 0.15602749586105347, + "mean": -0.04666242375969887, + "std": 0.05150512233376503, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.input_embed.conv_pos_embed.conv1d.2.weight": { + "min": -0.4105537235736847, + "max": 0.35443225502967834, + "mean": -0.00012739744852297008, + "std": 0.023602385073900223, + "sparsity": 0.0, + "shape": [ + 1024, + 64, + 31 + ] + }, + "transformer.input_embed.conv_pos_embed.conv1d.2.bias": { + "min": -0.22917909920215607, + "max": 0.2621273994445801, + "mean": -0.029117178171873093, + "std": 0.049283698201179504, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.0.1.g": { + "min": 0.2544216215610504, + "max": 0.8185670971870422, + "mean": 0.5252723693847656, + "std": 0.08049405366182327, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.0.2.to_q.weight": { + "min": -0.2967362403869629, + "max": 0.26540544629096985, + "mean": -0.0004257934633642435, + "std": 0.032104942947626114, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.0.2.to_q.bias": { + "min": -0.09282971918582916, + "max": 0.12431935220956802, + "mean": 0.000645699561573565, + "std": 0.02571764960885048, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.0.2.to_k.weight": { + "min": -0.2909117043018341, + "max": 0.28097161650657654, + "mean": -7.593112241011113e-05, + "std": 0.030932165682315826, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.0.2.to_k.bias": { + "min": -5.890472888946533, + "max": 5.805418491363525, + "mean": -0.009322225116193295, + "std": 1.2942466735839844, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.0.2.to_v.weight": { + "min": -0.42496761679649353, + "max": 0.3436029851436615, + "mean": 9.743953705765307e-05, + "std": 0.029953880235552788, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.0.2.to_v.bias": { + "min": -0.028933702036738396, + "max": 0.027695059776306152, + "mean": -0.00032178848050534725, + "std": 0.012570273131132126, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.0.2.to_out.0.weight": { + "min": -0.45337191224098206, + "max": 0.44843629002571106, + "mean": 2.4102073439280502e-05, + "std": 0.023851700127124786, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.0.2.to_out.0.bias": { + "min": -0.08852554112672806, + "max": 0.09096554666757584, + "mean": 0.0022833123803138733, + "std": 0.01949877291917801, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.0.3.g": { + "min": 0.2666127681732178, + "max": 1.0543620586395264, + "mean": 0.5309467911720276, + "std": 0.10404026508331299, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.0.4.ff.0.0.weight": { + "min": -0.5751341581344604, + "max": 0.6088229417800903, + "mean": -0.0004320710140746087, + "std": 0.0386008694767952, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.0.4.ff.0.0.bias": { + "min": -0.18247970938682556, + "max": 0.04547928646206856, + "mean": -0.029448386281728745, + "std": 0.04255641624331474, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.0.4.ff.2.weight": { + "min": -1.166790246963501, + "max": 1.6334140300750732, + "mean": 0.00032607169123366475, + "std": 0.02769557386636734, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.0.4.ff.2.bias": { + "min": -0.16213519871234894, + "max": 0.2053978145122528, + "mean": -0.021131210029125214, + "std": 0.02792428247630596, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.1.1.g": { + "min": 0.22390854358673096, + "max": 0.8422228693962097, + "mean": 0.4874723255634308, + "std": 0.0749419778585434, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.1.2.to_q.weight": { + "min": -0.2551497519016266, + "max": 0.3057706952095032, + "mean": -7.631031621713191e-06, + "std": 0.03347672149538994, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.1.2.to_q.bias": { + "min": -0.09526324272155762, + "max": 0.11054196208715439, + "mean": 5.9016994782723486e-05, + "std": 0.026952214539051056, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.1.2.to_k.weight": { + "min": -0.29700320959091187, + "max": 0.29560279846191406, + "mean": 5.1945076847914606e-05, + "std": 0.03254617378115654, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.1.2.to_k.bias": { + "min": -5.157034397125244, + "max": 5.077272891998291, + "mean": -0.014557666145265102, + "std": 1.1561598777770996, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.1.2.to_v.weight": { + "min": -0.34469008445739746, + "max": 0.3430800437927246, + "mean": 7.922034274088219e-05, + "std": 0.03006283938884735, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.1.2.to_v.bias": { + "min": -0.03611171245574951, + "max": 0.03316429257392883, + "mean": -0.00014332182763610035, + "std": 0.013021831400692463, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.1.2.to_out.0.weight": { + "min": -0.3155629634857178, + "max": 0.3745230734348297, + "mean": -2.0780769773409702e-05, + "std": 0.024060120806097984, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.1.2.to_out.0.bias": { + "min": -0.10523121803998947, + "max": 0.12181323021650314, + "mean": -0.0019697900861501694, + "std": 0.028833730146288872, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.1.3.g": { + "min": 0.31127864122390747, + "max": 1.118981957435608, + "mean": 0.6661038398742676, + "std": 0.09739536792039871, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.1.4.ff.0.0.weight": { + "min": -0.8734181523323059, + "max": 0.6272271275520325, + "mean": 0.0016762978630140424, + "std": 0.04744264855980873, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.1.4.ff.0.0.bias": { + "min": -0.27110713720321655, + "max": 0.03433133661746979, + "mean": -0.04661067947745323, + "std": 0.04056624323129654, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.1.4.ff.2.weight": { + "min": -0.9215274453163147, + "max": 0.9644713997840881, + "mean": 0.0010202918201684952, + "std": 0.0407060943543911, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.1.4.ff.2.bias": { + "min": -0.1444365382194519, + "max": 0.07489711046218872, + "mean": -0.00908645335584879, + "std": 0.02568359486758709, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.2.1.g": { + "min": 0.23954921960830688, + "max": 0.7114554047584534, + "mean": 0.44711926579475403, + "std": 0.059072595089673996, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.2.2.to_q.weight": { + "min": -0.27211347222328186, + "max": 0.29757410287857056, + "mean": 9.160639820038341e-06, + "std": 0.03547541797161102, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.2.2.to_q.bias": { + "min": -0.11930356919765472, + "max": 0.1185561791062355, + "mean": 0.0007570894667878747, + "std": 0.027588583528995514, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.2.2.to_k.weight": { + "min": -0.2805509567260742, + "max": 0.2793390452861786, + "mean": -7.711815123911947e-05, + "std": 0.03510286659002304, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.2.2.to_k.bias": { + "min": -2.5059573650360107, + "max": 2.5179529190063477, + "mean": 0.02672126702964306, + "std": 0.5862834453582764, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.2.2.to_v.weight": { + "min": -0.22094596922397614, + "max": 0.27129310369491577, + "mean": 2.4950504666776396e-06, + "std": 0.030734829604625702, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.2.2.to_v.bias": { + "min": -0.03352592885494232, + "max": 0.03140881285071373, + "mean": 0.00011744203220587224, + "std": 0.012399573810398579, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.2.2.to_out.0.weight": { + "min": -0.23510752618312836, + "max": 0.23160243034362793, + "mean": 5.7065204600803554e-05, + "std": 0.02570049650967121, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.2.2.to_out.0.bias": { + "min": -0.13554446399211884, + "max": 0.1277279406785965, + "mean": -0.005496564321219921, + "std": 0.039924751967191696, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.2.3.g": { + "min": 0.3543228507041931, + "max": 1.169933795928955, + "mean": 0.7103918194770813, + "std": 0.10339365899562836, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.2.4.ff.0.0.weight": { + "min": -0.6172032952308655, + "max": 0.5551565885543823, + "mean": 0.0011604262981563807, + "std": 0.04612047225236893, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.2.4.ff.0.0.bias": { + "min": -0.18880973756313324, + "max": 0.02472936362028122, + "mean": -0.034827686846256256, + "std": 0.028596267104148865, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.2.4.ff.2.weight": { + "min": -1.1306864023208618, + "max": 0.9699204564094543, + "mean": 0.00035697812563739717, + "std": 0.0423479862511158, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.2.4.ff.2.bias": { + "min": -0.5971187949180603, + "max": 0.06284646689891815, + "mean": -0.00487535959109664, + "std": 0.028591454029083252, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.3.1.g": { + "min": 0.37525925040245056, + "max": 0.938994288444519, + "mean": 0.5923536419868469, + "std": 0.06656986474990845, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.3.2.to_q.weight": { + "min": -0.3912387192249298, + "max": 0.3688672184944153, + "mean": 7.05350175849162e-05, + "std": 0.03718964010477066, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.3.2.to_q.bias": { + "min": -0.11892075091600418, + "max": 0.13641902804374695, + "mean": 0.0009228037670254707, + "std": 0.029190916568040848, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.3.2.to_k.weight": { + "min": -0.6183786392211914, + "max": 0.5081523060798645, + "mean": 1.5137170521484222e-05, + "std": 0.036442697048187256, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.3.2.to_k.bias": { + "min": -8.175475120544434, + "max": 8.77673053741455, + "mean": -0.10916879773139954, + "std": 1.6969348192214966, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.3.2.to_v.weight": { + "min": -0.27656111121177673, + "max": 0.23974747955799103, + "mean": 5.267578671919182e-05, + "std": 0.03261591121554375, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.3.2.to_v.bias": { + "min": -0.051889754831790924, + "max": 0.03952917456626892, + "mean": 9.714082989376038e-05, + "std": 0.012956415303051472, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.3.2.to_out.0.weight": { + "min": -0.23051224648952484, + "max": 0.23422203958034515, + "mean": -2.1783589545520954e-05, + "std": 0.029392505064606667, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.3.2.to_out.0.bias": { + "min": -0.20386114716529846, + "max": 0.105349101126194, + "mean": -0.004017278086394072, + "std": 0.032608963549137115, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.3.3.g": { + "min": 0.3398902118206024, + "max": 1.0104986429214478, + "mean": 0.7006295919418335, + "std": 0.09645849466323853, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.3.4.ff.0.0.weight": { + "min": -0.5644850134849548, + "max": 0.8330016136169434, + "mean": 0.0004154921043664217, + "std": 0.04230193421244621, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.3.4.ff.0.0.bias": { + "min": -0.21176111698150635, + "max": 0.030274739488959312, + "mean": -0.03216158226132393, + "std": 0.02647627517580986, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.3.4.ff.2.weight": { + "min": -0.7536418437957764, + "max": 0.7178125381469727, + "mean": -1.392904141539475e-05, + "std": 0.03684176877140999, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.3.4.ff.2.bias": { + "min": -0.2630210220813751, + "max": 0.10589547455310822, + "mean": -0.0030209862161427736, + "std": 0.028848819434642792, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.4.1.g": { + "min": 0.2840619385242462, + "max": 0.6940633654594421, + "mean": 0.4993802607059479, + "std": 0.04630398005247116, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.4.2.to_q.weight": { + "min": -0.27834540605545044, + "max": 0.23377880454063416, + "mean": -0.00011083983554271981, + "std": 0.03876272216439247, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.4.2.to_q.bias": { + "min": -0.15375865995883942, + "max": 0.12639263272285461, + "mean": -0.002223189687356353, + "std": 0.03333896026015282, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.4.2.to_k.weight": { + "min": -0.413473516702652, + "max": 0.6594987511634827, + "mean": -1.9574425095925108e-05, + "std": 0.039102163165807724, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.4.2.to_k.bias": { + "min": -4.232054233551025, + "max": 4.715608596801758, + "mean": -0.020489608868956566, + "std": 1.0068248510360718, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.4.2.to_v.weight": { + "min": -0.24494825303554535, + "max": 0.20708487927913666, + "mean": 4.434686343302019e-05, + "std": 0.03396739438176155, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.4.2.to_v.bias": { + "min": -0.034493304789066315, + "max": 0.04486649110913277, + "mean": -2.654863055795431e-05, + "std": 0.012638254091143608, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.4.2.to_out.0.weight": { + "min": -0.2005356252193451, + "max": 0.2055814564228058, + "mean": -3.0033888833713718e-05, + "std": 0.031025094911456108, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.4.2.to_out.0.bias": { + "min": -0.19959698617458344, + "max": 0.11300574988126755, + "mean": -0.002902751788496971, + "std": 0.03449735790491104, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.4.3.g": { + "min": 0.3668424189090729, + "max": 1.05502188205719, + "mean": 0.6704874634742737, + "std": 0.06617505103349686, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.4.4.ff.0.0.weight": { + "min": -0.3976363241672516, + "max": 0.5017815828323364, + "mean": -3.87727704946883e-05, + "std": 0.041137050837278366, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.4.4.ff.0.0.bias": { + "min": -0.12772123515605927, + "max": 0.026762252673506737, + "mean": -0.03051420859992504, + "std": 0.021863147616386414, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.4.4.ff.2.weight": { + "min": -0.44920089840888977, + "max": 0.4333121180534363, + "mean": 7.599063974339515e-05, + "std": 0.034896738827228546, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.4.4.ff.2.bias": { + "min": -0.2671979069709778, + "max": 0.07298687100410461, + "mean": -0.0010975392069667578, + "std": 0.023116325959563255, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.5.1.g": { + "min": 0.28697913885116577, + "max": 0.6839067339897156, + "mean": 0.5244333744049072, + "std": 0.047293804585933685, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.5.2.to_q.weight": { + "min": -0.22255805134773254, + "max": 0.22290681302547455, + "mean": 1.621080627955962e-05, + "std": 0.03895403817296028, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.5.2.to_q.bias": { + "min": -0.13629747927188873, + "max": 0.109336718916893, + "mean": 0.0002461877593304962, + "std": 0.02917083166539669, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.5.2.to_k.weight": { + "min": -0.3738900125026703, + "max": 0.43744465708732605, + "mean": -9.668656275607646e-06, + "std": 0.03929208964109421, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.5.2.to_k.bias": { + "min": -3.840332269668579, + "max": 4.992400646209717, + "mean": 0.009748304262757301, + "std": 0.8444803953170776, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.5.2.to_v.weight": { + "min": -0.22292070090770721, + "max": 0.21977820992469788, + "mean": -4.448638719622977e-07, + "std": 0.03441440686583519, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.5.2.to_v.bias": { + "min": -0.04357949644327164, + "max": 0.03590534254908562, + "mean": -0.000258232990745455, + "std": 0.012078864499926567, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.5.2.to_out.0.weight": { + "min": -0.21297886967658997, + "max": 0.18814441561698914, + "mean": -1.71422834682744e-05, + "std": 0.031540658324956894, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.5.2.to_out.0.bias": { + "min": -0.1805071383714676, + "max": 0.12073972076177597, + "mean": -0.00239769509062171, + "std": 0.04125608131289482, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.5.3.g": { + "min": 0.4227307438850403, + "max": 0.9400621056556702, + "mean": 0.662601888179779, + "std": 0.056538671255111694, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.5.4.ff.0.0.weight": { + "min": -0.37151503562927246, + "max": 0.4761146008968353, + "mean": -8.195374539354816e-05, + "std": 0.040896203368902206, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.5.4.ff.0.0.bias": { + "min": -0.20797580480575562, + "max": 0.027151037007570267, + "mean": -0.030222713947296143, + "std": 0.021336952224373817, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.5.4.ff.2.weight": { + "min": -0.33968234062194824, + "max": 0.7333835959434509, + "mean": 8.077031816355884e-05, + "std": 0.034772153943777084, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.5.4.ff.2.bias": { + "min": -0.23987196385860443, + "max": 0.05037139728665352, + "mean": -0.0011877692304551601, + "std": 0.020454443991184235, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.6.1.g": { + "min": 0.30607396364212036, + "max": 0.652435839176178, + "mean": 0.5250428915023804, + "std": 0.04590361937880516, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.6.2.to_q.weight": { + "min": -0.3039066791534424, + "max": 0.21754606068134308, + "mean": 7.030011329334229e-05, + "std": 0.03950100764632225, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.6.2.to_q.bias": { + "min": -0.14914348721504211, + "max": 0.13110090792179108, + "mean": 0.00035085732815787196, + "std": 0.030418941751122475, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.6.2.to_k.weight": { + "min": -0.2568054795265198, + "max": 0.20193904638290405, + "mean": 3.147923416690901e-05, + "std": 0.03949080780148506, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.6.2.to_k.bias": { + "min": -2.3329901695251465, + "max": 2.3725619316101074, + "mean": -0.02622254565358162, + "std": 0.4494195282459259, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.6.2.to_v.weight": { + "min": -0.18853308260440826, + "max": 0.2103482335805893, + "mean": 3.745816502487287e-05, + "std": 0.03479913994669914, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.6.2.to_v.bias": { + "min": -0.03156094253063202, + "max": 0.035385265946388245, + "mean": -0.0001973491598619148, + "std": 0.012292337603867054, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.6.2.to_out.0.weight": { + "min": -0.1882481426000595, + "max": 0.17012155055999756, + "mean": -6.810311606386676e-05, + "std": 0.03217574581503868, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.6.2.to_out.0.bias": { + "min": -0.13900111615657806, + "max": 0.13692621886730194, + "mean": -0.002514890395104885, + "std": 0.051281191408634186, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.6.3.g": { + "min": 0.46707433462142944, + "max": 0.9541991353034973, + "mean": 0.6688030958175659, + "std": 0.052486222237348557, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.6.4.ff.0.0.weight": { + "min": -0.32425403594970703, + "max": 0.30980852246284485, + "mean": -1.290425643674098e-06, + "std": 0.040951915085315704, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.6.4.ff.0.0.bias": { + "min": -0.12465585768222809, + "max": 0.02537902072072029, + "mean": -0.030681122094392776, + "std": 0.0198006983846426, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.6.4.ff.2.weight": { + "min": -0.43958571553230286, + "max": 0.44490763545036316, + "mean": 9.539163875160739e-05, + "std": 0.0351250097155571, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.6.4.ff.2.bias": { + "min": -0.2243558019399643, + "max": 0.0517578125, + "mean": -0.0011802279623225331, + "std": 0.018464019522070885, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.7.1.g": { + "min": 0.33896756172180176, + "max": 0.7381694912910461, + "mean": 0.5586157441139221, + "std": 0.04119841381907463, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.7.2.to_q.weight": { + "min": -0.27227938175201416, + "max": 0.27836883068084717, + "mean": 1.999387313844636e-05, + "std": 0.041062600910663605, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.7.2.to_q.bias": { + "min": -0.13660800457000732, + "max": 0.1392778903245926, + "mean": 0.0004841584013774991, + "std": 0.02658114954829216, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.7.2.to_k.weight": { + "min": -0.4896349310874939, + "max": 0.3551800847053528, + "mean": 8.872073522070423e-05, + "std": 0.04069973900914192, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.7.2.to_k.bias": { + "min": -2.293769121170044, + "max": 1.742555856704712, + "mean": -0.02106180600821972, + "std": 0.49974092841148376, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.7.2.to_v.weight": { + "min": -0.2175416797399521, + "max": 0.19781090319156647, + "mean": -4.052485746797174e-05, + "std": 0.03423763066530228, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.7.2.to_v.bias": { + "min": -0.04145532101392746, + "max": 0.038727227598428726, + "mean": -0.00013765225594397634, + "std": 0.012874336913228035, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.7.2.to_out.0.weight": { + "min": -0.177314892411232, + "max": 0.1832207590341568, + "mean": 4.75629567517899e-05, + "std": 0.03156043216586113, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.7.2.to_out.0.bias": { + "min": -0.1798381805419922, + "max": 0.18348462879657745, + "mean": -0.002212759107351303, + "std": 0.054820165038108826, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.7.3.g": { + "min": 0.4742435812950134, + "max": 1.0238897800445557, + "mean": 0.6451865434646606, + "std": 0.05008064582943916, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.7.4.ff.0.0.weight": { + "min": -0.2714613080024719, + "max": 0.3092961311340332, + "mean": 0.00011265614011790603, + "std": 0.04068758338689804, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.7.4.ff.0.0.bias": { + "min": -0.1055683121085167, + "max": 0.026772309094667435, + "mean": -0.029506118968129158, + "std": 0.017915068194270134, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.7.4.ff.2.weight": { + "min": -0.33880147337913513, + "max": 0.3287900686264038, + "mean": 5.556903124670498e-05, + "std": 0.03441847860813141, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.7.4.ff.2.bias": { + "min": -0.18144568800926208, + "max": 0.04239530488848686, + "mean": -0.001068950048647821, + "std": 0.017201630398631096, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.8.1.g": { + "min": 0.3253972828388214, + "max": 0.68559730052948, + "mean": 0.5111000537872314, + "std": 0.03672371804714203, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.8.2.to_q.weight": { + "min": -0.23373860120773315, + "max": 0.22572296857833862, + "mean": -3.580976772354916e-05, + "std": 0.039181455969810486, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.8.2.to_q.bias": { + "min": -0.11530666053295135, + "max": 0.1317266821861267, + "mean": 0.00015847355825826526, + "std": 0.029152128845453262, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.8.2.to_k.weight": { + "min": -0.3521575629711151, + "max": 0.2847552001476288, + "mean": 7.120977898011915e-06, + "std": 0.039250005036592484, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.8.2.to_k.bias": { + "min": -4.126590728759766, + "max": 3.538623332977295, + "mean": -0.01155401673167944, + "std": 0.6819069385528564, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.8.2.to_v.weight": { + "min": -0.21105211973190308, + "max": 0.20891818404197693, + "mean": 3.4748343750834465e-05, + "std": 0.03448968380689621, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.8.2.to_v.bias": { + "min": -0.03559347987174988, + "max": 0.04803197458386421, + "mean": 0.0007964627584442496, + "std": 0.012855397537350655, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.8.2.to_out.0.weight": { + "min": -0.21038679778575897, + "max": 0.1929050087928772, + "mean": -1.3255728390504373e-06, + "std": 0.0317002572119236, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.8.2.to_out.0.bias": { + "min": -0.18651214241981506, + "max": 0.17674075067043304, + "mean": -0.002840832807123661, + "std": 0.05859901383519173, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.8.3.g": { + "min": 0.4748058021068573, + "max": 1.0396208763122559, + "mean": 0.6513342261314392, + "std": 0.049332328140735626, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.8.4.ff.0.0.weight": { + "min": -0.2482759803533554, + "max": 0.3290877640247345, + "mean": 0.00018071771773975343, + "std": 0.04057670012116432, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.8.4.ff.0.0.bias": { + "min": -0.12517917156219482, + "max": 0.02484654076397419, + "mean": -0.030485937371850014, + "std": 0.017585651949048042, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.8.4.ff.2.weight": { + "min": -0.42004328966140747, + "max": 0.48050060868263245, + "mean": -1.1724823707481846e-06, + "std": 0.03540315851569176, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.8.4.ff.2.bias": { + "min": -0.15136678516864777, + "max": 0.04356072470545769, + "mean": 4.775111301569268e-05, + "std": 0.014870403334498405, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.9.1.g": { + "min": 0.3155934810638428, + "max": 0.6807596683502197, + "mean": 0.5528346300125122, + "std": 0.04051977023482323, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.9.2.to_q.weight": { + "min": -0.2063884735107422, + "max": 0.21910899877548218, + "mean": 3.103859489783645e-05, + "std": 0.038303472101688385, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.9.2.to_q.bias": { + "min": -0.13769029080867767, + "max": 0.1125277578830719, + "mean": 1.9220009562559426e-05, + "std": 0.02578623965382576, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.9.2.to_k.weight": { + "min": -0.40236374735832214, + "max": 0.37038296461105347, + "mean": 2.613713513710536e-05, + "std": 0.03818493336439133, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.9.2.to_k.bias": { + "min": -3.7654759883880615, + "max": 2.864607572555542, + "mean": 0.0011372193694114685, + "std": 0.51633220911026, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.9.2.to_v.weight": { + "min": -0.20273104310035706, + "max": 0.1974526047706604, + "mean": 2.9206170438556e-05, + "std": 0.034301165491342545, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.9.2.to_v.bias": { + "min": -0.05080447345972061, + "max": 0.0398997887969017, + "mean": -0.00042000875691883266, + "std": 0.013411123305559158, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.9.2.to_out.0.weight": { + "min": -0.19611378014087677, + "max": 0.20161780714988708, + "mean": -1.2710506780422293e-05, + "std": 0.03180883079767227, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.9.2.to_out.0.bias": { + "min": -0.19288454949855804, + "max": 0.1946749985218048, + "mean": -0.002961306367069483, + "std": 0.06252170354127884, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.9.3.g": { + "min": 0.3495900332927704, + "max": 1.0818731784820557, + "mean": 0.6670873165130615, + "std": 0.054898131638765335, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.9.4.ff.0.0.weight": { + "min": -0.22497375309467316, + "max": 0.25112366676330566, + "mean": 0.00035900043440051377, + "std": 0.04076608642935753, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.9.4.ff.0.0.bias": { + "min": -0.09095952659845352, + "max": 0.0440162755548954, + "mean": -0.030070394277572632, + "std": 0.017598489299416542, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.9.4.ff.2.weight": { + "min": -0.35297849774360657, + "max": 0.3037008047103882, + "mean": -4.511567021836527e-05, + "std": 0.03712863847613335, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.9.4.ff.2.bias": { + "min": -0.1615627110004425, + "max": 0.06344226002693176, + "mean": -7.402076153084636e-05, + "std": 0.019400237128138542, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.10.1.g": { + "min": 0.3484867811203003, + "max": 0.7205584049224854, + "mean": 0.5422928333282471, + "std": 0.03884059190750122, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.10.2.to_q.weight": { + "min": -0.21938610076904297, + "max": 0.223092183470726, + "mean": -1.1128584446851164e-05, + "std": 0.0392366424202919, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.10.2.to_q.bias": { + "min": -0.11822070181369781, + "max": 0.1703757494688034, + "mean": 0.0002712813438847661, + "std": 0.025094762444496155, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.10.2.to_k.weight": { + "min": -0.2461908757686615, + "max": 0.3006460666656494, + "mean": -3.654139436548576e-05, + "std": 0.03893598914146423, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.10.2.to_k.bias": { + "min": -3.499889850616455, + "max": 3.708961009979248, + "mean": 0.01583799161016941, + "std": 0.781475305557251, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.10.2.to_v.weight": { + "min": -0.21841062605381012, + "max": 0.23724044859409332, + "mean": -1.4060610737942625e-05, + "std": 0.03630809485912323, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.10.2.to_v.bias": { + "min": -0.04710822552442551, + "max": 0.05138855054974556, + "mean": 0.00048449443420395255, + "std": 0.013518092222511768, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.10.2.to_out.0.weight": { + "min": -0.21374864876270294, + "max": 0.2171718180179596, + "mean": 5.6465847592335194e-05, + "std": 0.03361979499459267, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.10.2.to_out.0.bias": { + "min": -0.21103325486183167, + "max": 0.2311553806066513, + "mean": -0.005100366659462452, + "std": 0.06185431033372879, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.10.3.g": { + "min": 0.36209091544151306, + "max": 1.0989015102386475, + "mean": 0.6992126703262329, + "std": 0.053264226764440536, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.10.4.ff.0.0.weight": { + "min": -0.23423242568969727, + "max": 0.24471710622310638, + "mean": 0.00046349214971996844, + "std": 0.04127512127161026, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.10.4.ff.0.0.bias": { + "min": -0.09780248254537582, + "max": 0.06824193894863129, + "mean": -0.031424038112163544, + "std": 0.018106156960129738, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.10.4.ff.2.weight": { + "min": -0.301416277885437, + "max": 0.35142549872398376, + "mean": -8.288547542179003e-05, + "std": 0.04028111323714256, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.10.4.ff.2.bias": { + "min": -0.15196339786052704, + "max": 0.14944323897361755, + "mean": 0.0002634537231642753, + "std": 0.023027226328849792, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.11.1.g": { + "min": 1.0, + "max": 1.0, + "mean": 1.0, + "std": 0.0, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.11.2.to_q.weight": { + "min": -0.031249936670064926, + "max": 0.031249839812517166, + "mean": -1.9292721844976768e-05, + "std": 0.01804409734904766, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.11.2.to_q.bias": { + "min": -0.031226642429828644, + "max": 0.03100142627954483, + "mean": -0.0010842883493751287, + "std": 0.01795371063053608, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.11.2.to_k.weight": { + "min": -0.031249966472387314, + "max": 0.031249895691871643, + "mean": 3.5441100862954045e-06, + "std": 0.018044503405690193, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.11.2.to_k.bias": { + "min": -0.031156372278928757, + "max": 0.031184475868940353, + "mean": 0.0003338930255267769, + "std": 0.018065759912133217, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.11.2.to_v.weight": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.11.2.to_v.bias": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.11.2.to_out.0.weight": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.11.2.to_out.0.bias": { + "min": -0.0003838505072053522, + "max": 0.00040078736492432654, + "mean": 7.502898370148614e-06, + "std": 0.00012165026419097558, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.11.3.g": { + "min": 0.9996746778488159, + "max": 1.0017435550689697, + "mean": 1.0005855560302734, + "std": 0.0003091032849624753, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.11.4.ff.0.0.weight": { + "min": -0.03248094022274017, + "max": 0.03274688497185707, + "mean": -1.2105063433409669e-05, + "std": 0.01805892214179039, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.11.4.ff.0.0.bias": { + "min": -0.031171226873993874, + "max": 0.03214619308710098, + "mean": 0.0004906345857307315, + "std": 0.017989112064242363, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.11.4.ff.2.weight": { + "min": -0.0009105296921916306, + "max": 0.001230148016475141, + "mean": 2.7432847673480865e-06, + "std": 0.0001725118636386469, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.11.4.ff.2.bias": { + "min": -0.00036263937363401055, + "max": 0.00041731935925781727, + "mean": 7.396344699373003e-06, + "std": 0.00011976793757639825, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.12.1.g": { + "min": 0.38287991285324097, + "max": 0.7182613015174866, + "mean": 0.5806185603141785, + "std": 0.03863256797194481, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.12.2.to_q.weight": { + "min": -0.23785854876041412, + "max": 0.19614756107330322, + "mean": 2.640879392856732e-05, + "std": 0.037470731884241104, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.12.2.to_q.bias": { + "min": -0.11855358630418777, + "max": 0.16578993201255798, + "mean": 0.0009884096216410398, + "std": 0.027530910447239876, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.12.2.to_k.weight": { + "min": -0.2458752989768982, + "max": 0.500349223613739, + "mean": -5.065255027147941e-05, + "std": 0.03762831538915634, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.12.2.to_k.bias": { + "min": -3.936182975769043, + "max": 3.763556957244873, + "mean": -0.003569458145648241, + "std": 0.6807414293289185, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.12.2.to_v.weight": { + "min": -0.22705353796482086, + "max": 0.251341313123703, + "mean": -1.142405926657375e-05, + "std": 0.03743990138173103, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.12.2.to_v.bias": { + "min": -0.07149660587310791, + "max": 0.08067727833986282, + "mean": -0.0005162369925528765, + "std": 0.015656527131795883, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.12.2.to_out.0.weight": { + "min": -0.22786642611026764, + "max": 0.2578106224536896, + "mean": -2.8714632207993418e-05, + "std": 0.035426877439022064, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.12.2.to_out.0.bias": { + "min": -0.20022797584533691, + "max": 0.21474605798721313, + "mean": -0.005530310794711113, + "std": 0.0683104544878006, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.12.3.g": { + "min": 0.4048909544944763, + "max": 1.1872107982635498, + "mean": 0.7378276586532593, + "std": 0.05486491322517395, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.12.4.ff.0.0.weight": { + "min": -0.22101044654846191, + "max": 0.2458520382642746, + "mean": 0.0005211633397266269, + "std": 0.04134228080511093, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.12.4.ff.0.0.bias": { + "min": -0.10363762825727463, + "max": 0.023918237537145615, + "mean": -0.03266144543886185, + "std": 0.018866004422307014, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.12.4.ff.2.weight": { + "min": -0.4487850069999695, + "max": 0.42181524634361267, + "mean": -0.00043266150169074535, + "std": 0.04690360650420189, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.12.4.ff.2.bias": { + "min": -0.25105422735214233, + "max": 0.46941903233528137, + "mean": 0.003198462538421154, + "std": 0.044503308832645416, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.13.0.weight": { + "min": -0.3172111511230469, + "max": 0.33329516649246216, + "mean": -2.550867066020146e-05, + "std": 0.021290993317961693, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.13.1.g": { + "min": 0.32461607456207275, + "max": 0.6840938329696655, + "mean": 0.5709556341171265, + "std": 0.04454263672232628, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.13.2.to_q.weight": { + "min": -0.16456733644008636, + "max": 0.17394505441188812, + "mean": -4.8416688514407724e-05, + "std": 0.03318499028682709, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.13.2.to_q.bias": { + "min": -0.1864674687385559, + "max": 0.14258594810962677, + "mean": 3.8281112210825086e-05, + "std": 0.029655346646904945, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.13.2.to_k.weight": { + "min": -0.3803539276123047, + "max": 0.2457817941904068, + "mean": -1.002950102702016e-05, + "std": 0.032765936106443405, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.13.2.to_k.bias": { + "min": -3.6502115726470947, + "max": 3.285125494003296, + "mean": -0.014261167496442795, + "std": 0.9845166206359863, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.13.2.to_v.weight": { + "min": -0.23496489226818085, + "max": 0.24718151986598969, + "mean": -1.8079399524140172e-05, + "std": 0.041703000664711, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.13.2.to_v.bias": { + "min": -0.07261228561401367, + "max": 0.15409623086452484, + "mean": 0.0006618116749450564, + "std": 0.02513669617474079, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.13.2.to_out.0.weight": { + "min": -0.26620712876319885, + "max": 0.24820521473884583, + "mean": -1.5344019629992545e-05, + "std": 0.04014336317777634, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.13.2.to_out.0.bias": { + "min": -0.18921570479869843, + "max": 0.19427257776260376, + "mean": -0.0012257307535037398, + "std": 0.0666433721780777, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.13.3.g": { + "min": 0.32903727889060974, + "max": 0.9973482489585876, + "mean": 0.7190757393836975, + "std": 0.051972683519124985, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.13.4.ff.0.0.weight": { + "min": -0.23141932487487793, + "max": 0.24504587054252625, + "mean": 0.0001826788648031652, + "std": 0.04090685769915581, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.13.4.ff.0.0.bias": { + "min": -0.11396601796150208, + "max": 0.01875537633895874, + "mean": -0.04246020317077637, + "std": 0.018833719193935394, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.13.4.ff.2.weight": { + "min": -0.38934653997421265, + "max": 0.4067343473434448, + "mean": -2.1657660909113474e-05, + "std": 0.04854125902056694, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.13.4.ff.2.bias": { + "min": -0.6919497847557068, + "max": 0.411848247051239, + "mean": 0.0008590769721195102, + "std": 0.06023983284831047, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.14.0.weight": { + "min": -0.000941734469961375, + "max": 1.0006029605865479, + "mean": 0.00048819385119713843, + "std": 0.02209211327135563, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.14.1.g": { + "min": 1.0, + "max": 1.0, + "mean": 1.0, + "std": 0.0, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.14.2.to_q.weight": { + "min": -0.031249970197677612, + "max": 0.031249817460775375, + "mean": -2.1022657165303826e-05, + "std": 0.018035436049103737, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.14.2.to_q.bias": { + "min": -0.03122086077928543, + "max": 0.031233571469783783, + "mean": -0.0006771883927285671, + "std": 0.01782997138798237, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.14.2.to_k.weight": { + "min": -0.03124987706542015, + "max": 0.031249921768903732, + "mean": -8.839062502374873e-06, + "std": 0.01803446188569069, + "sparsity": 9.5367431640625e-07, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.14.2.to_k.bias": { + "min": -0.031232360750436783, + "max": 0.031245984137058258, + "mean": -0.0007298353011719882, + "std": 0.017944591119885445, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.14.2.to_v.weight": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.14.2.to_v.bias": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.14.2.to_out.0.weight": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.14.2.to_out.0.bias": { + "min": -0.0003224269312340766, + "max": 0.0002993023081216961, + "mean": 6.5217936935368925e-06, + "std": 0.0001044638265739195, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.14.3.g": { + "min": 0.9996813535690308, + "max": 1.0015599727630615, + "mean": 1.000339150428772, + "std": 0.0002295201556989923, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.14.4.ff.0.0.weight": { + "min": -0.032516807317733765, + "max": 0.03226118162274361, + "mean": 4.161014203418745e-06, + "std": 0.018049873411655426, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.14.4.ff.0.0.bias": { + "min": -0.031123636290431023, + "max": 0.03165753185749054, + "mean": 0.0003850722569040954, + "std": 0.018070610240101814, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.14.4.ff.2.weight": { + "min": -0.0009010994690470397, + "max": 0.0009490308002568781, + "mean": 2.8105064302508254e-06, + "std": 0.00016459461767226458, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.14.4.ff.2.bias": { + "min": -0.00032089874730445445, + "max": 0.00031345486058853567, + "mean": 6.42746908852132e-06, + "std": 0.00010272208601236343, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.15.0.weight": { + "min": -0.23485393822193146, + "max": 0.27267447113990784, + "mean": 6.709969511575764e-06, + "std": 0.018812596797943115, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.15.1.g": { + "min": 0.32135409116744995, + "max": 0.6922963857650757, + "mean": 0.5815727710723877, + "std": 0.045748595148324966, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.15.2.to_q.weight": { + "min": -0.1818080097436905, + "max": 0.19750945270061493, + "mean": -1.1748516044463031e-05, + "std": 0.03318887948989868, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.15.2.to_q.bias": { + "min": -0.16036057472229004, + "max": 0.12932586669921875, + "mean": -0.0010664488654583693, + "std": 0.03411008045077324, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.15.2.to_k.weight": { + "min": -0.33175674080848694, + "max": 0.31088003516197205, + "mean": -1.0311603546142578e-05, + "std": 0.0322394073009491, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.15.2.to_k.bias": { + "min": -7.791174411773682, + "max": 8.749550819396973, + "mean": 0.09336872398853302, + "std": 1.6178374290466309, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.15.2.to_v.weight": { + "min": -0.23367103934288025, + "max": 0.2417406141757965, + "mean": 4.146722494624555e-05, + "std": 0.04086144268512726, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.15.2.to_v.bias": { + "min": -0.07598260790109634, + "max": 0.06560970842838287, + "mean": 0.0004800831666216254, + "std": 0.019395504146814346, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.15.2.to_out.0.weight": { + "min": -0.24560654163360596, + "max": 0.23375561833381653, + "mean": -2.9877701308578253e-06, + "std": 0.03943600133061409, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.15.2.to_out.0.bias": { + "min": -0.1627652794122696, + "max": 0.16063357889652252, + "mean": 0.0016337584238499403, + "std": 0.06525594741106033, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.15.3.g": { + "min": 0.5568895936012268, + "max": 0.9421334266662598, + "mean": 0.7127605080604553, + "std": 0.03978221118450165, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.15.4.ff.0.0.weight": { + "min": -0.22847090661525726, + "max": 0.25493934750556946, + "mean": -4.550522498902865e-05, + "std": 0.040581200271844864, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.15.4.ff.0.0.bias": { + "min": -0.1344706267118454, + "max": 0.022221069782972336, + "mean": -0.04133939743041992, + "std": 0.01835877075791359, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.15.4.ff.2.weight": { + "min": -0.4210115969181061, + "max": 0.3920403718948364, + "mean": -4.534296749625355e-06, + "std": 0.047791384160518646, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.15.4.ff.2.bias": { + "min": -0.6062420010566711, + "max": 0.6502339243888855, + "mean": 0.0015842054272070527, + "std": 0.05679100751876831, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.16.0.weight": { + "min": -0.252038836479187, + "max": 0.32106301188468933, + "mean": -6.296660103544127e-06, + "std": 0.019615648314356804, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.16.1.g": { + "min": 0.35961171984672546, + "max": 0.6809778809547424, + "mean": 0.5706169605255127, + "std": 0.042782142758369446, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.16.2.to_q.weight": { + "min": -0.22040791809558868, + "max": 0.17709863185882568, + "mean": -3.522756742313504e-05, + "std": 0.03430448845028877, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.16.2.to_q.bias": { + "min": -0.16271811723709106, + "max": 0.23246890306472778, + "mean": 0.0003684491675812751, + "std": 0.03280302509665489, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.16.2.to_k.weight": { + "min": -0.26368996500968933, + "max": 0.23957668244838715, + "mean": -5.283607606543228e-05, + "std": 0.03390355408191681, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.16.2.to_k.bias": { + "min": -4.8473591804504395, + "max": 5.083388805389404, + "mean": 0.04383918642997742, + "std": 1.2279300689697266, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.16.2.to_v.weight": { + "min": -0.24628077447414398, + "max": 0.2501535415649414, + "mean": 7.219994586193934e-05, + "std": 0.04399203881621361, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.16.2.to_v.bias": { + "min": -0.062493205070495605, + "max": 0.054467517882585526, + "mean": 0.0006505983183160424, + "std": 0.01718413643538952, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.16.2.to_out.0.weight": { + "min": -0.2860679030418396, + "max": 0.27162545919418335, + "mean": -4.9951679102377966e-05, + "std": 0.04299019277095795, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.16.2.to_out.0.bias": { + "min": -0.16042187809944153, + "max": 0.1700378805398941, + "mean": -0.0028904015198349953, + "std": 0.05927493795752525, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.16.3.g": { + "min": 0.5196736454963684, + "max": 0.931270182132721, + "mean": 0.7133467197418213, + "std": 0.03808481991291046, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.16.4.ff.0.0.weight": { + "min": -0.2380017340183258, + "max": 0.24893511831760406, + "mean": 0.00046494320849888027, + "std": 0.04046032205224037, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.16.4.ff.0.0.bias": { + "min": -0.1442948430776596, + "max": 0.041139233857393265, + "mean": -0.03967897593975067, + "std": 0.020518682897090912, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.16.4.ff.2.weight": { + "min": -0.5318877696990967, + "max": 0.5818965435028076, + "mean": 6.336260412354022e-06, + "std": 0.048867613077163696, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.16.4.ff.2.bias": { + "min": -0.5183113813400269, + "max": 0.4925517439842224, + "mean": 0.0023608217015862465, + "std": 0.053406503051519394, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.17.0.weight": { + "min": -0.2738274037837982, + "max": 0.31547796726226807, + "mean": 1.8216255739389453e-06, + "std": 0.02005232311785221, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.17.1.g": { + "min": 0.3659067749977112, + "max": 0.7100387215614319, + "mean": 0.5930584073066711, + "std": 0.04572707787156105, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.17.2.to_q.weight": { + "min": -0.21076832711696625, + "max": 0.19927603006362915, + "mean": 3.0815259378869087e-05, + "std": 0.03487056866288185, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.17.2.to_q.bias": { + "min": -0.186960831284523, + "max": 0.20310287177562714, + "mean": 0.0009555225260555744, + "std": 0.03147275000810623, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.17.2.to_k.weight": { + "min": -0.28951019048690796, + "max": 0.33969932794570923, + "mean": -4.744817124446854e-05, + "std": 0.034591346979141235, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.17.2.to_k.bias": { + "min": -3.8711647987365723, + "max": 3.3820366859436035, + "mean": 0.01444312371313572, + "std": 0.8576698899269104, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.17.2.to_v.weight": { + "min": -0.2244085818529129, + "max": 0.249923974275589, + "mean": -3.961446964240167e-06, + "std": 0.04223531484603882, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.17.2.to_v.bias": { + "min": -0.05502909794449806, + "max": 0.04645157977938652, + "mean": -2.0665102056227624e-05, + "std": 0.01583181880414486, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.17.2.to_out.0.weight": { + "min": -0.2927229106426239, + "max": 0.2906007766723633, + "mean": -7.488439223379828e-06, + "std": 0.04195013642311096, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.17.2.to_out.0.bias": { + "min": -0.12459567189216614, + "max": 0.25878894329071045, + "mean": -0.0032436519395560026, + "std": 0.053140122443437576, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.17.3.g": { + "min": 0.4563407599925995, + "max": 0.8428970575332642, + "mean": 0.7054145932197571, + "std": 0.03490997478365898, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.17.4.ff.0.0.weight": { + "min": -0.5114501714706421, + "max": 0.3482079803943634, + "mean": 0.00034245854476466775, + "std": 0.04020575433969498, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.17.4.ff.0.0.bias": { + "min": -0.18575434386730194, + "max": 0.03953104466199875, + "mean": -0.03936902433633804, + "std": 0.021325672045350075, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.17.4.ff.2.weight": { + "min": -0.5437595248222351, + "max": 0.5556712746620178, + "mean": -7.024264050414786e-05, + "std": 0.05074309557676315, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.17.4.ff.2.bias": { + "min": -0.5109111666679382, + "max": 0.6631372570991516, + "mean": 0.002439212054014206, + "std": 0.049490757286548615, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.18.0.weight": { + "min": -0.33253294229507446, + "max": 0.2652721107006073, + "mean": 3.378802830411587e-06, + "std": 0.019389795139431953, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.18.1.g": { + "min": 0.3220270276069641, + "max": 0.7649413347244263, + "mean": 0.6509413719177246, + "std": 0.045111026614904404, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.18.2.to_q.weight": { + "min": -0.2494993954896927, + "max": 0.21881401538848877, + "mean": -2.360827238589991e-06, + "std": 0.03650495037436485, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.18.2.to_q.bias": { + "min": -0.3266308009624481, + "max": 0.28657323122024536, + "mean": -0.0006807027384638786, + "std": 0.038520634174346924, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.18.2.to_k.weight": { + "min": -0.30951929092407227, + "max": 0.36978626251220703, + "mean": 6.48990971967578e-05, + "std": 0.036245379596948624, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.18.2.to_k.bias": { + "min": -4.710280895233154, + "max": 5.798713684082031, + "mean": 0.037927284836769104, + "std": 1.4116240739822388, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.18.2.to_v.weight": { + "min": -0.22114244103431702, + "max": 0.20574785768985748, + "mean": -7.537077181041241e-05, + "std": 0.04249110445380211, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.18.2.to_v.bias": { + "min": -0.07735679298639297, + "max": 0.05145302414894104, + "mean": -0.0009192783036269248, + "std": 0.016400594264268875, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.18.2.to_out.0.weight": { + "min": -0.3307357728481293, + "max": 0.32934609055519104, + "mean": -4.647547484637471e-06, + "std": 0.042797382920980453, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.18.2.to_out.0.bias": { + "min": -0.28440576791763306, + "max": 0.11188910901546478, + "mean": -0.0012069176882505417, + "std": 0.0469915047287941, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.18.3.g": { + "min": 0.4862346351146698, + "max": 0.8851982355117798, + "mean": 0.7373509407043457, + "std": 0.03795893117785454, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.18.4.ff.0.0.weight": { + "min": -0.3612706959247589, + "max": 0.27453744411468506, + "mean": 5.114857412991114e-05, + "std": 0.04065178707242012, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.18.4.ff.0.0.bias": { + "min": -0.24725216627120972, + "max": 0.04655319079756737, + "mean": -0.03925145044922829, + "std": 0.023245742544531822, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.18.4.ff.2.weight": { + "min": -0.625215470790863, + "max": 0.5962166786193848, + "mean": -5.8090816310141236e-05, + "std": 0.05312598869204521, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.18.4.ff.2.bias": { + "min": -0.7085027694702148, + "max": 0.2653276026248932, + "mean": 0.0009165835799649358, + "std": 0.0511946901679039, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.19.0.weight": { + "min": -0.34328368306159973, + "max": 0.3035609722137451, + "mean": 1.4504064438369824e-07, + "std": 0.019138522446155548, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.19.1.g": { + "min": 0.3498779833316803, + "max": 0.7813707590103149, + "mean": 0.6387293338775635, + "std": 0.049000099301338196, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.19.2.to_q.weight": { + "min": -0.20522303879261017, + "max": 0.20651094615459442, + "mean": -5.9693807997973636e-05, + "std": 0.03769965097308159, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.19.2.to_q.bias": { + "min": -0.25792619585990906, + "max": 0.2676540017127991, + "mean": -0.0004065552493557334, + "std": 0.044568419456481934, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.19.2.to_k.weight": { + "min": -0.3535814583301544, + "max": 0.32190999388694763, + "mean": -7.394870408461429e-06, + "std": 0.037208281457424164, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.19.2.to_k.bias": { + "min": -5.253505706787109, + "max": 4.198240280151367, + "mean": -0.026390478014945984, + "std": 1.0056747198104858, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.19.2.to_v.weight": { + "min": -0.2384454905986786, + "max": 0.24342015385627747, + "mean": -2.5527655452606268e-05, + "std": 0.043215684592723846, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.19.2.to_v.bias": { + "min": -0.06227009370923042, + "max": 0.05663022771477699, + "mean": 0.0003446021000854671, + "std": 0.01414022222161293, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.19.2.to_out.0.weight": { + "min": -0.43697887659072876, + "max": 0.3737882673740387, + "mean": 1.4649482181994244e-05, + "std": 0.04412706196308136, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.19.2.to_out.0.bias": { + "min": -0.09632225334644318, + "max": 0.1757834255695343, + "mean": -0.0006590378470718861, + "std": 0.03513453155755997, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.19.3.g": { + "min": 0.4219363331794739, + "max": 1.0674819946289062, + "mean": 0.7483711838722229, + "std": 0.041829537600278854, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.19.4.ff.0.0.weight": { + "min": -0.26578643918037415, + "max": 0.29607900977134705, + "mean": -7.925635145511478e-05, + "std": 0.04081210494041443, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.19.4.ff.0.0.bias": { + "min": -0.18497370183467865, + "max": 0.04346155747771263, + "mean": -0.03679885342717171, + "std": 0.025566671043634415, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.19.4.ff.2.weight": { + "min": -0.45727846026420593, + "max": 0.48611682653427124, + "mean": 4.68605212518014e-05, + "std": 0.05422008037567139, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.19.4.ff.2.bias": { + "min": -0.285878986120224, + "max": 0.5506833791732788, + "mean": -0.0008855935884639621, + "std": 0.047791752964258194, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.20.0.weight": { + "min": -0.2927459478378296, + "max": 0.32270148396492004, + "mean": 6.155781647976255e-06, + "std": 0.019972333684563637, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.20.1.g": { + "min": 0.29097816348075867, + "max": 0.7588945627212524, + "mean": 0.6507570743560791, + "std": 0.05195188894867897, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.20.2.to_q.weight": { + "min": -0.24343979358673096, + "max": 0.2611932158470154, + "mean": -5.595570200966904e-06, + "std": 0.039616428315639496, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.20.2.to_q.bias": { + "min": -0.2672193646430969, + "max": 0.19968828558921814, + "mean": -0.0008741158526390791, + "std": 0.051719244569540024, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.20.2.to_k.weight": { + "min": -0.2713148593902588, + "max": 0.25280529260635376, + "mean": 4.686854481406044e-06, + "std": 0.03871333599090576, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.20.2.to_k.bias": { + "min": -12.945391654968262, + "max": 15.922587394714355, + "mean": 0.0331900492310524, + "std": 1.9867922067642212, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.20.2.to_v.weight": { + "min": -0.20660938322544098, + "max": 0.22584253549575806, + "mean": -7.262543658725917e-05, + "std": 0.04055970162153244, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.20.2.to_v.bias": { + "min": -0.06933361291885376, + "max": 0.06314393132925034, + "mean": 0.00014905043644830585, + "std": 0.014740395359694958, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.20.2.to_out.0.weight": { + "min": -0.46516552567481995, + "max": 0.3203747570514679, + "mean": 1.989086922549177e-05, + "std": 0.04059458523988724, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.20.2.to_out.0.bias": { + "min": -0.06398216634988785, + "max": 0.11521662026643753, + "mean": 0.0011892176698893309, + "std": 0.02469474822282791, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.20.3.g": { + "min": 0.37489306926727295, + "max": 0.9301723837852478, + "mean": 0.7509260177612305, + "std": 0.04003360494971275, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.20.4.ff.0.0.weight": { + "min": -0.27877017855644226, + "max": 0.27262061834335327, + "mean": -0.00016865786164999008, + "std": 0.0410030372440815, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.20.4.ff.0.0.bias": { + "min": -0.19846785068511963, + "max": 0.05112157389521599, + "mean": -0.032006848603487015, + "std": 0.02506233938038349, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.20.4.ff.2.weight": { + "min": -0.6571894884109497, + "max": 0.5354637503623962, + "mean": -4.8520763812121004e-05, + "std": 0.05285634472966194, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.20.4.ff.2.bias": { + "min": -0.19253292679786682, + "max": 0.5813104510307312, + "mean": -0.0005173450335860252, + "std": 0.04104470834136009, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.21.0.weight": { + "min": -0.41767504811286926, + "max": 0.3719256818294525, + "mean": 6.585116807400482e-06, + "std": 0.02162640169262886, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.21.1.g": { + "min": 0.21444188058376312, + "max": 0.7454288601875305, + "mean": 0.6494399309158325, + "std": 0.054196760058403015, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.21.2.to_q.weight": { + "min": -0.20942556858062744, + "max": 0.19570672512054443, + "mean": 4.021516360808164e-05, + "std": 0.03946828842163086, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.21.2.to_q.bias": { + "min": -0.32898303866386414, + "max": 0.2592002749443054, + "mean": -0.0032279789447784424, + "std": 0.05622360482811928, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.21.2.to_k.weight": { + "min": -0.2054453343153, + "max": 0.2543545663356781, + "mean": 5.45132061233744e-05, + "std": 0.03857067599892616, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.21.2.to_k.bias": { + "min": -6.233641624450684, + "max": 6.921432971954346, + "mean": 0.04828529804944992, + "std": 1.3836402893066406, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.21.2.to_v.weight": { + "min": -0.20949925482273102, + "max": 0.2304454892873764, + "mean": -4.72849160360056e-06, + "std": 0.041318491101264954, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.21.2.to_v.bias": { + "min": -0.04375026375055313, + "max": 0.03585176169872284, + "mean": -5.88857801631093e-07, + "std": 0.012790623120963573, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.21.2.to_out.0.weight": { + "min": -0.39803647994995117, + "max": 0.34512725472450256, + "mean": -5.491710908245295e-05, + "std": 0.042394764721393585, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.21.2.to_out.0.bias": { + "min": -0.054978147149086, + "max": 0.06269973516464233, + "mean": 0.0003556903393473476, + "std": 0.018663441762328148, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.21.3.g": { + "min": 0.35058680176734924, + "max": 1.043295979499817, + "mean": 0.789494514465332, + "std": 0.04858649522066116, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.21.4.ff.0.0.weight": { + "min": -0.33317434787750244, + "max": 0.3864516317844391, + "mean": -0.00016881646297406405, + "std": 0.041488684713840485, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.21.4.ff.0.0.bias": { + "min": -0.15732650458812714, + "max": 0.058728814125061035, + "mean": -0.03181058540940285, + "std": 0.025098087266087532, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.21.4.ff.2.weight": { + "min": -0.6958801746368408, + "max": 0.46852678060531616, + "mean": -8.982194412965328e-05, + "std": 0.05180330574512482, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.21.4.ff.2.bias": { + "min": -0.24772712588310242, + "max": 0.32808512449264526, + "mean": -0.0002515119267627597, + "std": 0.04140802100300789, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.22.0.weight": { + "min": -0.28731903433799744, + "max": 0.3503708243370056, + "mean": -2.625113665999379e-06, + "std": 0.024243580177426338, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.22.1.g": { + "min": 0.19668713212013245, + "max": 0.7778334617614746, + "mean": 0.670162558555603, + "std": 0.05853449925780296, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.22.2.to_q.weight": { + "min": -0.2283114343881607, + "max": 0.23055444657802582, + "mean": -2.0571733330143616e-05, + "std": 0.04044181853532791, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.22.2.to_q.bias": { + "min": -0.2195570170879364, + "max": 0.24048519134521484, + "mean": 0.000782210670877248, + "std": 0.055770643055438995, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.22.2.to_k.weight": { + "min": -0.21605147421360016, + "max": 0.22674262523651123, + "mean": -7.179281237768009e-05, + "std": 0.03937681019306183, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.22.2.to_k.bias": { + "min": -8.892273902893066, + "max": 9.054671287536621, + "mean": -0.0012077325955033302, + "std": 1.846124529838562, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.22.2.to_v.weight": { + "min": -0.2689066231250763, + "max": 0.2583616375923157, + "mean": 4.3370266212150455e-05, + "std": 0.03841203823685646, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.22.2.to_v.bias": { + "min": -0.05771247297525406, + "max": 0.05783558264374733, + "mean": 0.00035597707028500736, + "std": 0.014716549776494503, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.22.2.to_out.0.weight": { + "min": -0.2647928297519684, + "max": 0.28871840238571167, + "mean": -6.220719660632312e-05, + "std": 0.0390787236392498, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.22.2.to_out.0.bias": { + "min": -0.04365166649222374, + "max": 0.037368953227996826, + "mean": -8.94215190783143e-05, + "std": 0.013351045548915863, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.22.3.g": { + "min": 0.33930352330207825, + "max": 1.090523362159729, + "mean": 0.8638416528701782, + "std": 0.06374476104974747, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.22.4.ff.0.0.weight": { + "min": -0.4229956567287445, + "max": 0.41935035586357117, + "mean": 0.00031358242267742753, + "std": 0.04351169988512993, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.22.4.ff.0.0.bias": { + "min": -0.2143605649471283, + "max": 0.17033977806568146, + "mean": -0.029430482536554337, + "std": 0.031879011541604996, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.22.4.ff.2.weight": { + "min": -0.5980925559997559, + "max": 0.5593904852867126, + "mean": -0.0001523983955848962, + "std": 0.05345866456627846, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.22.4.ff.2.bias": { + "min": -0.17843037843704224, + "max": 0.3764672875404358, + "mean": 0.0013608136214315891, + "std": 0.037283699959516525, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.23.0.weight": { + "min": -0.3941720128059387, + "max": 0.3687548339366913, + "mean": 3.7372221413534135e-05, + "std": 0.02862183377146721, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.23.1.g": { + "min": 0.2906048893928528, + "max": 0.825853168964386, + "mean": 0.7055732607841492, + "std": 0.0677838996052742, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.23.2.to_q.weight": { + "min": -0.9263502359390259, + "max": 1.027148962020874, + "mean": -2.6785823138197884e-05, + "std": 0.04763893038034439, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.23.2.to_q.bias": { + "min": -0.8774253129959106, + "max": 0.8142860531806946, + "mean": -0.0003061135357711464, + "std": 0.09545911848545074, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.23.2.to_k.weight": { + "min": -0.2697736918926239, + "max": 0.24071107804775238, + "mean": -2.2601629098062404e-05, + "std": 0.038958579301834106, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.23.2.to_k.bias": { + "min": -23.70609474182129, + "max": 22.81615447998047, + "mean": -0.09178254753351212, + "std": 4.064568042755127, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.23.2.to_v.weight": { + "min": -0.22739385068416595, + "max": 0.24493008852005005, + "mean": -2.535741987230722e-05, + "std": 0.03864453360438347, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.23.2.to_v.bias": { + "min": -0.06026393920183182, + "max": 0.045535702258348465, + "mean": -0.00013921607751399279, + "std": 0.014681815169751644, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.23.2.to_out.0.weight": { + "min": -0.3383011817932129, + "max": 0.3741171360015869, + "mean": 6.997803211561404e-06, + "std": 0.040823448449373245, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.23.2.to_out.0.bias": { + "min": -0.046280112117528915, + "max": 0.19523115456104279, + "mean": 0.00027006896561942995, + "std": 0.01355893723666668, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.23.3.g": { + "min": 0.3735462725162506, + "max": 1.1277151107788086, + "mean": 0.8900589942932129, + "std": 0.06382670253515244, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.23.4.ff.0.0.weight": { + "min": -0.4478131830692291, + "max": 0.5424441695213318, + "mean": 2.4745060727582313e-05, + "std": 0.04557563737034798, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.23.4.ff.0.0.bias": { + "min": -0.22360379993915558, + "max": 0.08794356882572174, + "mean": -0.03199389576911926, + "std": 0.03773387894034386, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.23.4.ff.2.weight": { + "min": -0.7249262928962708, + "max": 0.6877928376197815, + "mean": 3.6950204957975075e-05, + "std": 0.051789939403533936, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.23.4.ff.2.bias": { + "min": -0.17425872385501862, + "max": 0.21810372173786163, + "mean": 3.0209601391106844e-05, + "std": 0.03174462914466858, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.24.0.weight": { + "min": -0.3392157554626465, + "max": 0.3738991320133209, + "mean": 4.299447755329311e-05, + "std": 0.03414613753557205, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.24.1.g": { + "min": 0.3178655207157135, + "max": 1.2844390869140625, + "mean": 0.6014401912689209, + "std": 0.08323848247528076, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.24.2.to_q.weight": { + "min": -0.2828904390335083, + "max": 0.260010302066803, + "mean": -3.007857230841182e-06, + "std": 0.03598371520638466, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.24.2.to_q.bias": { + "min": -0.2351931631565094, + "max": 0.20519772171974182, + "mean": 0.00022795653785578907, + "std": 0.055979955941438675, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.24.2.to_k.weight": { + "min": -0.43529582023620605, + "max": 0.32459068298339844, + "mean": 2.450653482810594e-05, + "std": 0.03413282707333565, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.24.2.to_k.bias": { + "min": -5.542441368103027, + "max": 7.307634353637695, + "mean": -0.007349876686930656, + "std": 0.6985355019569397, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.24.2.to_v.weight": { + "min": -0.3433660864830017, + "max": 0.3625560700893402, + "mean": 0.00010314527025911957, + "std": 0.04783623665571213, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.24.2.to_v.bias": { + "min": -0.07354722917079926, + "max": 0.060343291610479355, + "mean": 0.0009371445048600435, + "std": 0.014936422929167747, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.24.2.to_out.0.weight": { + "min": -0.25582820177078247, + "max": 0.286111980676651, + "mean": 4.655210432247259e-06, + "std": 0.04156283661723137, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.24.2.to_out.0.bias": { + "min": -0.05514800176024437, + "max": 0.06263813376426697, + "mean": 0.0001386886287946254, + "std": 0.007160879671573639, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.24.3.g": { + "min": 0.4938517212867737, + "max": 1.2188584804534912, + "mean": 1.0133963823318481, + "std": 0.11724550276994705, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.24.4.ff.0.0.weight": { + "min": -1.093719720840454, + "max": 1.0471616983413696, + "mean": -4.925714529235847e-05, + "std": 0.05241731181740761, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.24.4.ff.0.0.bias": { + "min": -0.2243891805410385, + "max": 0.172992542386055, + "mean": -0.027224872261285782, + "std": 0.03628592565655708, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.24.4.ff.2.weight": { + "min": -0.8836102485656738, + "max": 0.9222370386123657, + "mean": -0.0001438588951714337, + "std": 0.053294114768505096, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.24.4.ff.2.bias": { + "min": -0.17069175839424133, + "max": 0.37931114435195923, + "mean": 0.003359442111104727, + "std": 0.03984633460640907, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.25.0.weight": { + "min": -0.777143120765686, + "max": 0.7232267260551453, + "mean": 1.830433029681444e-05, + "std": 0.0461735762655735, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.25.1.g": { + "min": 0.3386678695678711, + "max": 1.4252641201019287, + "mean": 0.9481973648071289, + "std": 0.20639142394065857, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.25.2.to_q.weight": { + "min": -1.746235728263855, + "max": 1.7046191692352295, + "mean": 0.00022743589943274856, + "std": 0.1587381213903427, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.25.2.to_q.bias": { + "min": -1.1972129344940186, + "max": 1.0979515314102173, + "mean": -0.00952577032148838, + "std": 0.2035541981458664, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.25.2.to_k.weight": { + "min": -0.4209991693496704, + "max": 0.42664653062820435, + "mean": 6.461775046773255e-05, + "std": 0.04803095757961273, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.25.2.to_k.bias": { + "min": -19.71938133239746, + "max": 19.514814376831055, + "mean": -0.24804288148880005, + "std": 4.770266532897949, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.25.2.to_v.weight": { + "min": -0.32366812229156494, + "max": 0.43827319145202637, + "mean": -1.2008969861199148e-05, + "std": 0.04616396129131317, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.25.2.to_v.bias": { + "min": -0.03389401733875275, + "max": 0.03695628046989441, + "mean": 0.0006402541184797883, + "std": 0.012914549559354782, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.25.2.to_out.0.weight": { + "min": -0.7030304074287415, + "max": 0.6659538745880127, + "mean": 4.320529478718527e-05, + "std": 0.05788206309080124, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.25.2.to_out.0.bias": { + "min": -0.07218055427074432, + "max": 0.0675114244222641, + "mean": -0.0001346912613371387, + "std": 0.012894386425614357, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.25.3.g": { + "min": 0.3805386424064636, + "max": 1.3893085718154907, + "mean": 1.0665242671966553, + "std": 0.21952925622463226, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.25.4.ff.0.0.weight": { + "min": -0.6161316633224487, + "max": 0.717426061630249, + "mean": 0.00011223374167457223, + "std": 0.0580313578248024, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.25.4.ff.0.0.bias": { + "min": -0.21904653310775757, + "max": 0.22452397644519806, + "mean": 0.006222008261829615, + "std": 0.049658045172691345, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.25.4.ff.2.weight": { + "min": -0.6296318769454956, + "max": 0.8893842101097107, + "mean": 1.2104990673833527e-05, + "std": 0.02354114130139351, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.25.4.ff.2.bias": { + "min": -0.5061390995979309, + "max": 0.473175585269928, + "mean": -0.003011696506291628, + "std": 0.06919368356466293, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.norm_out.g": { + "min": 0.5380294322967529, + "max": 1.1777888536453247, + "mean": 0.7825304865837097, + "std": 0.09833591431379318, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.proj_out.weight": { + "min": -0.26662442088127136, + "max": 0.21249151229858398, + "mean": -0.00022446915681939572, + "std": 0.054007817059755325, + "sparsity": 0.0, + "shape": [ + 100, + 1024 + ] + }, + "transformer.proj_out.bias": { + "min": -0.23786024749279022, + "max": 0.014854340814054012, + "mean": -0.04389730468392372, + "std": 0.03425038233399391, + "sparsity": 0.0, + "shape": [ + 100 + ] + } + } +} \ No newline at end of file