diff --git "a/model_analysis.json" "b/model_analysis.json" new file mode 100644--- /dev/null +++ "b/model_analysis.json" @@ -0,0 +1,4683 @@ +{ + "layer_types": { + "transformer": 391 + }, + "parameter_counts": { + "transformer.time_embed.time_mlp.0.weight": 262144, + "transformer.time_embed.time_mlp.0.bias": 1024, + "transformer.time_embed.time_mlp.2.weight": 1048576, + "transformer.time_embed.time_mlp.2.bias": 1024, + "transformer.text_embed.text_embed.weight": 254600, + "transformer.input_embed.proj.weight": 307200, + "transformer.input_embed.proj.bias": 1024, + "transformer.input_embed.conv_pos_embed.conv1d.0.weight": 2031616, + "transformer.input_embed.conv_pos_embed.conv1d.0.bias": 1024, + "transformer.input_embed.conv_pos_embed.conv1d.2.weight": 2031616, + "transformer.input_embed.conv_pos_embed.conv1d.2.bias": 1024, + "transformer.layers.0.1.g": 1024, + "transformer.layers.0.2.to_q.weight": 1048576, + "transformer.layers.0.2.to_q.bias": 1024, + "transformer.layers.0.2.to_k.weight": 1048576, + "transformer.layers.0.2.to_k.bias": 1024, + "transformer.layers.0.2.to_v.weight": 1048576, + "transformer.layers.0.2.to_v.bias": 1024, + "transformer.layers.0.2.to_out.0.weight": 1048576, + "transformer.layers.0.2.to_out.0.bias": 1024, + "transformer.layers.0.3.g": 1024, + "transformer.layers.0.4.ff.0.0.weight": 4194304, + "transformer.layers.0.4.ff.0.0.bias": 4096, + "transformer.layers.0.4.ff.2.weight": 4194304, + "transformer.layers.0.4.ff.2.bias": 1024, + "transformer.layers.1.1.g": 1024, + "transformer.layers.1.2.to_q.weight": 1048576, + "transformer.layers.1.2.to_q.bias": 1024, + "transformer.layers.1.2.to_k.weight": 1048576, + "transformer.layers.1.2.to_k.bias": 1024, + "transformer.layers.1.2.to_v.weight": 1048576, + "transformer.layers.1.2.to_v.bias": 1024, + "transformer.layers.1.2.to_out.0.weight": 1048576, + "transformer.layers.1.2.to_out.0.bias": 1024, + "transformer.layers.1.3.g": 1024, + "transformer.layers.1.4.ff.0.0.weight": 4194304, + "transformer.layers.1.4.ff.0.0.bias": 4096, + "transformer.layers.1.4.ff.2.weight": 4194304, + "transformer.layers.1.4.ff.2.bias": 1024, + "transformer.layers.2.1.g": 1024, + "transformer.layers.2.2.to_q.weight": 1048576, + "transformer.layers.2.2.to_q.bias": 1024, + "transformer.layers.2.2.to_k.weight": 1048576, + "transformer.layers.2.2.to_k.bias": 1024, + "transformer.layers.2.2.to_v.weight": 1048576, + "transformer.layers.2.2.to_v.bias": 1024, + "transformer.layers.2.2.to_out.0.weight": 1048576, + "transformer.layers.2.2.to_out.0.bias": 1024, + "transformer.layers.2.3.g": 1024, + "transformer.layers.2.4.ff.0.0.weight": 4194304, + "transformer.layers.2.4.ff.0.0.bias": 4096, + "transformer.layers.2.4.ff.2.weight": 4194304, + "transformer.layers.2.4.ff.2.bias": 1024, + "transformer.layers.3.1.g": 1024, + "transformer.layers.3.2.to_q.weight": 1048576, + "transformer.layers.3.2.to_q.bias": 1024, + "transformer.layers.3.2.to_k.weight": 1048576, + "transformer.layers.3.2.to_k.bias": 1024, + "transformer.layers.3.2.to_v.weight": 1048576, + "transformer.layers.3.2.to_v.bias": 1024, + "transformer.layers.3.2.to_out.0.weight": 1048576, + "transformer.layers.3.2.to_out.0.bias": 1024, + "transformer.layers.3.3.g": 1024, + "transformer.layers.3.4.ff.0.0.weight": 4194304, + "transformer.layers.3.4.ff.0.0.bias": 4096, + "transformer.layers.3.4.ff.2.weight": 4194304, + "transformer.layers.3.4.ff.2.bias": 1024, + "transformer.layers.4.1.g": 1024, + "transformer.layers.4.2.to_q.weight": 1048576, + "transformer.layers.4.2.to_q.bias": 1024, + "transformer.layers.4.2.to_k.weight": 1048576, + "transformer.layers.4.2.to_k.bias": 1024, + "transformer.layers.4.2.to_v.weight": 1048576, + "transformer.layers.4.2.to_v.bias": 1024, + "transformer.layers.4.2.to_out.0.weight": 1048576, + "transformer.layers.4.2.to_out.0.bias": 1024, + "transformer.layers.4.3.g": 1024, + "transformer.layers.4.4.ff.0.0.weight": 4194304, + "transformer.layers.4.4.ff.0.0.bias": 4096, + "transformer.layers.4.4.ff.2.weight": 4194304, + "transformer.layers.4.4.ff.2.bias": 1024, + "transformer.layers.5.1.g": 1024, + "transformer.layers.5.2.to_q.weight": 1048576, + "transformer.layers.5.2.to_q.bias": 1024, + "transformer.layers.5.2.to_k.weight": 1048576, + "transformer.layers.5.2.to_k.bias": 1024, + "transformer.layers.5.2.to_v.weight": 1048576, + "transformer.layers.5.2.to_v.bias": 1024, + "transformer.layers.5.2.to_out.0.weight": 1048576, + "transformer.layers.5.2.to_out.0.bias": 1024, + "transformer.layers.5.3.g": 1024, + "transformer.layers.5.4.ff.0.0.weight": 4194304, + "transformer.layers.5.4.ff.0.0.bias": 4096, + "transformer.layers.5.4.ff.2.weight": 4194304, + "transformer.layers.5.4.ff.2.bias": 1024, + "transformer.layers.6.1.g": 1024, + "transformer.layers.6.2.to_q.weight": 1048576, + "transformer.layers.6.2.to_q.bias": 1024, + "transformer.layers.6.2.to_k.weight": 1048576, + "transformer.layers.6.2.to_k.bias": 1024, + "transformer.layers.6.2.to_v.weight": 1048576, + "transformer.layers.6.2.to_v.bias": 1024, + "transformer.layers.6.2.to_out.0.weight": 1048576, + "transformer.layers.6.2.to_out.0.bias": 1024, + "transformer.layers.6.3.g": 1024, + "transformer.layers.6.4.ff.0.0.weight": 4194304, + "transformer.layers.6.4.ff.0.0.bias": 4096, + "transformer.layers.6.4.ff.2.weight": 4194304, + "transformer.layers.6.4.ff.2.bias": 1024, + "transformer.layers.7.1.g": 1024, + "transformer.layers.7.2.to_q.weight": 1048576, + "transformer.layers.7.2.to_q.bias": 1024, + "transformer.layers.7.2.to_k.weight": 1048576, + "transformer.layers.7.2.to_k.bias": 1024, + "transformer.layers.7.2.to_v.weight": 1048576, + "transformer.layers.7.2.to_v.bias": 1024, + "transformer.layers.7.2.to_out.0.weight": 1048576, + "transformer.layers.7.2.to_out.0.bias": 1024, + "transformer.layers.7.3.g": 1024, + "transformer.layers.7.4.ff.0.0.weight": 4194304, + "transformer.layers.7.4.ff.0.0.bias": 4096, + "transformer.layers.7.4.ff.2.weight": 4194304, + "transformer.layers.7.4.ff.2.bias": 1024, + "transformer.layers.8.1.g": 1024, + "transformer.layers.8.2.to_q.weight": 1048576, + "transformer.layers.8.2.to_q.bias": 1024, + "transformer.layers.8.2.to_k.weight": 1048576, + "transformer.layers.8.2.to_k.bias": 1024, + "transformer.layers.8.2.to_v.weight": 1048576, + "transformer.layers.8.2.to_v.bias": 1024, + "transformer.layers.8.2.to_out.0.weight": 1048576, + "transformer.layers.8.2.to_out.0.bias": 1024, + "transformer.layers.8.3.g": 1024, + "transformer.layers.8.4.ff.0.0.weight": 4194304, + "transformer.layers.8.4.ff.0.0.bias": 4096, + "transformer.layers.8.4.ff.2.weight": 4194304, + "transformer.layers.8.4.ff.2.bias": 1024, + "transformer.layers.9.1.g": 1024, + "transformer.layers.9.2.to_q.weight": 1048576, + "transformer.layers.9.2.to_q.bias": 1024, + "transformer.layers.9.2.to_k.weight": 1048576, + "transformer.layers.9.2.to_k.bias": 1024, + "transformer.layers.9.2.to_v.weight": 1048576, + "transformer.layers.9.2.to_v.bias": 1024, + "transformer.layers.9.2.to_out.0.weight": 1048576, + "transformer.layers.9.2.to_out.0.bias": 1024, + "transformer.layers.9.3.g": 1024, + "transformer.layers.9.4.ff.0.0.weight": 4194304, + "transformer.layers.9.4.ff.0.0.bias": 4096, + "transformer.layers.9.4.ff.2.weight": 4194304, + "transformer.layers.9.4.ff.2.bias": 1024, + "transformer.layers.10.1.g": 1024, + "transformer.layers.10.2.to_q.weight": 1048576, + "transformer.layers.10.2.to_q.bias": 1024, + "transformer.layers.10.2.to_k.weight": 1048576, + "transformer.layers.10.2.to_k.bias": 1024, + "transformer.layers.10.2.to_v.weight": 1048576, + "transformer.layers.10.2.to_v.bias": 1024, + "transformer.layers.10.2.to_out.0.weight": 1048576, + "transformer.layers.10.2.to_out.0.bias": 1024, + "transformer.layers.10.3.g": 1024, + "transformer.layers.10.4.ff.0.0.weight": 4194304, + "transformer.layers.10.4.ff.0.0.bias": 4096, + "transformer.layers.10.4.ff.2.weight": 4194304, + "transformer.layers.10.4.ff.2.bias": 1024, + "transformer.layers.11.1.g": 1024, + "transformer.layers.11.2.to_q.weight": 1048576, + "transformer.layers.11.2.to_q.bias": 1024, + "transformer.layers.11.2.to_k.weight": 1048576, + "transformer.layers.11.2.to_k.bias": 1024, + "transformer.layers.11.2.to_v.weight": 1048576, + "transformer.layers.11.2.to_v.bias": 1024, + "transformer.layers.11.2.to_out.0.weight": 1048576, + "transformer.layers.11.2.to_out.0.bias": 1024, + "transformer.layers.11.3.g": 1024, + "transformer.layers.11.4.ff.0.0.weight": 4194304, + "transformer.layers.11.4.ff.0.0.bias": 4096, + "transformer.layers.11.4.ff.2.weight": 4194304, + "transformer.layers.11.4.ff.2.bias": 1024, + "transformer.layers.12.1.g": 1024, + "transformer.layers.12.2.to_q.weight": 1048576, + "transformer.layers.12.2.to_q.bias": 1024, + "transformer.layers.12.2.to_k.weight": 1048576, + "transformer.layers.12.2.to_k.bias": 1024, + "transformer.layers.12.2.to_v.weight": 1048576, + "transformer.layers.12.2.to_v.bias": 1024, + "transformer.layers.12.2.to_out.0.weight": 1048576, + "transformer.layers.12.2.to_out.0.bias": 1024, + "transformer.layers.12.3.g": 1024, + "transformer.layers.12.4.ff.0.0.weight": 4194304, + "transformer.layers.12.4.ff.0.0.bias": 4096, + "transformer.layers.12.4.ff.2.weight": 4194304, + "transformer.layers.12.4.ff.2.bias": 1024, + "transformer.layers.13.0.weight": 2097152, + "transformer.layers.13.1.g": 1024, + "transformer.layers.13.2.to_q.weight": 1048576, + "transformer.layers.13.2.to_q.bias": 1024, + "transformer.layers.13.2.to_k.weight": 1048576, + "transformer.layers.13.2.to_k.bias": 1024, + "transformer.layers.13.2.to_v.weight": 1048576, + "transformer.layers.13.2.to_v.bias": 1024, + "transformer.layers.13.2.to_out.0.weight": 1048576, + "transformer.layers.13.2.to_out.0.bias": 1024, + "transformer.layers.13.3.g": 1024, + "transformer.layers.13.4.ff.0.0.weight": 4194304, + "transformer.layers.13.4.ff.0.0.bias": 4096, + "transformer.layers.13.4.ff.2.weight": 4194304, + "transformer.layers.13.4.ff.2.bias": 1024, + "transformer.layers.14.0.weight": 2097152, + "transformer.layers.14.1.g": 1024, + "transformer.layers.14.2.to_q.weight": 1048576, + "transformer.layers.14.2.to_q.bias": 1024, + "transformer.layers.14.2.to_k.weight": 1048576, + "transformer.layers.14.2.to_k.bias": 1024, + "transformer.layers.14.2.to_v.weight": 1048576, + "transformer.layers.14.2.to_v.bias": 1024, + "transformer.layers.14.2.to_out.0.weight": 1048576, + "transformer.layers.14.2.to_out.0.bias": 1024, + "transformer.layers.14.3.g": 1024, + "transformer.layers.14.4.ff.0.0.weight": 4194304, + "transformer.layers.14.4.ff.0.0.bias": 4096, + "transformer.layers.14.4.ff.2.weight": 4194304, + "transformer.layers.14.4.ff.2.bias": 1024, + "transformer.layers.15.0.weight": 2097152, + "transformer.layers.15.1.g": 1024, + "transformer.layers.15.2.to_q.weight": 1048576, + "transformer.layers.15.2.to_q.bias": 1024, + "transformer.layers.15.2.to_k.weight": 1048576, + "transformer.layers.15.2.to_k.bias": 1024, + "transformer.layers.15.2.to_v.weight": 1048576, + "transformer.layers.15.2.to_v.bias": 1024, + "transformer.layers.15.2.to_out.0.weight": 1048576, + "transformer.layers.15.2.to_out.0.bias": 1024, + "transformer.layers.15.3.g": 1024, + "transformer.layers.15.4.ff.0.0.weight": 4194304, + "transformer.layers.15.4.ff.0.0.bias": 4096, + "transformer.layers.15.4.ff.2.weight": 4194304, + "transformer.layers.15.4.ff.2.bias": 1024, + "transformer.layers.16.0.weight": 2097152, + "transformer.layers.16.1.g": 1024, + "transformer.layers.16.2.to_q.weight": 1048576, + "transformer.layers.16.2.to_q.bias": 1024, + "transformer.layers.16.2.to_k.weight": 1048576, + "transformer.layers.16.2.to_k.bias": 1024, + "transformer.layers.16.2.to_v.weight": 1048576, + "transformer.layers.16.2.to_v.bias": 1024, + "transformer.layers.16.2.to_out.0.weight": 1048576, + "transformer.layers.16.2.to_out.0.bias": 1024, + "transformer.layers.16.3.g": 1024, + "transformer.layers.16.4.ff.0.0.weight": 4194304, + "transformer.layers.16.4.ff.0.0.bias": 4096, + "transformer.layers.16.4.ff.2.weight": 4194304, + "transformer.layers.16.4.ff.2.bias": 1024, + "transformer.layers.17.0.weight": 2097152, + "transformer.layers.17.1.g": 1024, + "transformer.layers.17.2.to_q.weight": 1048576, + "transformer.layers.17.2.to_q.bias": 1024, + "transformer.layers.17.2.to_k.weight": 1048576, + "transformer.layers.17.2.to_k.bias": 1024, + "transformer.layers.17.2.to_v.weight": 1048576, + "transformer.layers.17.2.to_v.bias": 1024, + "transformer.layers.17.2.to_out.0.weight": 1048576, + "transformer.layers.17.2.to_out.0.bias": 1024, + "transformer.layers.17.3.g": 1024, + "transformer.layers.17.4.ff.0.0.weight": 4194304, + "transformer.layers.17.4.ff.0.0.bias": 4096, + "transformer.layers.17.4.ff.2.weight": 4194304, + "transformer.layers.17.4.ff.2.bias": 1024, + "transformer.layers.18.0.weight": 2097152, + "transformer.layers.18.1.g": 1024, + "transformer.layers.18.2.to_q.weight": 1048576, + "transformer.layers.18.2.to_q.bias": 1024, + "transformer.layers.18.2.to_k.weight": 1048576, + "transformer.layers.18.2.to_k.bias": 1024, + "transformer.layers.18.2.to_v.weight": 1048576, + "transformer.layers.18.2.to_v.bias": 1024, + "transformer.layers.18.2.to_out.0.weight": 1048576, + "transformer.layers.18.2.to_out.0.bias": 1024, + "transformer.layers.18.3.g": 1024, + "transformer.layers.18.4.ff.0.0.weight": 4194304, + "transformer.layers.18.4.ff.0.0.bias": 4096, + "transformer.layers.18.4.ff.2.weight": 4194304, + "transformer.layers.18.4.ff.2.bias": 1024, + "transformer.layers.19.0.weight": 2097152, + "transformer.layers.19.1.g": 1024, + "transformer.layers.19.2.to_q.weight": 1048576, + "transformer.layers.19.2.to_q.bias": 1024, + "transformer.layers.19.2.to_k.weight": 1048576, + "transformer.layers.19.2.to_k.bias": 1024, + "transformer.layers.19.2.to_v.weight": 1048576, + "transformer.layers.19.2.to_v.bias": 1024, + "transformer.layers.19.2.to_out.0.weight": 1048576, + "transformer.layers.19.2.to_out.0.bias": 1024, + "transformer.layers.19.3.g": 1024, + "transformer.layers.19.4.ff.0.0.weight": 4194304, + "transformer.layers.19.4.ff.0.0.bias": 4096, + "transformer.layers.19.4.ff.2.weight": 4194304, + "transformer.layers.19.4.ff.2.bias": 1024, + "transformer.layers.20.0.weight": 2097152, + "transformer.layers.20.1.g": 1024, + "transformer.layers.20.2.to_q.weight": 1048576, + "transformer.layers.20.2.to_q.bias": 1024, + "transformer.layers.20.2.to_k.weight": 1048576, + "transformer.layers.20.2.to_k.bias": 1024, + "transformer.layers.20.2.to_v.weight": 1048576, + "transformer.layers.20.2.to_v.bias": 1024, + "transformer.layers.20.2.to_out.0.weight": 1048576, + "transformer.layers.20.2.to_out.0.bias": 1024, + "transformer.layers.20.3.g": 1024, + "transformer.layers.20.4.ff.0.0.weight": 4194304, + "transformer.layers.20.4.ff.0.0.bias": 4096, + "transformer.layers.20.4.ff.2.weight": 4194304, + "transformer.layers.20.4.ff.2.bias": 1024, + "transformer.layers.21.0.weight": 2097152, + "transformer.layers.21.1.g": 1024, + "transformer.layers.21.2.to_q.weight": 1048576, + "transformer.layers.21.2.to_q.bias": 1024, + "transformer.layers.21.2.to_k.weight": 1048576, + "transformer.layers.21.2.to_k.bias": 1024, + "transformer.layers.21.2.to_v.weight": 1048576, + "transformer.layers.21.2.to_v.bias": 1024, + "transformer.layers.21.2.to_out.0.weight": 1048576, + "transformer.layers.21.2.to_out.0.bias": 1024, + "transformer.layers.21.3.g": 1024, + "transformer.layers.21.4.ff.0.0.weight": 4194304, + "transformer.layers.21.4.ff.0.0.bias": 4096, + "transformer.layers.21.4.ff.2.weight": 4194304, + "transformer.layers.21.4.ff.2.bias": 1024, + "transformer.layers.22.0.weight": 2097152, + "transformer.layers.22.1.g": 1024, + "transformer.layers.22.2.to_q.weight": 1048576, + "transformer.layers.22.2.to_q.bias": 1024, + "transformer.layers.22.2.to_k.weight": 1048576, + "transformer.layers.22.2.to_k.bias": 1024, + "transformer.layers.22.2.to_v.weight": 1048576, + "transformer.layers.22.2.to_v.bias": 1024, + "transformer.layers.22.2.to_out.0.weight": 1048576, + "transformer.layers.22.2.to_out.0.bias": 1024, + "transformer.layers.22.3.g": 1024, + "transformer.layers.22.4.ff.0.0.weight": 4194304, + "transformer.layers.22.4.ff.0.0.bias": 4096, + "transformer.layers.22.4.ff.2.weight": 4194304, + "transformer.layers.22.4.ff.2.bias": 1024, + "transformer.layers.23.0.weight": 2097152, + "transformer.layers.23.1.g": 1024, + "transformer.layers.23.2.to_q.weight": 1048576, + "transformer.layers.23.2.to_q.bias": 1024, + "transformer.layers.23.2.to_k.weight": 1048576, + "transformer.layers.23.2.to_k.bias": 1024, + "transformer.layers.23.2.to_v.weight": 1048576, + "transformer.layers.23.2.to_v.bias": 1024, + "transformer.layers.23.2.to_out.0.weight": 1048576, + "transformer.layers.23.2.to_out.0.bias": 1024, + "transformer.layers.23.3.g": 1024, + "transformer.layers.23.4.ff.0.0.weight": 4194304, + "transformer.layers.23.4.ff.0.0.bias": 4096, + "transformer.layers.23.4.ff.2.weight": 4194304, + "transformer.layers.23.4.ff.2.bias": 1024, + "transformer.layers.24.0.weight": 2097152, + "transformer.layers.24.1.g": 1024, + "transformer.layers.24.2.to_q.weight": 1048576, + "transformer.layers.24.2.to_q.bias": 1024, + "transformer.layers.24.2.to_k.weight": 1048576, + "transformer.layers.24.2.to_k.bias": 1024, + "transformer.layers.24.2.to_v.weight": 1048576, + "transformer.layers.24.2.to_v.bias": 1024, + "transformer.layers.24.2.to_out.0.weight": 1048576, + "transformer.layers.24.2.to_out.0.bias": 1024, + "transformer.layers.24.3.g": 1024, + "transformer.layers.24.4.ff.0.0.weight": 4194304, + "transformer.layers.24.4.ff.0.0.bias": 4096, + "transformer.layers.24.4.ff.2.weight": 4194304, + "transformer.layers.24.4.ff.2.bias": 1024, + "transformer.layers.25.0.weight": 2097152, + "transformer.layers.25.1.g": 1024, + "transformer.layers.25.2.to_q.weight": 1048576, + "transformer.layers.25.2.to_q.bias": 1024, + "transformer.layers.25.2.to_k.weight": 1048576, + "transformer.layers.25.2.to_k.bias": 1024, + "transformer.layers.25.2.to_v.weight": 1048576, + "transformer.layers.25.2.to_v.bias": 1024, + "transformer.layers.25.2.to_out.0.weight": 1048576, + "transformer.layers.25.2.to_out.0.bias": 1024, + "transformer.layers.25.3.g": 1024, + "transformer.layers.25.4.ff.0.0.weight": 4194304, + "transformer.layers.25.4.ff.0.0.bias": 4096, + "transformer.layers.25.4.ff.2.weight": 4194304, + "transformer.layers.25.4.ff.2.bias": 1024, + "transformer.norm_out.g": 1024, + "transformer.proj_out.weight": 102400, + "transformer.proj_out.bias": 100 + }, + "important_layers": [ + "transformer.time_embed.time_mlp.0.weight", + "transformer.time_embed.time_mlp.2.weight", + "transformer.text_embed.text_embed.weight", + "transformer.input_embed.proj.weight", + "transformer.input_embed.conv_pos_embed.conv1d.0.weight", + "transformer.input_embed.conv_pos_embed.conv1d.2.weight", + "transformer.layers.0.2.to_q.weight", + "transformer.layers.0.2.to_k.weight", + "transformer.layers.0.2.to_v.weight", + "transformer.layers.0.2.to_out.0.weight", + "transformer.layers.0.4.ff.0.0.weight", + "transformer.layers.0.4.ff.2.weight", + "transformer.layers.1.2.to_q.weight", + "transformer.layers.1.2.to_k.weight", + "transformer.layers.1.2.to_v.weight", + "transformer.layers.1.2.to_out.0.weight", + "transformer.layers.1.4.ff.0.0.weight", + "transformer.layers.1.4.ff.2.weight", + "transformer.layers.2.2.to_q.weight", + "transformer.layers.2.2.to_k.weight", + "transformer.layers.2.2.to_v.weight", + "transformer.layers.2.2.to_out.0.weight", + "transformer.layers.2.4.ff.0.0.weight", + "transformer.layers.2.4.ff.2.weight", + "transformer.layers.3.2.to_q.weight", + "transformer.layers.3.2.to_k.weight", + "transformer.layers.3.2.to_v.weight", + "transformer.layers.3.2.to_out.0.weight", + "transformer.layers.3.4.ff.0.0.weight", + "transformer.layers.3.4.ff.2.weight", + "transformer.layers.4.2.to_q.weight", + "transformer.layers.4.2.to_k.weight", + "transformer.layers.4.2.to_v.weight", + "transformer.layers.4.2.to_out.0.weight", + "transformer.layers.4.4.ff.0.0.weight", + "transformer.layers.4.4.ff.2.weight", + "transformer.layers.5.2.to_q.weight", + "transformer.layers.5.2.to_k.weight", + "transformer.layers.5.2.to_v.weight", + "transformer.layers.5.2.to_out.0.weight", + "transformer.layers.5.4.ff.0.0.weight", + "transformer.layers.5.4.ff.2.weight", + "transformer.layers.6.2.to_q.weight", + "transformer.layers.6.2.to_k.weight", + "transformer.layers.6.2.to_v.weight", + "transformer.layers.6.2.to_out.0.weight", + "transformer.layers.6.4.ff.0.0.weight", + "transformer.layers.6.4.ff.2.weight", + "transformer.layers.7.2.to_q.weight", + "transformer.layers.7.2.to_k.weight", + "transformer.layers.7.2.to_v.weight", + "transformer.layers.7.2.to_out.0.weight", + "transformer.layers.7.4.ff.0.0.weight", + "transformer.layers.7.4.ff.2.weight", + "transformer.layers.8.4.ff.0.0.weight", + "transformer.layers.8.4.ff.2.weight", + "transformer.layers.9.4.ff.0.0.weight", + "transformer.layers.9.4.ff.2.weight", + "transformer.layers.10.4.ff.0.0.weight", + "transformer.layers.10.4.ff.2.weight", + "transformer.layers.11.4.ff.0.0.weight", + "transformer.layers.11.4.ff.2.weight", + "transformer.layers.12.4.ff.0.0.weight", + "transformer.layers.12.4.ff.2.weight", + "transformer.layers.13.0.weight", + "transformer.layers.13.4.ff.0.0.weight", + "transformer.layers.13.4.ff.2.weight", + "transformer.layers.14.0.weight", + "transformer.layers.14.4.ff.0.0.weight", + "transformer.layers.14.4.ff.2.weight", + "transformer.layers.15.0.weight", + "transformer.layers.15.4.ff.0.0.weight", + "transformer.layers.15.4.ff.2.weight", + "transformer.layers.16.4.ff.0.0.weight", + "transformer.layers.16.4.ff.2.weight", + "transformer.layers.17.4.ff.0.0.weight", + "transformer.layers.17.4.ff.2.weight", + "transformer.layers.18.4.ff.0.0.weight", + "transformer.layers.18.4.ff.2.weight", + "transformer.layers.19.4.ff.0.0.weight", + "transformer.layers.19.4.ff.2.weight", + "transformer.layers.20.4.ff.0.0.weight", + "transformer.layers.20.4.ff.2.weight", + "transformer.layers.21.4.ff.0.0.weight", + "transformer.layers.21.4.ff.2.weight", + "transformer.layers.22.4.ff.0.0.weight", + "transformer.layers.22.4.ff.2.weight", + "transformer.layers.23.4.ff.0.0.weight", + "transformer.layers.23.4.ff.2.weight", + "transformer.layers.24.4.ff.0.0.weight", + "transformer.layers.24.4.ff.2.weight", + "transformer.layers.25.4.ff.0.0.weight", + "transformer.layers.25.4.ff.2.weight" + ], + "bottleneck_layers": [], + "recommendations": { + "focus_layers": [ + "transformer.time_embed.time_mlp.0.weight", + "transformer.time_embed.time_mlp.2.weight", + "transformer.text_embed.text_embed.weight", + "transformer.input_embed.proj.weight", + "transformer.input_embed.conv_pos_embed.conv1d.0.weight", + "transformer.input_embed.conv_pos_embed.conv1d.2.weight", + "transformer.layers.0.2.to_q.weight", + "transformer.layers.0.2.to_k.weight", + "transformer.layers.0.2.to_v.weight", + "transformer.layers.0.2.to_out.0.weight", + "transformer.layers.0.4.ff.0.0.weight", + "transformer.layers.0.4.ff.2.weight", + "transformer.layers.1.2.to_q.weight", + "transformer.layers.1.2.to_k.weight", + "transformer.layers.1.2.to_v.weight", + "transformer.layers.1.2.to_out.0.weight", + "transformer.layers.1.4.ff.0.0.weight", + "transformer.layers.1.4.ff.2.weight", + "transformer.layers.2.2.to_q.weight", + "transformer.layers.2.2.to_k.weight", + "transformer.layers.2.2.to_v.weight", + "transformer.layers.2.2.to_out.0.weight", + "transformer.layers.2.4.ff.0.0.weight", + "transformer.layers.2.4.ff.2.weight", + "transformer.layers.3.2.to_q.weight", + "transformer.layers.3.2.to_k.weight", + "transformer.layers.3.2.to_v.weight", + "transformer.layers.3.2.to_out.0.weight", + "transformer.layers.3.4.ff.0.0.weight", + "transformer.layers.3.4.ff.2.weight", + "transformer.layers.4.2.to_q.weight", + "transformer.layers.4.2.to_k.weight", + "transformer.layers.4.2.to_v.weight", + "transformer.layers.4.2.to_out.0.weight", + "transformer.layers.4.4.ff.0.0.weight", + "transformer.layers.4.4.ff.2.weight", + "transformer.layers.5.2.to_q.weight", + "transformer.layers.5.2.to_k.weight", + "transformer.layers.5.2.to_v.weight", + "transformer.layers.5.2.to_out.0.weight", + "transformer.layers.5.4.ff.0.0.weight", + "transformer.layers.5.4.ff.2.weight", + "transformer.layers.6.2.to_q.weight", + "transformer.layers.6.2.to_k.weight", + "transformer.layers.6.2.to_v.weight", + "transformer.layers.6.2.to_out.0.weight", + "transformer.layers.6.4.ff.0.0.weight", + "transformer.layers.6.4.ff.2.weight", + "transformer.layers.7.2.to_q.weight", + "transformer.layers.7.2.to_k.weight", + "transformer.layers.7.2.to_v.weight", + "transformer.layers.7.2.to_out.0.weight", + "transformer.layers.7.4.ff.0.0.weight", + "transformer.layers.7.4.ff.2.weight", + "transformer.layers.8.4.ff.0.0.weight", + "transformer.layers.8.4.ff.2.weight", + "transformer.layers.9.4.ff.0.0.weight", + "transformer.layers.9.4.ff.2.weight", + "transformer.layers.10.4.ff.0.0.weight", + "transformer.layers.10.4.ff.2.weight", + "transformer.layers.11.4.ff.0.0.weight", + "transformer.layers.11.4.ff.2.weight", + "transformer.layers.12.4.ff.0.0.weight", + "transformer.layers.12.4.ff.2.weight", + "transformer.layers.13.0.weight", + "transformer.layers.13.4.ff.0.0.weight", + "transformer.layers.13.4.ff.2.weight", + "transformer.layers.14.0.weight", + "transformer.layers.14.4.ff.0.0.weight", + "transformer.layers.14.4.ff.2.weight", + "transformer.layers.15.0.weight", + "transformer.layers.15.4.ff.0.0.weight", + "transformer.layers.15.4.ff.2.weight", + "transformer.layers.16.4.ff.0.0.weight", + "transformer.layers.16.4.ff.2.weight", + "transformer.layers.17.4.ff.0.0.weight", + "transformer.layers.17.4.ff.2.weight", + "transformer.layers.18.4.ff.0.0.weight", + "transformer.layers.18.4.ff.2.weight", + "transformer.layers.19.4.ff.0.0.weight", + "transformer.layers.19.4.ff.2.weight", + "transformer.layers.20.4.ff.0.0.weight", + "transformer.layers.20.4.ff.2.weight", + "transformer.layers.21.4.ff.0.0.weight", + "transformer.layers.21.4.ff.2.weight", + "transformer.layers.22.4.ff.0.0.weight", + "transformer.layers.22.4.ff.2.weight", + "transformer.layers.23.4.ff.0.0.weight", + "transformer.layers.23.4.ff.2.weight", + "transformer.layers.24.4.ff.0.0.weight", + "transformer.layers.24.4.ff.2.weight", + "transformer.layers.25.4.ff.0.0.weight", + "transformer.layers.25.4.ff.2.weight" + ] + }, + "total_parameters": 391, + "total_elements": 360755948, + "param_ranges": { + "transformer.time_embed.time_mlp.0.weight": { + "min": -0.4302421808242798, + "max": 0.29811733961105347, + "mean": -0.0025433888658881187, + "std": 0.04256260767579079, + "sparsity": 0.0, + "shape": [ + 1024, + 256 + ] + }, + "transformer.time_embed.time_mlp.0.bias": { + "min": -0.06305147707462311, + "max": 0.10753221064805984, + "mean": 0.0006371351191774011, + "std": 0.03406313806772232, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.time_embed.time_mlp.2.weight": { + "min": -0.4127056300640106, + "max": 0.8369134068489075, + "mean": -0.00020153506193310022, + "std": 0.024111680686473846, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.time_embed.time_mlp.2.bias": { + "min": -0.11529576778411865, + "max": 0.32162028551101685, + "mean": -0.0009410998900420964, + "std": 0.019562100991606712, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.text_embed.text_embed.weight": { + "min": -2.7922351360321045, + "max": 2.8709537982940674, + "mean": -0.0003647250996436924, + "std": 0.6154845356941223, + "sparsity": 0.0, + "shape": [ + 2546, + 100 + ] + }, + "transformer.input_embed.proj.weight": { + "min": -0.2792224586009979, + "max": 0.3816443681716919, + "mean": 0.0004239956906531006, + "std": 0.04274846613407135, + "sparsity": 0.0, + "shape": [ + 1024, + 300 + ] + }, + "transformer.input_embed.proj.bias": { + "min": -0.222523033618927, + "max": 0.20966869592666626, + "mean": -0.004486067220568657, + "std": 0.040918223559856415, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.input_embed.conv_pos_embed.conv1d.0.weight": { + "min": -0.42831921577453613, + "max": 0.4761074483394623, + "mean": 3.883292265527416e-06, + "std": 0.02451084926724434, + "sparsity": 0.0, + "shape": [ + 1024, + 64, + 31 + ] + }, + "transformer.input_embed.conv_pos_embed.conv1d.0.bias": { + "min": -0.32521355152130127, + "max": 0.15685473382472992, + "mean": -0.04670340567827225, + "std": 0.05158989131450653, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.input_embed.conv_pos_embed.conv1d.2.weight": { + "min": -0.41050970554351807, + "max": 0.3547350764274597, + "mean": -0.0001308345381403342, + "std": 0.023604650050401688, + "sparsity": 0.0, + "shape": [ + 1024, + 64, + 31 + ] + }, + "transformer.input_embed.conv_pos_embed.conv1d.2.bias": { + "min": -0.22980044782161713, + "max": 0.26265424489974976, + "mean": -0.02913527563214302, + "std": 0.04935712739825249, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.0.1.g": { + "min": 0.25461670756340027, + "max": 0.8201668858528137, + "mean": 0.5254921317100525, + "std": 0.08082755655050278, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.0.2.to_q.weight": { + "min": -0.29707157611846924, + "max": 0.26584771275520325, + "mean": -0.0004257621185388416, + "std": 0.032102566212415695, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.0.2.to_q.bias": { + "min": -0.09281580150127411, + "max": 0.12489211559295654, + "mean": 0.0006475000409409404, + "std": 0.025739654898643494, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.0.2.to_k.weight": { + "min": -0.290749192237854, + "max": 0.2813739478588104, + "mean": -7.507578993681818e-05, + "std": 0.030931759625673294, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.0.2.to_k.bias": { + "min": -5.900395393371582, + "max": 5.815171718597412, + "mean": -0.009333068504929543, + "std": 1.295695185661316, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.0.2.to_v.weight": { + "min": -0.4251696765422821, + "max": 0.3438807427883148, + "mean": 9.805745503399521e-05, + "std": 0.029953517019748688, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.0.2.to_v.bias": { + "min": -0.029049167409539223, + "max": 0.027643660083413124, + "mean": -0.00032356681185774505, + "std": 0.012573834508657455, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.0.2.to_out.0.weight": { + "min": -0.4541175961494446, + "max": 0.4482012987136841, + "mean": 2.389368455624208e-05, + "std": 0.023853901773691177, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.0.2.to_out.0.bias": { + "min": -0.08873512595891953, + "max": 0.09103881567716599, + "mean": 0.0022877324372529984, + "std": 0.019517814740538597, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.0.3.g": { + "min": 0.2668094336986542, + "max": 1.0562759637832642, + "mean": 0.5312086343765259, + "std": 0.10443899780511856, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.0.4.ff.0.0.weight": { + "min": -0.5745095610618591, + "max": 0.6083298325538635, + "mean": -0.0004305951879359782, + "std": 0.038600798696279526, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.0.4.ff.0.0.bias": { + "min": -0.1827721893787384, + "max": 0.04561286419630051, + "mean": -0.029457518830895424, + "std": 0.042618319392204285, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.0.4.ff.2.weight": { + "min": -1.167069435119629, + "max": 1.6338956356048584, + "mean": 0.0003232666349504143, + "std": 0.02769671194255352, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.0.4.ff.2.bias": { + "min": -0.1623232066631317, + "max": 0.20567050576210022, + "mean": -0.021127892658114433, + "std": 0.027942020446062088, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.1.1.g": { + "min": 0.22407177090644836, + "max": 0.843936026096344, + "mean": 0.4876656234264374, + "std": 0.07522594183683395, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.1.2.to_q.weight": { + "min": -0.2555537223815918, + "max": 0.3058427572250366, + "mean": -6.734902854077518e-06, + "std": 0.033475104719400406, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.1.2.to_q.bias": { + "min": -0.09539701044559479, + "max": 0.11051826924085617, + "mean": 6.649381248280406e-05, + "std": 0.026965470984578133, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.1.2.to_k.weight": { + "min": -0.2971154749393463, + "max": 0.2961341142654419, + "mean": 5.3386003855848685e-05, + "std": 0.03254621848464012, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.1.2.to_k.bias": { + "min": -5.165225028991699, + "max": 5.085448741912842, + "mean": -0.014597215689718723, + "std": 1.1575956344604492, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.1.2.to_v.weight": { + "min": -0.3449501693248749, + "max": 0.3433416187763214, + "mean": 7.857720629544929e-05, + "std": 0.030061962082982063, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.1.2.to_v.bias": { + "min": -0.03606901317834854, + "max": 0.033370036631822586, + "mean": -0.0001412129495292902, + "std": 0.01303885132074356, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.1.2.to_out.0.weight": { + "min": -0.3154986798763275, + "max": 0.37501832842826843, + "mean": -2.0688352378783748e-05, + "std": 0.024059457704424858, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.1.2.to_out.0.bias": { + "min": -0.1054358258843422, + "max": 0.12218254804611206, + "mean": -0.001968180760741234, + "std": 0.02885930798947811, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.1.3.g": { + "min": 0.3115288317203522, + "max": 1.1208443641662598, + "mean": 0.6663118004798889, + "std": 0.09773967415094376, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.1.4.ff.0.0.weight": { + "min": -0.8727405071258545, + "max": 0.6275568604469299, + "mean": 0.001675269566476345, + "std": 0.04743880778551102, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.1.4.ff.0.0.bias": { + "min": -0.27153271436691284, + "max": 0.034265656024217606, + "mean": -0.04660956189036369, + "std": 0.04060109704732895, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.1.4.ff.2.weight": { + "min": -0.9227067232131958, + "max": 0.9646649360656738, + "mean": 0.0010214094072580338, + "std": 0.04070667922496796, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.1.4.ff.2.bias": { + "min": -0.14452314376831055, + "max": 0.0749678835272789, + "mean": -0.009091369807720184, + "std": 0.025692423805594444, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.2.1.g": { + "min": 0.2401818335056305, + "max": 0.7130386829376221, + "mean": 0.4472571313381195, + "std": 0.05933048576116562, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.2.2.to_q.weight": { + "min": -0.27240708470344543, + "max": 0.2978667914867401, + "mean": 9.335752110928297e-06, + "std": 0.03546963632106781, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.2.2.to_q.bias": { + "min": -0.11937365680932999, + "max": 0.11856595426797867, + "mean": 0.0007609212771058083, + "std": 0.027630653232336044, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.2.2.to_k.weight": { + "min": -0.2809975743293762, + "max": 0.2798910439014435, + "mean": -7.717408880125731e-05, + "std": 0.03509914502501488, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.2.2.to_k.bias": { + "min": -2.5100622177124023, + "max": 2.5220582485198975, + "mean": 0.026752006262540817, + "std": 0.5868890285491943, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.2.2.to_v.weight": { + "min": -0.2211453914642334, + "max": 0.2715946435928345, + "mean": 2.9373950383160263e-06, + "std": 0.030732743442058563, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.2.2.to_v.bias": { + "min": -0.03357521444559097, + "max": 0.031258679926395416, + "mean": 0.00011264161730650812, + "std": 0.012410733848810196, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.2.2.to_out.0.weight": { + "min": -0.235328808426857, + "max": 0.23169946670532227, + "mean": 5.690910984412767e-05, + "std": 0.025696253404021263, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.2.2.to_out.0.bias": { + "min": -0.1358632743358612, + "max": 0.1274021714925766, + "mean": -0.005497328005731106, + "std": 0.03996951878070831, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.2.3.g": { + "min": 0.3545131981372833, + "max": 1.172075629234314, + "mean": 0.7106390595436096, + "std": 0.10376753658056259, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.2.4.ff.0.0.weight": { + "min": -0.6174606084823608, + "max": 0.5543855428695679, + "mean": 0.0011602300219237804, + "std": 0.04611969366669655, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.2.4.ff.0.0.bias": { + "min": -0.18855658173561096, + "max": 0.024964194744825363, + "mean": -0.034842122346162796, + "std": 0.02861381322145462, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.2.4.ff.2.weight": { + "min": -1.1317338943481445, + "max": 0.9715229272842407, + "mean": 0.00035948510048910975, + "std": 0.04234746843576431, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.2.4.ff.2.bias": { + "min": -0.5981062650680542, + "max": 0.06280992925167084, + "mean": -0.004879314452409744, + "std": 0.028617065399885178, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.3.1.g": { + "min": 0.3752063512802124, + "max": 0.940569281578064, + "mean": 0.5925507545471191, + "std": 0.06694991141557693, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.3.2.to_q.weight": { + "min": -0.39141029119491577, + "max": 0.3690900504589081, + "mean": 7.122607348719612e-05, + "std": 0.03718871995806694, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.3.2.to_q.bias": { + "min": -0.11894620954990387, + "max": 0.13650599122047424, + "mean": 0.0009305156418122351, + "std": 0.029250090941786766, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.3.2.to_k.weight": { + "min": -0.6192548274993896, + "max": 0.5089151263237, + "mean": 1.523251921753399e-05, + "std": 0.03644222766160965, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.3.2.to_k.bias": { + "min": -8.188663482666016, + "max": 8.790773391723633, + "mean": -0.10929425060749054, + "std": 1.6991606950759888, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.3.2.to_v.weight": { + "min": -0.27665913105010986, + "max": 0.23989883065223694, + "mean": 5.3170409955782816e-05, + "std": 0.03261546045541763, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.3.2.to_v.bias": { + "min": -0.05207620561122894, + "max": 0.039528362452983856, + "mean": 9.136732842307538e-05, + "std": 0.012959755957126617, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.3.2.to_out.0.weight": { + "min": -0.23080551624298096, + "max": 0.23467440903186798, + "mean": -2.1718551579397172e-05, + "std": 0.0293918177485466, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.3.2.to_out.0.bias": { + "min": -0.20433980226516724, + "max": 0.10561156272888184, + "mean": -0.0040257819928228855, + "std": 0.03262433037161827, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.3.3.g": { + "min": 0.3398168385028839, + "max": 1.0127116441726685, + "mean": 0.7008739709854126, + "std": 0.09675976634025574, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.3.4.ff.0.0.weight": { + "min": -0.5649558305740356, + "max": 0.8329834342002869, + "mean": 0.00041514058830216527, + "std": 0.04230239987373352, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.3.4.ff.0.0.bias": { + "min": -0.21168796718120575, + "max": 0.030586589127779007, + "mean": -0.03219006583094597, + "std": 0.02651149593293667, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.3.4.ff.2.weight": { + "min": -0.7545908689498901, + "max": 0.7186294794082642, + "mean": -9.42062251851894e-06, + "std": 0.036842189729213715, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.3.4.ff.2.bias": { + "min": -0.26354482769966125, + "max": 0.10587722808122635, + "mean": -0.0030317441560328007, + "std": 0.028866499662399292, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.4.1.g": { + "min": 0.28444212675094604, + "max": 0.695132315158844, + "mean": 0.49955570697784424, + "std": 0.04653683677315712, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.4.2.to_q.weight": { + "min": -0.27924680709838867, + "max": 0.2342948317527771, + "mean": -0.00011125784658361226, + "std": 0.03876316547393799, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.4.2.to_q.bias": { + "min": -0.1545136421918869, + "max": 0.12684346735477448, + "mean": -0.002232692204415798, + "std": 0.03341302275657654, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.4.2.to_k.weight": { + "min": -0.41413962841033936, + "max": 0.6599588990211487, + "mean": -1.9788125428021885e-05, + "std": 0.03910021111369133, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.4.2.to_k.bias": { + "min": -4.238841533660889, + "max": 4.723404884338379, + "mean": -0.02046296000480652, + "std": 1.0078750848770142, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.4.2.to_v.weight": { + "min": -0.245038241147995, + "max": 0.20766045153141022, + "mean": 4.384694329928607e-05, + "std": 0.03396622836589813, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.4.2.to_v.bias": { + "min": -0.034554872661828995, + "max": 0.04480086266994476, + "mean": -1.7740559997037053e-05, + "std": 0.012627062387764454, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.4.2.to_out.0.weight": { + "min": -0.20076899230480194, + "max": 0.20593972504138947, + "mean": -2.9633309168275446e-05, + "std": 0.031023768708109856, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.4.2.to_out.0.bias": { + "min": -0.1999690979719162, + "max": 0.11344368755817413, + "mean": -0.0029194147791713476, + "std": 0.034512441605329514, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.4.3.g": { + "min": 0.3670799434185028, + "max": 1.056976079940796, + "mean": 0.67062908411026, + "std": 0.06638980656862259, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.4.4.ff.0.0.weight": { + "min": -0.39831405878067017, + "max": 0.5025192499160767, + "mean": -3.858314084936865e-05, + "std": 0.04113723710179329, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.4.4.ff.0.0.bias": { + "min": -0.12804804742336273, + "max": 0.026756688952445984, + "mean": -0.030546799302101135, + "std": 0.021871846169233322, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.4.4.ff.2.weight": { + "min": -0.4490903913974762, + "max": 0.4329609274864197, + "mean": 8.376075129490346e-05, + "std": 0.034896120429039, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.4.4.ff.2.bias": { + "min": -0.26764214038848877, + "max": 0.07259879261255264, + "mean": -0.0011110607301816344, + "std": 0.023125821724534035, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.5.1.g": { + "min": 0.28748002648353577, + "max": 0.68532794713974, + "mean": 0.5245869159698486, + "std": 0.047536663711071014, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.5.2.to_q.weight": { + "min": -0.22228431701660156, + "max": 0.22351308166980743, + "mean": 1.5719435396022163e-05, + "std": 0.03895285725593567, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.5.2.to_q.bias": { + "min": -0.13649071753025055, + "max": 0.10923465341329575, + "mean": 0.00023689989757258445, + "std": 0.029244115576148033, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.5.2.to_k.weight": { + "min": -0.37521880865097046, + "max": 0.43729540705680847, + "mean": -9.554900316288695e-06, + "std": 0.03928901627659798, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.5.2.to_k.bias": { + "min": -3.8464367389678955, + "max": 5.000250816345215, + "mean": 0.009746391326189041, + "std": 0.8453746438026428, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.5.2.to_v.weight": { + "min": -0.22334088385105133, + "max": 0.22010144591331482, + "mean": -2.237738954136148e-07, + "std": 0.03441348671913147, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.5.2.to_v.bias": { + "min": -0.04365158826112747, + "max": 0.035844866186380386, + "mean": -0.00025856425054371357, + "std": 0.012080752290785313, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.5.2.to_out.0.weight": { + "min": -0.21325859427452087, + "max": 0.1888350248336792, + "mean": -1.6756794138927944e-05, + "std": 0.03154024854302406, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.5.2.to_out.0.bias": { + "min": -0.1807885617017746, + "max": 0.1208307296037674, + "mean": -0.0024116605054587126, + "std": 0.04126964509487152, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.5.3.g": { + "min": 0.4224590063095093, + "max": 0.9420249462127686, + "mean": 0.6628004908561707, + "std": 0.05680832266807556, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.5.4.ff.0.0.weight": { + "min": -0.371380478143692, + "max": 0.4757322669029236, + "mean": -8.227255602832884e-05, + "std": 0.040896233171224594, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.5.4.ff.0.0.bias": { + "min": -0.20817440748214722, + "max": 0.027128340676426888, + "mean": -0.03024515137076378, + "std": 0.021346455439925194, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.5.4.ff.2.weight": { + "min": -0.34020015597343445, + "max": 0.7336611747741699, + "mean": 8.482092380290851e-05, + "std": 0.03477148711681366, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.5.4.ff.2.bias": { + "min": -0.2402409464120865, + "max": 0.05044962465763092, + "mean": -0.0011967722093686461, + "std": 0.020463695749640465, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.6.1.g": { + "min": 0.3060604929924011, + "max": 0.6536474823951721, + "mean": 0.525157630443573, + "std": 0.04612673819065094, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.6.2.to_q.weight": { + "min": -0.30424413084983826, + "max": 0.2173623889684677, + "mean": 6.994098657742143e-05, + "std": 0.03949854522943497, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.6.2.to_q.bias": { + "min": -0.14945130050182343, + "max": 0.13143886625766754, + "mean": 0.00034817858249880373, + "std": 0.030476493760943413, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.6.2.to_k.weight": { + "min": -0.2574465572834015, + "max": 0.20223106443881989, + "mean": 3.098994420724921e-05, + "std": 0.03948768228292465, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.6.2.to_k.bias": { + "min": -2.336733102798462, + "max": 2.376356840133667, + "mean": -0.02624763362109661, + "std": 0.44985321164131165, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.6.2.to_v.weight": { + "min": -0.18909630179405212, + "max": 0.21054214239120483, + "mean": 3.723270128830336e-05, + "std": 0.034798216074705124, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.6.2.to_v.bias": { + "min": -0.03172660619020462, + "max": 0.03550007939338684, + "mean": -0.00020049612794537097, + "std": 0.012289649806916714, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.6.2.to_out.0.weight": { + "min": -0.1884716898202896, + "max": 0.17050357162952423, + "mean": -6.797749665565789e-05, + "std": 0.03217477723956108, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.6.2.to_out.0.bias": { + "min": -0.1394048035144806, + "max": 0.13731525838375092, + "mean": -0.0025170280132442713, + "std": 0.05131148546934128, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.6.3.g": { + "min": 0.46712788939476013, + "max": 0.9565918445587158, + "mean": 0.6689888834953308, + "std": 0.052790068089962006, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.6.4.ff.0.0.weight": { + "min": -0.32436564564704895, + "max": 0.3097445070743561, + "mean": -1.5296809579012915e-06, + "std": 0.04095211252570152, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.6.4.ff.0.0.bias": { + "min": -0.12481985241174698, + "max": 0.02530287392437458, + "mean": -0.030714336782693863, + "std": 0.019815392792224884, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.6.4.ff.2.weight": { + "min": -0.44007495045661926, + "max": 0.44524118304252625, + "mean": 9.531660907668993e-05, + "std": 0.03512417897582054, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.6.4.ff.2.bias": { + "min": -0.22461570799350739, + "max": 0.05165664851665497, + "mean": -0.0011837758356705308, + "std": 0.018468836322426796, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.7.1.g": { + "min": 0.33936041593551636, + "max": 0.7393229007720947, + "mean": 0.5587522983551025, + "std": 0.04140261933207512, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.7.2.to_q.weight": { + "min": -0.27253732085227966, + "max": 0.2784145176410675, + "mean": 1.9914490621886216e-05, + "std": 0.041062433272600174, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.7.2.to_q.bias": { + "min": -0.13705013692378998, + "max": 0.13989973068237305, + "mean": 0.0004888542462140322, + "std": 0.02663799747824669, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.7.2.to_k.weight": { + "min": -0.49079182744026184, + "max": 0.35604262351989746, + "mean": 8.881442772690207e-05, + "std": 0.04070043936371803, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.7.2.to_k.bias": { + "min": -2.2975404262542725, + "max": 1.7454535961151123, + "mean": -0.021080955862998962, + "std": 0.5002180933952332, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.7.2.to_v.weight": { + "min": -0.21756696701049805, + "max": 0.19789846241474152, + "mean": -4.058882768731564e-05, + "std": 0.03423743322491646, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.7.2.to_v.bias": { + "min": -0.04133187234401703, + "max": 0.03867634758353233, + "mean": -0.00014505762374028563, + "std": 0.012880876660346985, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.7.2.to_out.0.weight": { + "min": -0.1775415539741516, + "max": 0.18375425040721893, + "mean": 4.7608955355826765e-05, + "std": 0.03156036138534546, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.7.2.to_out.0.bias": { + "min": -0.17991603910923004, + "max": 0.18388697504997253, + "mean": -0.0022191007155925035, + "std": 0.05484011396765709, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.7.3.g": { + "min": 0.4743064045906067, + "max": 1.0255905389785767, + "mean": 0.6453731656074524, + "std": 0.050350919365882874, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.7.4.ff.0.0.weight": { + "min": -0.2718814015388489, + "max": 0.30937331914901733, + "mean": 0.00011242884647799656, + "std": 0.04068846255540848, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.7.4.ff.0.0.bias": { + "min": -0.1058216467499733, + "max": 0.026849187910556793, + "mean": -0.029516499489545822, + "std": 0.01792926900088787, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.7.4.ff.2.weight": { + "min": -0.33906012773513794, + "max": 0.3292734920978546, + "mean": 5.717227759305388e-05, + "std": 0.034418120980262756, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.7.4.ff.2.bias": { + "min": -0.18169447779655457, + "max": 0.04204929992556572, + "mean": -0.0010728895431384444, + "std": 0.01721538044512272, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.8.1.g": { + "min": 0.32545599341392517, + "max": 0.686664342880249, + "mean": 0.5112766027450562, + "std": 0.036954235285520554, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.8.2.to_q.weight": { + "min": -0.23384402692317963, + "max": 0.22573164105415344, + "mean": -3.598508192226291e-05, + "std": 0.0391816683113575, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.8.2.to_q.bias": { + "min": -0.11534573137760162, + "max": 0.13162653148174286, + "mean": 0.0001513269089628011, + "std": 0.029193254187703133, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.8.2.to_k.weight": { + "min": -0.3528960049152374, + "max": 0.285469651222229, + "mean": 7.2757711677695625e-06, + "std": 0.03925016149878502, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.8.2.to_k.bias": { + "min": -4.133274078369141, + "max": 3.544353723526001, + "mean": -0.011593645438551903, + "std": 0.6827419400215149, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.8.2.to_v.weight": { + "min": -0.21140114963054657, + "max": 0.20909518003463745, + "mean": 3.4737786336336285e-05, + "std": 0.0344894602894783, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.8.2.to_v.bias": { + "min": -0.035711076110601425, + "max": 0.048078615218400955, + "mean": 0.0007944396347738802, + "std": 0.01285555586218834, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.8.2.to_out.0.weight": { + "min": -0.21062366664409637, + "max": 0.193213552236557, + "mean": -1.284678091906244e-06, + "std": 0.031699951738119125, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.8.2.to_out.0.bias": { + "min": -0.18667221069335938, + "max": 0.17721369862556458, + "mean": -0.002848550211638212, + "std": 0.058637380599975586, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.8.3.g": { + "min": 0.4746397137641907, + "max": 1.041860818862915, + "mean": 0.651482880115509, + "std": 0.049657855182886124, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.8.4.ff.0.0.weight": { + "min": -0.24850571155548096, + "max": 0.32913738489151, + "mean": 0.00018063749303109944, + "std": 0.04057687148451805, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.8.4.ff.0.0.bias": { + "min": -0.12447232753038406, + "max": 0.024594351649284363, + "mean": -0.030502719804644585, + "std": 0.01760093867778778, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.8.4.ff.2.weight": { + "min": -0.4212387502193451, + "max": 0.482032835483551, + "mean": 2.141768618457718e-06, + "std": 0.03540309891104698, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.8.4.ff.2.bias": { + "min": -0.15185940265655518, + "max": 0.04337269812822342, + "mean": 3.945987918996252e-05, + "std": 0.014877513982355595, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.9.1.g": { + "min": 0.31561797857284546, + "max": 0.682021975517273, + "mean": 0.5529669523239136, + "std": 0.04071478173136711, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.9.2.to_q.weight": { + "min": -0.20643697679042816, + "max": 0.21993368864059448, + "mean": 3.0923340091248974e-05, + "std": 0.03830339014530182, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.9.2.to_q.bias": { + "min": -0.1378619521856308, + "max": 0.112775057554245, + "mean": 2.049036993412301e-05, + "std": 0.02582140639424324, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.9.2.to_k.weight": { + "min": -0.40277066826820374, + "max": 0.3711613118648529, + "mean": 2.6232244636048563e-05, + "std": 0.038185227662324905, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.9.2.to_k.bias": { + "min": -3.7714638710021973, + "max": 2.8691656589508057, + "mean": 0.0011571794748306274, + "std": 0.516919732093811, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.9.2.to_v.weight": { + "min": -0.20294280350208282, + "max": 0.1974332332611084, + "mean": 2.9497665309463628e-05, + "std": 0.03430052474141121, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.9.2.to_v.bias": { + "min": -0.050981007516384125, + "max": 0.04004063457250595, + "mean": -0.0004196166410110891, + "std": 0.013425874523818493, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.9.2.to_out.0.weight": { + "min": -0.19651710987091064, + "max": 0.2017611861228943, + "mean": -1.2331822290434502e-05, + "std": 0.031808242201805115, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.9.2.to_out.0.bias": { + "min": -0.19314663112163544, + "max": 0.19513675570487976, + "mean": -0.0029698254074901342, + "std": 0.06256996840238571, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.9.3.g": { + "min": 0.3494587540626526, + "max": 1.0840725898742676, + "mean": 0.6672499775886536, + "std": 0.05523226782679558, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.9.4.ff.0.0.weight": { + "min": -0.22516681253910065, + "max": 0.2514885663986206, + "mean": 0.00035906361881643534, + "std": 0.040765900164842606, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.9.4.ff.0.0.bias": { + "min": -0.09103509038686752, + "max": 0.04371785372495651, + "mean": -0.030089743435382843, + "std": 0.017607875168323517, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.9.4.ff.2.weight": { + "min": -0.35351207852363586, + "max": 0.30409130454063416, + "mean": -4.350150265963748e-05, + "std": 0.03712816908955574, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.9.4.ff.2.bias": { + "min": -0.16206228733062744, + "max": 0.06353683769702911, + "mean": -8.305630763061345e-05, + "std": 0.019406888633966446, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.10.1.g": { + "min": 0.348787397146225, + "max": 0.722071647644043, + "mean": 0.5424383878707886, + "std": 0.039067838340997696, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.10.2.to_q.weight": { + "min": -0.21942198276519775, + "max": 0.22312530875205994, + "mean": -1.1118878319393843e-05, + "std": 0.03923613205552101, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.10.2.to_q.bias": { + "min": -0.11845415830612183, + "max": 0.1708553582429886, + "mean": 0.0002840349334292114, + "std": 0.025122985243797302, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.10.2.to_k.weight": { + "min": -0.24687451124191284, + "max": 0.301123708486557, + "mean": -3.652745726867579e-05, + "std": 0.038935691118240356, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.10.2.to_k.bias": { + "min": -3.5055902004241943, + "max": 3.715036153793335, + "mean": 0.01585160195827484, + "std": 0.7825287580490112, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.10.2.to_v.weight": { + "min": -0.2186805158853531, + "max": 0.23763009905815125, + "mean": -1.3581981875177007e-05, + "std": 0.036307912319898605, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.10.2.to_v.bias": { + "min": -0.047199200838804245, + "max": 0.05141306668519974, + "mean": 0.0004809980746358633, + "std": 0.013516527600586414, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.10.2.to_out.0.weight": { + "min": -0.21401917934417725, + "max": 0.21761927008628845, + "mean": 5.652284016832709e-05, + "std": 0.03361988440155983, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.10.2.to_out.0.bias": { + "min": -0.21142059564590454, + "max": 0.23152688145637512, + "mean": -0.005106795579195023, + "std": 0.061881836503744125, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.10.3.g": { + "min": 0.36215895414352417, + "max": 1.1013121604919434, + "mean": 0.6993671655654907, + "std": 0.05360371619462967, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.10.4.ff.0.0.weight": { + "min": -0.2346053123474121, + "max": 0.24489951133728027, + "mean": 0.000463481672341004, + "std": 0.0412747748196125, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.10.4.ff.0.0.bias": { + "min": -0.09809241443872452, + "max": 0.06830352544784546, + "mean": -0.031439535319805145, + "std": 0.01812061481177807, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.10.4.ff.2.weight": { + "min": -0.3016868829727173, + "max": 0.35154613852500916, + "mean": -8.162677113432437e-05, + "std": 0.040280576795339584, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.10.4.ff.2.bias": { + "min": -0.15234576165676117, + "max": 0.14968463778495789, + "mean": 0.00025512842694297433, + "std": 0.023036863654851913, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.11.1.g": { + "min": 0.99940425157547, + "max": 1.0017729997634888, + "mean": 1.0002546310424805, + "std": 0.0006659556529484689, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.11.2.to_q.weight": { + "min": -0.03126639127731323, + "max": 0.03126263990998268, + "mean": -1.9294351659482345e-05, + "std": 0.018044061958789825, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.11.2.to_q.bias": { + "min": -0.031232889741659164, + "max": 0.03099249303340912, + "mean": -0.001084338640794158, + "std": 0.017953665927052498, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.11.2.to_k.weight": { + "min": -0.031263306736946106, + "max": 0.031267084181308746, + "mean": 3.548892891558353e-06, + "std": 0.018044468015432358, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.11.2.to_k.bias": { + "min": -0.03115880861878395, + "max": 0.031179169192910194, + "mean": 0.0003339822869747877, + "std": 0.018065886572003365, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.11.2.to_v.weight": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.11.2.to_v.bias": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.11.2.to_out.0.weight": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.11.2.to_out.0.bias": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.11.3.g": { + "min": 0.9994449615478516, + "max": 1.0018095970153809, + "mean": 1.0002632141113281, + "std": 0.0006522060139104724, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.11.4.ff.0.0.weight": { + "min": -0.03126963973045349, + "max": 0.03127080947160721, + "mean": -8.397149031225126e-06, + "std": 0.01804318279027939, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.11.4.ff.0.0.bias": { + "min": -0.0312512069940567, + "max": 0.031249327585101128, + "mean": 0.0001536280324216932, + "std": 0.01799430511891842, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.11.4.ff.2.weight": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.11.4.ff.2.bias": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.12.1.g": { + "min": 0.38297948241233826, + "max": 0.7195636034011841, + "mean": 0.5807591080665588, + "std": 0.03886506333947182, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.12.2.to_q.weight": { + "min": -0.2380739152431488, + "max": 0.19658486545085907, + "mean": 2.6584548322716728e-05, + "std": 0.03746968135237694, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.12.2.to_q.bias": { + "min": -0.11867813766002655, + "max": 0.16608171164989471, + "mean": 0.0009910191874951124, + "std": 0.02755763940513134, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.12.2.to_k.weight": { + "min": -0.2461400330066681, + "max": 0.5007420182228088, + "mean": -5.0447401008568704e-05, + "std": 0.03762757405638695, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.12.2.to_k.bias": { + "min": -3.9424328804016113, + "max": 3.7695746421813965, + "mean": -0.0035724048502743244, + "std": 0.681464672088623, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.12.2.to_v.weight": { + "min": -0.22735856473445892, + "max": 0.2514454424381256, + "mean": -1.1598500350373797e-05, + "std": 0.03743908926844597, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.12.2.to_v.bias": { + "min": -0.07171762734651566, + "max": 0.08069814741611481, + "mean": -0.0005200206069275737, + "std": 0.015662606805562973, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.12.2.to_out.0.weight": { + "min": -0.2281658798456192, + "max": 0.2580048441886902, + "mean": -2.8616894269362092e-05, + "std": 0.03542575612664223, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.12.2.to_out.0.bias": { + "min": -0.20041774213314056, + "max": 0.2152491807937622, + "mean": -0.005537157878279686, + "std": 0.06833865493535995, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.12.3.g": { + "min": 0.4051746428012848, + "max": 1.1894384622573853, + "mean": 0.7380443215370178, + "std": 0.05523209273815155, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.12.4.ff.0.0.weight": { + "min": -0.2211739420890808, + "max": 0.2460654377937317, + "mean": 0.0005211163079366088, + "std": 0.04134252667427063, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.12.4.ff.0.0.bias": { + "min": -0.10346150398254395, + "max": 0.024183176457881927, + "mean": -0.03266960382461548, + "std": 0.018883610144257545, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.12.4.ff.2.weight": { + "min": -0.4493599832057953, + "max": 0.42234691977500916, + "mean": -0.0004324695619288832, + "std": 0.046903885900974274, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.12.4.ff.2.bias": { + "min": -0.25148940086364746, + "max": 0.47015321254730225, + "mean": 0.0031974762678146362, + "std": 0.044545728713274, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.13.0.weight": { + "min": -0.3170487582683563, + "max": 0.33324581384658813, + "mean": -2.528912045818288e-05, + "std": 0.0212908573448658, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.13.1.g": { + "min": 0.3246053457260132, + "max": 0.6854332685470581, + "mean": 0.5710639357566833, + "std": 0.04471997916698456, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.13.2.to_q.weight": { + "min": -0.16466176509857178, + "max": 0.1740393489599228, + "mean": -4.8587571654934436e-05, + "std": 0.03318466991186142, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.13.2.to_q.bias": { + "min": -0.18687166273593903, + "max": 0.14292190968990326, + "mean": 3.81053687306121e-05, + "std": 0.029696526005864143, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.13.2.to_k.weight": { + "min": -0.38059744238853455, + "max": 0.24608764052391052, + "mean": -9.966568541130982e-06, + "std": 0.032765913754701614, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.13.2.to_k.bias": { + "min": -3.65606689453125, + "max": 3.290353775024414, + "mean": -0.014253877103328705, + "std": 0.9852582216262817, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.13.2.to_v.weight": { + "min": -0.23507949709892273, + "max": 0.2475711703300476, + "mean": -1.77873171196552e-05, + "std": 0.041702862828969955, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.13.2.to_v.bias": { + "min": -0.07279693335294724, + "max": 0.15454502403736115, + "mean": 0.000664762279484421, + "std": 0.025170980021357536, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.13.2.to_out.0.weight": { + "min": -0.2665710747241974, + "max": 0.24850338697433472, + "mean": -1.535093724669423e-05, + "std": 0.04014323651790619, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.13.2.to_out.0.bias": { + "min": -0.18962323665618896, + "max": 0.19475142657756805, + "mean": -0.0012306260177865624, + "std": 0.06669402867555618, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.13.3.g": { + "min": 0.3292522728443146, + "max": 0.999567985534668, + "mean": 0.7192600965499878, + "std": 0.052342262119054794, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.13.4.ff.0.0.weight": { + "min": -0.23160076141357422, + "max": 0.2457643449306488, + "mean": 0.00018272445595357567, + "std": 0.04090625420212746, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.13.4.ff.0.0.bias": { + "min": -0.11420896649360657, + "max": 0.018650474026799202, + "mean": -0.042482297867536545, + "std": 0.018855074420571327, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.13.4.ff.2.weight": { + "min": -0.3899804949760437, + "max": 0.40730100870132446, + "mean": -2.1874793674214743e-05, + "std": 0.04854067414999008, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.13.4.ff.2.bias": { + "min": -0.6932750344276428, + "max": 0.41266557574272156, + "mean": 0.0008518121903762221, + "std": 0.060295384377241135, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.14.0.weight": { + "min": 0.0, + "max": 0.999998927116394, + "mean": 0.00048828075523488224, + "std": 0.02209167368710041, + "sparsity": 0.99951171875, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.14.1.g": { + "min": 0.9994292855262756, + "max": 1.0017839670181274, + "mean": 1.000253677368164, + "std": 0.000652652932330966, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.14.2.to_q.weight": { + "min": -0.03126111254096031, + "max": 0.0312650129199028, + "mean": -2.1023370209150016e-05, + "std": 0.0180354006588459, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.14.2.to_q.bias": { + "min": -0.031219881027936935, + "max": 0.031236713752150536, + "mean": -0.0006771213375031948, + "std": 0.017829909920692444, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.14.2.to_k.weight": { + "min": -0.03126417100429535, + "max": 0.03126959502696991, + "mean": -8.832794264890254e-06, + "std": 0.018034426495432854, + "sparsity": 9.5367431640625e-07, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.14.2.to_k.bias": { + "min": -0.03123662993311882, + "max": 0.03124932385981083, + "mean": -0.0007298794225789607, + "std": 0.01794484816491604, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.14.2.to_v.weight": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.14.2.to_v.bias": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.14.2.to_out.0.weight": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.14.2.to_out.0.bias": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.14.3.g": { + "min": 0.9993973970413208, + "max": 1.0017794370651245, + "mean": 1.00028395652771, + "std": 0.0006690355949103832, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.14.4.ff.0.0.weight": { + "min": -0.03126800060272217, + "max": 0.031265586614608765, + "mean": 3.591585482354276e-06, + "std": 0.018040791153907776, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.14.4.ff.0.0.bias": { + "min": -0.031230367720127106, + "max": 0.03125299513339996, + "mean": 0.00019574598991312087, + "std": 0.018076494336128235, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.14.4.ff.2.weight": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.14.4.ff.2.bias": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.15.0.weight": { + "min": -0.23457114398479462, + "max": 0.2725405693054199, + "mean": 6.967699391680071e-06, + "std": 0.01881221868097782, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.15.1.g": { + "min": 0.3212726414203644, + "max": 0.6936339139938354, + "mean": 0.5816882848739624, + "std": 0.04593805596232414, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.15.2.to_q.weight": { + "min": -0.18182046711444855, + "max": 0.1976739764213562, + "mean": -1.1725308468157891e-05, + "std": 0.033187251538038254, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.15.2.to_q.bias": { + "min": -0.1606890708208084, + "max": 0.12948599457740784, + "mean": -0.001067878445610404, + "std": 0.034144606441259384, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.15.2.to_k.weight": { + "min": -0.332189679145813, + "max": 0.31144458055496216, + "mean": -1.0352114259148948e-05, + "std": 0.03223797678947449, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.15.2.to_k.bias": { + "min": -7.803721904754639, + "max": 8.76359748840332, + "mean": 0.09347224235534668, + "std": 1.6197657585144043, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.15.2.to_v.weight": { + "min": -0.23381681740283966, + "max": 0.2420002520084381, + "mean": 4.138463191338815e-05, + "std": 0.04086202755570412, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.15.2.to_v.bias": { + "min": -0.07600986212491989, + "max": 0.06578930467367172, + "mean": 0.00047852861462160945, + "std": 0.019416049122810364, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.15.2.to_out.0.weight": { + "min": -0.24590720236301422, + "max": 0.23409155011177063, + "mean": -2.9138864192645997e-06, + "std": 0.039436690509319305, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.15.2.to_out.0.bias": { + "min": -0.16287560760974884, + "max": 0.16082623600959778, + "mean": 0.0016318459529429674, + "std": 0.06528104841709137, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.15.3.g": { + "min": 0.5568646192550659, + "max": 0.9439972043037415, + "mean": 0.7129673957824707, + "std": 0.0401376374065876, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.15.4.ff.0.0.weight": { + "min": -0.22865070402622223, + "max": 0.25514620542526245, + "mean": -4.54368710052222e-05, + "std": 0.04058137908577919, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.15.4.ff.0.0.bias": { + "min": -0.13480910658836365, + "max": 0.022281890735030174, + "mean": -0.04135727509856224, + "std": 0.018383679911494255, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.15.4.ff.2.weight": { + "min": -0.42169636487960815, + "max": 0.39239397644996643, + "mean": -4.40980693383608e-06, + "std": 0.04779108986258507, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.15.4.ff.2.bias": { + "min": -0.6075002551078796, + "max": 0.6514228582382202, + "mean": 0.0015837398823350668, + "std": 0.05683837831020355, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.16.0.weight": { + "min": -0.25171443819999695, + "max": 0.32070818543434143, + "mean": -6.0755610320484266e-06, + "std": 0.01961563341319561, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.16.1.g": { + "min": 0.3600234091281891, + "max": 0.6823956370353699, + "mean": 0.5707757472991943, + "std": 0.04296165704727173, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.16.2.to_q.weight": { + "min": -0.22057192027568817, + "max": 0.1770636886358261, + "mean": -3.4672062611207366e-05, + "std": 0.03430239111185074, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.16.2.to_q.bias": { + "min": -0.16365490853786469, + "max": 0.23306845128536224, + "mean": 0.0003636471228674054, + "std": 0.03286948427557945, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.16.2.to_k.weight": { + "min": -0.2637504041194916, + "max": 0.23983356356620789, + "mean": -5.237644290900789e-05, + "std": 0.03390154615044594, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.16.2.to_k.bias": { + "min": -4.8552327156066895, + "max": 5.091460227966309, + "mean": 0.04388175159692764, + "std": 1.2293211221694946, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.16.2.to_v.weight": { + "min": -0.2467021644115448, + "max": 0.2504825294017792, + "mean": 7.218097016448155e-05, + "std": 0.04399321228265762, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.16.2.to_v.bias": { + "min": -0.06264208257198334, + "max": 0.054531484842300415, + "mean": 0.00065071159042418, + "std": 0.017192156985402107, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.16.2.to_out.0.weight": { + "min": -0.2865971624851227, + "max": 0.2718464434146881, + "mean": -4.9919544835574925e-05, + "std": 0.04299159720540047, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.16.2.to_out.0.bias": { + "min": -0.16066378355026245, + "max": 0.17053070664405823, + "mean": -0.0028841430321335793, + "std": 0.059287648648023605, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.16.3.g": { + "min": 0.5196661353111267, + "max": 0.9328836798667908, + "mean": 0.7135858535766602, + "std": 0.038419246673583984, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.16.4.ff.0.0.weight": { + "min": -0.2381887435913086, + "max": 0.24951320886611938, + "mean": 0.00046486116480082273, + "std": 0.04046149179339409, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.16.4.ff.0.0.bias": { + "min": -0.14427022635936737, + "max": 0.041461389511823654, + "mean": -0.03969397395849228, + "std": 0.02054336480796337, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.16.4.ff.2.weight": { + "min": -0.5328505039215088, + "max": 0.5830832719802856, + "mean": 5.9098410929436795e-06, + "std": 0.04886835068464279, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.16.4.ff.2.bias": { + "min": -0.5191918015480042, + "max": 0.49353325366973877, + "mean": 0.0023602654691785574, + "std": 0.05344703048467636, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.17.0.weight": { + "min": -0.2736090123653412, + "max": 0.31520769000053406, + "mean": 1.8358268789597787e-06, + "std": 0.020052799955010414, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.17.1.g": { + "min": 0.36640509963035583, + "max": 0.711678147315979, + "mean": 0.593246340751648, + "std": 0.04593454673886299, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.17.2.to_q.weight": { + "min": -0.21106205880641937, + "max": 0.1996321678161621, + "mean": 3.077441579080187e-05, + "std": 0.03486856073141098, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.17.2.to_q.bias": { + "min": -0.18727192282676697, + "max": 0.20402666926383972, + "mean": 0.0009561080951243639, + "std": 0.031529128551483154, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.17.2.to_k.weight": { + "min": -0.28969451785087585, + "max": 0.3398367166519165, + "mean": -4.7392662963829935e-05, + "std": 0.03458969667553902, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.17.2.to_k.bias": { + "min": -3.877439260482788, + "max": 3.3875346183776855, + "mean": 0.014458965510129929, + "std": 0.8584734797477722, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.17.2.to_v.weight": { + "min": -0.2244323492050171, + "max": 0.24988871812820435, + "mean": -3.996262876171386e-06, + "std": 0.04223586246371269, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.17.2.to_v.bias": { + "min": -0.055074166506528854, + "max": 0.0468442440032959, + "mean": -1.8697581253945827e-05, + "std": 0.015848318114876747, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.17.2.to_out.0.weight": { + "min": -0.2929523289203644, + "max": 0.29100877046585083, + "mean": -7.363702025031671e-06, + "std": 0.04195086285471916, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.17.2.to_out.0.bias": { + "min": -0.12487897276878357, + "max": 0.2594272792339325, + "mean": -0.003234811592847109, + "std": 0.05315796285867691, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.17.3.g": { + "min": 0.45620009303092957, + "max": 0.844541609287262, + "mean": 0.7056601047515869, + "std": 0.035222552716732025, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.17.4.ff.0.0.weight": { + "min": -0.5114080309867859, + "max": 0.34850868582725525, + "mean": 0.00034260982647538185, + "std": 0.040206458419561386, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.17.4.ff.0.0.bias": { + "min": -0.18708936870098114, + "max": 0.03951717168092728, + "mean": -0.03939085826277733, + "std": 0.02134866826236248, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.17.4.ff.2.weight": { + "min": -0.544402539730072, + "max": 0.5565053224563599, + "mean": -7.180786633398384e-05, + "std": 0.05074291676282883, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.17.4.ff.2.bias": { + "min": -0.511856734752655, + "max": 0.6643833518028259, + "mean": 0.002446281723678112, + "std": 0.04952690377831459, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.18.0.weight": { + "min": -0.3324280381202698, + "max": 0.2657060921192169, + "mean": 3.681749149109237e-06, + "std": 0.01939038746058941, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.18.1.g": { + "min": 0.32228395342826843, + "max": 0.76633620262146, + "mean": 0.6510899662971497, + "std": 0.04530107229948044, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.18.2.to_q.weight": { + "min": -0.2495409995317459, + "max": 0.21955986320972443, + "mean": -2.516008862585295e-06, + "std": 0.03650251030921936, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.18.2.to_q.bias": { + "min": -0.3271917402744293, + "max": 0.2873159945011139, + "mean": -0.0006787859019823372, + "std": 0.03855893388390541, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.18.2.to_k.weight": { + "min": -0.3100964426994324, + "max": 0.3699168264865875, + "mean": 6.482247408712283e-05, + "std": 0.036243122071027756, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.18.2.to_k.bias": { + "min": -4.71769905090332, + "max": 5.807940483093262, + "mean": 0.03796037286520004, + "std": 1.4132623672485352, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.18.2.to_v.weight": { + "min": -0.22175297141075134, + "max": 0.20589375495910645, + "mean": -7.500311767216772e-05, + "std": 0.04249146580696106, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.18.2.to_v.bias": { + "min": -0.07759421318769455, + "max": 0.05135132744908333, + "mean": -0.000925259490031749, + "std": 0.016409944742918015, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.18.2.to_out.0.weight": { + "min": -0.33092743158340454, + "max": 0.3291303813457489, + "mean": -4.938564870826667e-06, + "std": 0.04279821738600731, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.18.2.to_out.0.bias": { + "min": -0.28501445055007935, + "max": 0.11160922050476074, + "mean": -0.0012059551663696766, + "std": 0.047013018280267715, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.18.3.g": { + "min": 0.4864731431007385, + "max": 0.8868119716644287, + "mean": 0.7375612854957581, + "std": 0.03823444992303848, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.18.4.ff.0.0.weight": { + "min": -0.36123231053352356, + "max": 0.2742029130458832, + "mean": 5.119089109939523e-05, + "std": 0.04065319895744324, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.18.4.ff.0.0.bias": { + "min": -0.24762794375419617, + "max": 0.046543918550014496, + "mean": -0.03927048668265343, + "std": 0.023254919797182083, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.18.4.ff.2.weight": { + "min": -0.6263269186019897, + "max": 0.5970423817634583, + "mean": -6.188904080772772e-05, + "std": 0.05312599986791611, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.18.4.ff.2.bias": { + "min": -0.709787905216217, + "max": 0.2658335268497467, + "mean": 0.0009195120073854923, + "std": 0.051235005259513855, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.19.0.weight": { + "min": -0.34334826469421387, + "max": 0.30343398451805115, + "mean": 2.1822438611707184e-07, + "std": 0.019139666110277176, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.19.1.g": { + "min": 0.34997785091400146, + "max": 0.7828695178031921, + "mean": 0.6389003992080688, + "std": 0.049218229949474335, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.19.2.to_q.weight": { + "min": -0.2058519572019577, + "max": 0.20681944489479065, + "mean": -5.9934332966804504e-05, + "std": 0.037698548287153244, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.19.2.to_q.bias": { + "min": -0.2586883008480072, + "max": 0.26840776205062866, + "mean": -0.0004055192694067955, + "std": 0.044631343334913254, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.19.2.to_k.weight": { + "min": -0.3542138934135437, + "max": 0.32258859276771545, + "mean": -7.339326657529455e-06, + "std": 0.037206824868917465, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.19.2.to_k.bias": { + "min": -5.261901378631592, + "max": 4.204929351806641, + "mean": -0.02642371505498886, + "std": 1.0068365335464478, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.19.2.to_v.weight": { + "min": -0.2388344258069992, + "max": 0.24378669261932373, + "mean": -2.555117680458352e-05, + "std": 0.0432158038020134, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.19.2.to_v.bias": { + "min": -0.06242268532514572, + "max": 0.0566251203417778, + "mean": 0.00035173987271264195, + "std": 0.01414910051971674, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.19.2.to_out.0.weight": { + "min": -0.43747568130493164, + "max": 0.3737330734729767, + "mean": 1.4612624909204897e-05, + "std": 0.04412786290049553, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.19.2.to_out.0.bias": { + "min": -0.0961233526468277, + "max": 0.1762983798980713, + "mean": -0.000659514800645411, + "std": 0.03514162451028824, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.19.3.g": { + "min": 0.42177778482437134, + "max": 1.0692633390426636, + "mean": 0.7485724687576294, + "std": 0.04206255078315735, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.19.4.ff.0.0.weight": { + "min": -0.2659589648246765, + "max": 0.29692542552948, + "mean": -7.890580309322104e-05, + "std": 0.040813855826854706, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.19.4.ff.0.0.bias": { + "min": -0.18484872579574585, + "max": 0.04314016178250313, + "mean": -0.03681201860308647, + "std": 0.02558443695306778, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.19.4.ff.2.weight": { + "min": -0.457691490650177, + "max": 0.4868350028991699, + "mean": 4.39733594248537e-05, + "std": 0.0542210191488266, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.19.4.ff.2.bias": { + "min": -0.2863001823425293, + "max": 0.5517781972885132, + "mean": -0.0008814089233055711, + "std": 0.047833118587732315, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.20.0.weight": { + "min": -0.29263076186180115, + "max": 0.32270461320877075, + "mean": 6.018684871378355e-06, + "std": 0.019972756505012512, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.20.1.g": { + "min": 0.2913359999656677, + "max": 0.7601139545440674, + "mean": 0.6508511304855347, + "std": 0.052110809832811356, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.20.2.to_q.weight": { + "min": -0.24366426467895508, + "max": 0.26166871190071106, + "mean": -5.6619760471221525e-06, + "std": 0.039614126086235046, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.20.2.to_q.bias": { + "min": -0.26755285263061523, + "max": 0.20015348494052887, + "mean": -0.0008774641901254654, + "std": 0.05177554860711098, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.20.2.to_k.weight": { + "min": -0.2722264528274536, + "max": 0.2537742853164673, + "mean": 5.269570010568714e-06, + "std": 0.038710836321115494, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.20.2.to_k.bias": { + "min": -12.966026306152344, + "max": 15.947823524475098, + "mean": 0.0332300066947937, + "std": 1.989342451095581, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.20.2.to_v.weight": { + "min": -0.20672431588172913, + "max": 0.22581705451011658, + "mean": -7.253723015310243e-05, + "std": 0.04055880755186081, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.20.2.to_v.bias": { + "min": -0.06943444162607193, + "max": 0.06314389407634735, + "mean": 0.00015862843429204077, + "std": 0.0147479847073555, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.20.2.to_out.0.weight": { + "min": -0.46546468138694763, + "max": 0.32013440132141113, + "mean": 1.955418883881066e-05, + "std": 0.04059435427188873, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.20.2.to_out.0.bias": { + "min": -0.06408563256263733, + "max": 0.11556272953748703, + "mean": 0.0011989418417215347, + "std": 0.02470807358622551, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.20.3.g": { + "min": 0.3750011920928955, + "max": 0.9319288730621338, + "mean": 0.7511273622512817, + "std": 0.04018896445631981, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.20.4.ff.0.0.weight": { + "min": -0.27909016609191895, + "max": 0.27321043610572815, + "mean": -0.00016836788563523442, + "std": 0.04100494086742401, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.20.4.ff.0.0.bias": { + "min": -0.19844156503677368, + "max": 0.051351871341466904, + "mean": -0.032028019428253174, + "std": 0.025079041719436646, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.20.4.ff.2.weight": { + "min": -0.6585158705711365, + "max": 0.5356709957122803, + "mean": -5.047450395068154e-05, + "std": 0.05285719037055969, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.20.4.ff.2.bias": { + "min": -0.1926739513874054, + "max": 0.5822402238845825, + "mean": -0.0005105392774567008, + "std": 0.04108486697077751, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.21.0.weight": { + "min": -0.41757693886756897, + "max": 0.37195414304733276, + "mean": 6.520090209960472e-06, + "std": 0.021627968177199364, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.21.1.g": { + "min": 0.21454279124736786, + "max": 0.746727705001831, + "mean": 0.6494921445846558, + "std": 0.05432972311973572, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.21.2.to_q.weight": { + "min": -0.20945341885089874, + "max": 0.19550970196723938, + "mean": 4.009851181763224e-05, + "std": 0.03945960849523544, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.21.2.to_q.bias": { + "min": -0.32960787415504456, + "max": 0.25966984033584595, + "mean": -0.003232899820432067, + "std": 0.056286394596099854, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.21.2.to_k.weight": { + "min": -0.20589140057563782, + "max": 0.25466933846473694, + "mean": 5.40036016900558e-05, + "std": 0.03856228291988373, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.21.2.to_k.bias": { + "min": -6.243993759155273, + "max": 6.932845115661621, + "mean": 0.048340775072574615, + "std": 1.3851999044418335, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.21.2.to_v.weight": { + "min": -0.20977123081684113, + "max": 0.23046547174453735, + "mean": -4.7887324399198405e-06, + "std": 0.041317813098430634, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.21.2.to_v.bias": { + "min": -0.043830934911966324, + "max": 0.0359884537756443, + "mean": -6.7679648054763675e-06, + "std": 0.012799433432519436, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.21.2.to_out.0.weight": { + "min": -0.3975262939929962, + "max": 0.34497249126434326, + "mean": -5.5380802223226056e-05, + "std": 0.04239468649029732, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.21.2.to_out.0.bias": { + "min": -0.055168669670820236, + "max": 0.06281793117523193, + "mean": 0.0003579839540179819, + "std": 0.018675317987799644, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.21.3.g": { + "min": 0.35081058740615845, + "max": 1.0451138019561768, + "mean": 0.7896714210510254, + "std": 0.04873151332139969, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.21.4.ff.0.0.weight": { + "min": -0.333694726228714, + "max": 0.38623932003974915, + "mean": -0.00016907340614125133, + "std": 0.04149046167731285, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.21.4.ff.0.0.bias": { + "min": -0.15751884877681732, + "max": 0.05906709283590317, + "mean": -0.03182389587163925, + "std": 0.0251007080078125, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.21.4.ff.2.weight": { + "min": -0.6963667273521423, + "max": 0.46923714876174927, + "mean": -8.512083149980754e-05, + "std": 0.05180640146136284, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.21.4.ff.2.bias": { + "min": -0.24786238372325897, + "max": 0.3288760185241699, + "mean": -0.00026252405950799584, + "std": 0.04145393148064613, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.22.0.weight": { + "min": -0.28698989748954773, + "max": 0.350361168384552, + "mean": -2.7725566269509727e-06, + "std": 0.02424115315079689, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.22.1.g": { + "min": 0.19679424166679382, + "max": 0.7790785431861877, + "mean": 0.6702431440353394, + "std": 0.05866772681474686, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.22.2.to_q.weight": { + "min": -0.22908955812454224, + "max": 0.23140233755111694, + "mean": -2.085999039991293e-05, + "std": 0.04043996334075928, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.22.2.to_q.bias": { + "min": -0.22004202008247375, + "max": 0.24097159504890442, + "mean": 0.0007790824165567756, + "std": 0.055850621312856674, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.22.2.to_k.weight": { + "min": -0.2167646586894989, + "max": 0.226406067609787, + "mean": -7.223833381431177e-05, + "std": 0.039374157786369324, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.22.2.to_k.bias": { + "min": -8.906242370605469, + "max": 9.069114685058594, + "mean": -0.0012542838230729103, + "std": 1.8484386205673218, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.22.2.to_v.weight": { + "min": -0.26939529180526733, + "max": 0.258998304605484, + "mean": 4.3638072384055704e-05, + "std": 0.0384107306599617, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.22.2.to_v.bias": { + "min": -0.0579773373901844, + "max": 0.057985395193099976, + "mean": 0.0003543748171068728, + "std": 0.01471623033285141, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.22.2.to_out.0.weight": { + "min": -0.26387640833854675, + "max": 0.28812822699546814, + "mean": -6.169013795442879e-05, + "std": 0.0390775129199028, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.22.2.to_out.0.bias": { + "min": -0.04410848394036293, + "max": 0.03735562041401863, + "mean": -9.80982295004651e-05, + "std": 0.013347214087843895, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.22.3.g": { + "min": 0.33935481309890747, + "max": 1.0925333499908447, + "mean": 0.8639740347862244, + "std": 0.06387708336114883, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.22.4.ff.0.0.weight": { + "min": -0.42313116788864136, + "max": 0.41907814145088196, + "mean": 0.0003136416198685765, + "std": 0.04351295530796051, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.22.4.ff.0.0.bias": { + "min": -0.21479536592960358, + "max": 0.17072512209415436, + "mean": -0.029444200918078423, + "std": 0.0318748876452446, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.22.4.ff.2.weight": { + "min": -0.5986181497573853, + "max": 0.5598904490470886, + "mean": -0.00014800383360125124, + "std": 0.05346141383051872, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.22.4.ff.2.bias": { + "min": -0.17892269790172577, + "max": 0.37738052010536194, + "mean": 0.0013508039992302656, + "std": 0.03731485456228256, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.23.0.weight": { + "min": -0.39432692527770996, + "max": 0.36881834268569946, + "mean": 3.763254062505439e-05, + "std": 0.028617430478334427, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.23.1.g": { + "min": 0.2906792163848877, + "max": 0.8274716138839722, + "mean": 0.7055441737174988, + "std": 0.06783536076545715, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.23.2.to_q.weight": { + "min": -0.9265903830528259, + "max": 1.027007818222046, + "mean": -2.7936879632761702e-05, + "std": 0.04764379560947418, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.23.2.to_q.bias": { + "min": -0.8793500661849976, + "max": 0.8158687949180603, + "mean": -0.0002950741327367723, + "std": 0.09555269032716751, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.23.2.to_k.weight": { + "min": -0.27022066712379456, + "max": 0.24093179404735565, + "mean": -2.251441401313059e-05, + "std": 0.0389498770236969, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.23.2.to_k.bias": { + "min": -23.743555068969727, + "max": 22.852014541625977, + "mean": -0.09188339114189148, + "std": 4.070625305175781, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.23.2.to_v.weight": { + "min": -0.22778554260730743, + "max": 0.24572508037090302, + "mean": -2.547786607465241e-05, + "std": 0.03864147141575813, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.23.2.to_v.bias": { + "min": -0.06017241254448891, + "max": 0.045427631586790085, + "mean": -0.00013617021613754332, + "std": 0.014690100215375423, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.23.2.to_out.0.weight": { + "min": -0.3379840552806854, + "max": 0.3750169575214386, + "mean": 7.478654879378155e-06, + "std": 0.040820397436618805, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.23.2.to_out.0.bias": { + "min": -0.04619982838630676, + "max": 0.19537773728370667, + "mean": 0.0002735886082518846, + "std": 0.013551585376262665, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.23.3.g": { + "min": 0.37374061346054077, + "max": 1.1302894353866577, + "mean": 0.8902378082275391, + "std": 0.0640074834227562, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.23.4.ff.0.0.weight": { + "min": -0.4474950134754181, + "max": 0.542551577091217, + "mean": 2.5157038180623204e-05, + "std": 0.0455806665122509, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.23.4.ff.0.0.bias": { + "min": -0.2237873524427414, + "max": 0.08737614750862122, + "mean": -0.03201454132795334, + "std": 0.03774423152208328, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.23.4.ff.2.weight": { + "min": -0.7263057827949524, + "max": 0.6888318657875061, + "mean": 3.633538290159777e-05, + "std": 0.0517943874001503, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.23.4.ff.2.bias": { + "min": -0.1743825227022171, + "max": 0.21823401749134064, + "mean": 3.549834946170449e-05, + "std": 0.031774841248989105, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.24.0.weight": { + "min": -0.3394811451435089, + "max": 0.37303876876831055, + "mean": 4.305133916204795e-05, + "std": 0.034135352820158005, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.24.1.g": { + "min": 0.31772536039352417, + "max": 1.2872265577316284, + "mean": 0.6015347242355347, + "std": 0.08348645269870758, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.24.2.to_q.weight": { + "min": -0.2831268906593323, + "max": 0.26034945249557495, + "mean": -3.016911477971007e-06, + "std": 0.03598069027066231, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.24.2.to_q.bias": { + "min": -0.23578572273254395, + "max": 0.20580488443374634, + "mean": 0.00023967580636963248, + "std": 0.056039854884147644, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.24.2.to_k.weight": { + "min": -0.43542858958244324, + "max": 0.32475200295448303, + "mean": 2.4229491828009486e-05, + "std": 0.034124139696359634, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.24.2.to_k.bias": { + "min": -5.546597957611084, + "max": 7.314022064208984, + "mean": -0.007369913160800934, + "std": 0.6993920803070068, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.24.2.to_v.weight": { + "min": -0.34419700503349304, + "max": 0.36281776428222656, + "mean": 0.00010317970009054989, + "std": 0.04783639311790466, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.24.2.to_v.bias": { + "min": -0.07377609610557556, + "max": 0.06036657840013504, + "mean": 0.0009365753503516316, + "std": 0.014937076717615128, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.24.2.to_out.0.weight": { + "min": -0.2563660442829132, + "max": 0.28687092661857605, + "mean": 4.898875886283349e-06, + "std": 0.04156457632780075, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.24.2.to_out.0.bias": { + "min": -0.055319979786872864, + "max": 0.06281081587076187, + "mean": 0.000127265666378662, + "std": 0.007150812540203333, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.24.3.g": { + "min": 0.4940038025379181, + "max": 1.220664620399475, + "mean": 1.0135600566864014, + "std": 0.11748378723859787, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.24.4.ff.0.0.weight": { + "min": -1.0940163135528564, + "max": 1.0475441217422485, + "mean": -4.872599311056547e-05, + "std": 0.05241787061095238, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.24.4.ff.0.0.bias": { + "min": -0.2236405611038208, + "max": 0.1730623096227646, + "mean": -0.027228206396102905, + "std": 0.0363101065158844, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.24.4.ff.2.weight": { + "min": -0.8842402696609497, + "max": 0.9227275252342224, + "mean": -0.00014601278235204518, + "std": 0.05329864099621773, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.24.4.ff.2.bias": { + "min": -0.1710553914308548, + "max": 0.3796318471431732, + "mean": 0.0033668535761535168, + "std": 0.03987643122673035, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.25.0.weight": { + "min": -0.7777752876281738, + "max": 0.722641110420227, + "mean": 1.80296028702287e-05, + "std": 0.0461542084813118, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.25.1.g": { + "min": 0.3386844992637634, + "max": 1.4281909465789795, + "mean": 0.9485001564025879, + "std": 0.20679982006549835, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.25.2.to_q.weight": { + "min": -1.745824933052063, + "max": 1.7045180797576904, + "mean": 0.0002270373224746436, + "std": 0.15870553255081177, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.25.2.to_q.bias": { + "min": -1.1994949579238892, + "max": 1.1009647846221924, + "mean": -0.009547820314764977, + "std": 0.20390011370182037, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.25.2.to_k.weight": { + "min": -0.4209446907043457, + "max": 0.42817720770835876, + "mean": 6.392307841451839e-05, + "std": 0.04802021011710167, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.25.2.to_k.bias": { + "min": -19.74793243408203, + "max": 19.543048858642578, + "mean": -0.2483428716659546, + "std": 4.7770676612854, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.25.2.to_v.weight": { + "min": -0.32391926646232605, + "max": 0.438634991645813, + "mean": -1.1790625649155118e-05, + "std": 0.04616706818342209, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.25.2.to_v.bias": { + "min": -0.03377115726470947, + "max": 0.03684735298156738, + "mean": 0.0006395116215571761, + "std": 0.012911375612020493, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.25.2.to_out.0.weight": { + "min": -0.7035614252090454, + "max": 0.6690102815628052, + "mean": 4.2652536649256945e-05, + "std": 0.0578920915722847, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.25.2.to_out.0.bias": { + "min": -0.07234025001525879, + "max": 0.06776763498783112, + "mean": -0.00013464699441101402, + "std": 0.012891847640275955, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.25.3.g": { + "min": 0.38047194480895996, + "max": 1.39299738407135, + "mean": 1.06674325466156, + "std": 0.2197609543800354, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.25.4.ff.0.0.weight": { + "min": -0.6165490746498108, + "max": 0.7185496091842651, + "mean": 0.00011303066276013851, + "std": 0.05802777782082558, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.25.4.ff.0.0.bias": { + "min": -0.21881279349327087, + "max": 0.22498759627342224, + "mean": 0.00618295231834054, + "std": 0.04969846084713936, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.25.4.ff.2.weight": { + "min": -0.629830002784729, + "max": 0.8896750807762146, + "mean": 1.2404842891555745e-05, + "std": 0.023545295000076294, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.25.4.ff.2.bias": { + "min": -0.5068444013595581, + "max": 0.47373077273368835, + "mean": -0.0030198940075933933, + "std": 0.06924331188201904, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.norm_out.g": { + "min": 0.5382840037345886, + "max": 1.1801176071166992, + "mean": 0.7828130722045898, + "std": 0.09876110404729843, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.proj_out.weight": { + "min": -0.267057865858078, + "max": 0.212993323802948, + "mean": -0.0002232328843092546, + "std": 0.054005783051252365, + "sparsity": 0.0, + "shape": [ + 100, + 1024 + ] + }, + "transformer.proj_out.bias": { + "min": -0.23836649954319, + "max": 0.014864332042634487, + "mean": -0.043917927891016006, + "std": 0.03428623452782631, + "sparsity": 0.0, + "shape": [ + 100 + ] + } + } +} \ No newline at end of file