diff --git "a/model_analysis.json" "b/model_analysis.json" new file mode 100644--- /dev/null +++ "b/model_analysis.json" @@ -0,0 +1,4683 @@ +{ + "layer_types": { + "transformer": 391 + }, + "parameter_counts": { + "transformer.time_embed.time_mlp.0.weight": 262144, + "transformer.time_embed.time_mlp.0.bias": 1024, + "transformer.time_embed.time_mlp.2.weight": 1048576, + "transformer.time_embed.time_mlp.2.bias": 1024, + "transformer.text_embed.text_embed.weight": 254600, + "transformer.input_embed.proj.weight": 307200, + "transformer.input_embed.proj.bias": 1024, + "transformer.input_embed.conv_pos_embed.conv1d.0.weight": 2031616, + "transformer.input_embed.conv_pos_embed.conv1d.0.bias": 1024, + "transformer.input_embed.conv_pos_embed.conv1d.2.weight": 2031616, + "transformer.input_embed.conv_pos_embed.conv1d.2.bias": 1024, + "transformer.layers.0.1.g": 1024, + "transformer.layers.0.2.to_q.weight": 1048576, + "transformer.layers.0.2.to_q.bias": 1024, + "transformer.layers.0.2.to_k.weight": 1048576, + "transformer.layers.0.2.to_k.bias": 1024, + "transformer.layers.0.2.to_v.weight": 1048576, + "transformer.layers.0.2.to_v.bias": 1024, + "transformer.layers.0.2.to_out.0.weight": 1048576, + "transformer.layers.0.2.to_out.0.bias": 1024, + "transformer.layers.0.3.g": 1024, + "transformer.layers.0.4.ff.0.0.weight": 4194304, + "transformer.layers.0.4.ff.0.0.bias": 4096, + "transformer.layers.0.4.ff.2.weight": 4194304, + "transformer.layers.0.4.ff.2.bias": 1024, + "transformer.layers.1.1.g": 1024, + "transformer.layers.1.2.to_q.weight": 1048576, + "transformer.layers.1.2.to_q.bias": 1024, + "transformer.layers.1.2.to_k.weight": 1048576, + "transformer.layers.1.2.to_k.bias": 1024, + "transformer.layers.1.2.to_v.weight": 1048576, + "transformer.layers.1.2.to_v.bias": 1024, + "transformer.layers.1.2.to_out.0.weight": 1048576, + "transformer.layers.1.2.to_out.0.bias": 1024, + "transformer.layers.1.3.g": 1024, + "transformer.layers.1.4.ff.0.0.weight": 4194304, + "transformer.layers.1.4.ff.0.0.bias": 4096, + "transformer.layers.1.4.ff.2.weight": 4194304, + "transformer.layers.1.4.ff.2.bias": 1024, + "transformer.layers.2.1.g": 1024, + "transformer.layers.2.2.to_q.weight": 1048576, + "transformer.layers.2.2.to_q.bias": 1024, + "transformer.layers.2.2.to_k.weight": 1048576, + "transformer.layers.2.2.to_k.bias": 1024, + "transformer.layers.2.2.to_v.weight": 1048576, + "transformer.layers.2.2.to_v.bias": 1024, + "transformer.layers.2.2.to_out.0.weight": 1048576, + "transformer.layers.2.2.to_out.0.bias": 1024, + "transformer.layers.2.3.g": 1024, + "transformer.layers.2.4.ff.0.0.weight": 4194304, + "transformer.layers.2.4.ff.0.0.bias": 4096, + "transformer.layers.2.4.ff.2.weight": 4194304, + "transformer.layers.2.4.ff.2.bias": 1024, + "transformer.layers.3.1.g": 1024, + "transformer.layers.3.2.to_q.weight": 1048576, + "transformer.layers.3.2.to_q.bias": 1024, + "transformer.layers.3.2.to_k.weight": 1048576, + "transformer.layers.3.2.to_k.bias": 1024, + "transformer.layers.3.2.to_v.weight": 1048576, + "transformer.layers.3.2.to_v.bias": 1024, + "transformer.layers.3.2.to_out.0.weight": 1048576, + "transformer.layers.3.2.to_out.0.bias": 1024, + "transformer.layers.3.3.g": 1024, + "transformer.layers.3.4.ff.0.0.weight": 4194304, + "transformer.layers.3.4.ff.0.0.bias": 4096, + "transformer.layers.3.4.ff.2.weight": 4194304, + "transformer.layers.3.4.ff.2.bias": 1024, + "transformer.layers.4.1.g": 1024, + "transformer.layers.4.2.to_q.weight": 1048576, + "transformer.layers.4.2.to_q.bias": 1024, + "transformer.layers.4.2.to_k.weight": 1048576, + "transformer.layers.4.2.to_k.bias": 1024, + "transformer.layers.4.2.to_v.weight": 1048576, + "transformer.layers.4.2.to_v.bias": 1024, + "transformer.layers.4.2.to_out.0.weight": 1048576, + "transformer.layers.4.2.to_out.0.bias": 1024, + "transformer.layers.4.3.g": 1024, + "transformer.layers.4.4.ff.0.0.weight": 4194304, + "transformer.layers.4.4.ff.0.0.bias": 4096, + "transformer.layers.4.4.ff.2.weight": 4194304, + "transformer.layers.4.4.ff.2.bias": 1024, + "transformer.layers.5.1.g": 1024, + "transformer.layers.5.2.to_q.weight": 1048576, + "transformer.layers.5.2.to_q.bias": 1024, + "transformer.layers.5.2.to_k.weight": 1048576, + "transformer.layers.5.2.to_k.bias": 1024, + "transformer.layers.5.2.to_v.weight": 1048576, + "transformer.layers.5.2.to_v.bias": 1024, + "transformer.layers.5.2.to_out.0.weight": 1048576, + "transformer.layers.5.2.to_out.0.bias": 1024, + "transformer.layers.5.3.g": 1024, + "transformer.layers.5.4.ff.0.0.weight": 4194304, + "transformer.layers.5.4.ff.0.0.bias": 4096, + "transformer.layers.5.4.ff.2.weight": 4194304, + "transformer.layers.5.4.ff.2.bias": 1024, + "transformer.layers.6.1.g": 1024, + "transformer.layers.6.2.to_q.weight": 1048576, + "transformer.layers.6.2.to_q.bias": 1024, + "transformer.layers.6.2.to_k.weight": 1048576, + "transformer.layers.6.2.to_k.bias": 1024, + "transformer.layers.6.2.to_v.weight": 1048576, + "transformer.layers.6.2.to_v.bias": 1024, + "transformer.layers.6.2.to_out.0.weight": 1048576, + "transformer.layers.6.2.to_out.0.bias": 1024, + "transformer.layers.6.3.g": 1024, + "transformer.layers.6.4.ff.0.0.weight": 4194304, + "transformer.layers.6.4.ff.0.0.bias": 4096, + "transformer.layers.6.4.ff.2.weight": 4194304, + "transformer.layers.6.4.ff.2.bias": 1024, + "transformer.layers.7.1.g": 1024, + "transformer.layers.7.2.to_q.weight": 1048576, + "transformer.layers.7.2.to_q.bias": 1024, + "transformer.layers.7.2.to_k.weight": 1048576, + "transformer.layers.7.2.to_k.bias": 1024, + "transformer.layers.7.2.to_v.weight": 1048576, + "transformer.layers.7.2.to_v.bias": 1024, + "transformer.layers.7.2.to_out.0.weight": 1048576, + "transformer.layers.7.2.to_out.0.bias": 1024, + "transformer.layers.7.3.g": 1024, + "transformer.layers.7.4.ff.0.0.weight": 4194304, + "transformer.layers.7.4.ff.0.0.bias": 4096, + "transformer.layers.7.4.ff.2.weight": 4194304, + "transformer.layers.7.4.ff.2.bias": 1024, + "transformer.layers.8.1.g": 1024, + "transformer.layers.8.2.to_q.weight": 1048576, + "transformer.layers.8.2.to_q.bias": 1024, + "transformer.layers.8.2.to_k.weight": 1048576, + "transformer.layers.8.2.to_k.bias": 1024, + "transformer.layers.8.2.to_v.weight": 1048576, + "transformer.layers.8.2.to_v.bias": 1024, + "transformer.layers.8.2.to_out.0.weight": 1048576, + "transformer.layers.8.2.to_out.0.bias": 1024, + "transformer.layers.8.3.g": 1024, + "transformer.layers.8.4.ff.0.0.weight": 4194304, + "transformer.layers.8.4.ff.0.0.bias": 4096, + "transformer.layers.8.4.ff.2.weight": 4194304, + "transformer.layers.8.4.ff.2.bias": 1024, + "transformer.layers.9.1.g": 1024, + "transformer.layers.9.2.to_q.weight": 1048576, + "transformer.layers.9.2.to_q.bias": 1024, + "transformer.layers.9.2.to_k.weight": 1048576, + "transformer.layers.9.2.to_k.bias": 1024, + "transformer.layers.9.2.to_v.weight": 1048576, + "transformer.layers.9.2.to_v.bias": 1024, + "transformer.layers.9.2.to_out.0.weight": 1048576, + "transformer.layers.9.2.to_out.0.bias": 1024, + "transformer.layers.9.3.g": 1024, + "transformer.layers.9.4.ff.0.0.weight": 4194304, + "transformer.layers.9.4.ff.0.0.bias": 4096, + "transformer.layers.9.4.ff.2.weight": 4194304, + "transformer.layers.9.4.ff.2.bias": 1024, + "transformer.layers.10.1.g": 1024, + "transformer.layers.10.2.to_q.weight": 1048576, + "transformer.layers.10.2.to_q.bias": 1024, + "transformer.layers.10.2.to_k.weight": 1048576, + "transformer.layers.10.2.to_k.bias": 1024, + "transformer.layers.10.2.to_v.weight": 1048576, + "transformer.layers.10.2.to_v.bias": 1024, + "transformer.layers.10.2.to_out.0.weight": 1048576, + "transformer.layers.10.2.to_out.0.bias": 1024, + "transformer.layers.10.3.g": 1024, + "transformer.layers.10.4.ff.0.0.weight": 4194304, + "transformer.layers.10.4.ff.0.0.bias": 4096, + "transformer.layers.10.4.ff.2.weight": 4194304, + "transformer.layers.10.4.ff.2.bias": 1024, + "transformer.layers.11.1.g": 1024, + "transformer.layers.11.2.to_q.weight": 1048576, + "transformer.layers.11.2.to_q.bias": 1024, + "transformer.layers.11.2.to_k.weight": 1048576, + "transformer.layers.11.2.to_k.bias": 1024, + "transformer.layers.11.2.to_v.weight": 1048576, + "transformer.layers.11.2.to_v.bias": 1024, + "transformer.layers.11.2.to_out.0.weight": 1048576, + "transformer.layers.11.2.to_out.0.bias": 1024, + "transformer.layers.11.3.g": 1024, + "transformer.layers.11.4.ff.0.0.weight": 4194304, + "transformer.layers.11.4.ff.0.0.bias": 4096, + "transformer.layers.11.4.ff.2.weight": 4194304, + "transformer.layers.11.4.ff.2.bias": 1024, + "transformer.layers.12.1.g": 1024, + "transformer.layers.12.2.to_q.weight": 1048576, + "transformer.layers.12.2.to_q.bias": 1024, + "transformer.layers.12.2.to_k.weight": 1048576, + "transformer.layers.12.2.to_k.bias": 1024, + "transformer.layers.12.2.to_v.weight": 1048576, + "transformer.layers.12.2.to_v.bias": 1024, + "transformer.layers.12.2.to_out.0.weight": 1048576, + "transformer.layers.12.2.to_out.0.bias": 1024, + "transformer.layers.12.3.g": 1024, + "transformer.layers.12.4.ff.0.0.weight": 4194304, + "transformer.layers.12.4.ff.0.0.bias": 4096, + "transformer.layers.12.4.ff.2.weight": 4194304, + "transformer.layers.12.4.ff.2.bias": 1024, + "transformer.layers.13.0.weight": 2097152, + "transformer.layers.13.1.g": 1024, + "transformer.layers.13.2.to_q.weight": 1048576, + "transformer.layers.13.2.to_q.bias": 1024, + "transformer.layers.13.2.to_k.weight": 1048576, + "transformer.layers.13.2.to_k.bias": 1024, + "transformer.layers.13.2.to_v.weight": 1048576, + "transformer.layers.13.2.to_v.bias": 1024, + "transformer.layers.13.2.to_out.0.weight": 1048576, + "transformer.layers.13.2.to_out.0.bias": 1024, + "transformer.layers.13.3.g": 1024, + "transformer.layers.13.4.ff.0.0.weight": 4194304, + "transformer.layers.13.4.ff.0.0.bias": 4096, + "transformer.layers.13.4.ff.2.weight": 4194304, + "transformer.layers.13.4.ff.2.bias": 1024, + "transformer.layers.14.0.weight": 2097152, + "transformer.layers.14.1.g": 1024, + "transformer.layers.14.2.to_q.weight": 1048576, + "transformer.layers.14.2.to_q.bias": 1024, + "transformer.layers.14.2.to_k.weight": 1048576, + "transformer.layers.14.2.to_k.bias": 1024, + "transformer.layers.14.2.to_v.weight": 1048576, + "transformer.layers.14.2.to_v.bias": 1024, + "transformer.layers.14.2.to_out.0.weight": 1048576, + "transformer.layers.14.2.to_out.0.bias": 1024, + "transformer.layers.14.3.g": 1024, + "transformer.layers.14.4.ff.0.0.weight": 4194304, + "transformer.layers.14.4.ff.0.0.bias": 4096, + "transformer.layers.14.4.ff.2.weight": 4194304, + "transformer.layers.14.4.ff.2.bias": 1024, + "transformer.layers.15.0.weight": 2097152, + "transformer.layers.15.1.g": 1024, + "transformer.layers.15.2.to_q.weight": 1048576, + "transformer.layers.15.2.to_q.bias": 1024, + "transformer.layers.15.2.to_k.weight": 1048576, + "transformer.layers.15.2.to_k.bias": 1024, + "transformer.layers.15.2.to_v.weight": 1048576, + "transformer.layers.15.2.to_v.bias": 1024, + "transformer.layers.15.2.to_out.0.weight": 1048576, + "transformer.layers.15.2.to_out.0.bias": 1024, + "transformer.layers.15.3.g": 1024, + "transformer.layers.15.4.ff.0.0.weight": 4194304, + "transformer.layers.15.4.ff.0.0.bias": 4096, + "transformer.layers.15.4.ff.2.weight": 4194304, + "transformer.layers.15.4.ff.2.bias": 1024, + "transformer.layers.16.0.weight": 2097152, + "transformer.layers.16.1.g": 1024, + "transformer.layers.16.2.to_q.weight": 1048576, + "transformer.layers.16.2.to_q.bias": 1024, + "transformer.layers.16.2.to_k.weight": 1048576, + "transformer.layers.16.2.to_k.bias": 1024, + "transformer.layers.16.2.to_v.weight": 1048576, + "transformer.layers.16.2.to_v.bias": 1024, + "transformer.layers.16.2.to_out.0.weight": 1048576, + "transformer.layers.16.2.to_out.0.bias": 1024, + "transformer.layers.16.3.g": 1024, + "transformer.layers.16.4.ff.0.0.weight": 4194304, + "transformer.layers.16.4.ff.0.0.bias": 4096, + "transformer.layers.16.4.ff.2.weight": 4194304, + "transformer.layers.16.4.ff.2.bias": 1024, + "transformer.layers.17.0.weight": 2097152, + "transformer.layers.17.1.g": 1024, + "transformer.layers.17.2.to_q.weight": 1048576, + "transformer.layers.17.2.to_q.bias": 1024, + "transformer.layers.17.2.to_k.weight": 1048576, + "transformer.layers.17.2.to_k.bias": 1024, + "transformer.layers.17.2.to_v.weight": 1048576, + "transformer.layers.17.2.to_v.bias": 1024, + "transformer.layers.17.2.to_out.0.weight": 1048576, + "transformer.layers.17.2.to_out.0.bias": 1024, + "transformer.layers.17.3.g": 1024, + "transformer.layers.17.4.ff.0.0.weight": 4194304, + "transformer.layers.17.4.ff.0.0.bias": 4096, + "transformer.layers.17.4.ff.2.weight": 4194304, + "transformer.layers.17.4.ff.2.bias": 1024, + "transformer.layers.18.0.weight": 2097152, + "transformer.layers.18.1.g": 1024, + "transformer.layers.18.2.to_q.weight": 1048576, + "transformer.layers.18.2.to_q.bias": 1024, + "transformer.layers.18.2.to_k.weight": 1048576, + "transformer.layers.18.2.to_k.bias": 1024, + "transformer.layers.18.2.to_v.weight": 1048576, + "transformer.layers.18.2.to_v.bias": 1024, + "transformer.layers.18.2.to_out.0.weight": 1048576, + "transformer.layers.18.2.to_out.0.bias": 1024, + "transformer.layers.18.3.g": 1024, + "transformer.layers.18.4.ff.0.0.weight": 4194304, + "transformer.layers.18.4.ff.0.0.bias": 4096, + "transformer.layers.18.4.ff.2.weight": 4194304, + "transformer.layers.18.4.ff.2.bias": 1024, + "transformer.layers.19.0.weight": 2097152, + "transformer.layers.19.1.g": 1024, + "transformer.layers.19.2.to_q.weight": 1048576, + "transformer.layers.19.2.to_q.bias": 1024, + "transformer.layers.19.2.to_k.weight": 1048576, + "transformer.layers.19.2.to_k.bias": 1024, + "transformer.layers.19.2.to_v.weight": 1048576, + "transformer.layers.19.2.to_v.bias": 1024, + "transformer.layers.19.2.to_out.0.weight": 1048576, + "transformer.layers.19.2.to_out.0.bias": 1024, + "transformer.layers.19.3.g": 1024, + "transformer.layers.19.4.ff.0.0.weight": 4194304, + "transformer.layers.19.4.ff.0.0.bias": 4096, + "transformer.layers.19.4.ff.2.weight": 4194304, + "transformer.layers.19.4.ff.2.bias": 1024, + "transformer.layers.20.0.weight": 2097152, + "transformer.layers.20.1.g": 1024, + "transformer.layers.20.2.to_q.weight": 1048576, + "transformer.layers.20.2.to_q.bias": 1024, + "transformer.layers.20.2.to_k.weight": 1048576, + "transformer.layers.20.2.to_k.bias": 1024, + "transformer.layers.20.2.to_v.weight": 1048576, + "transformer.layers.20.2.to_v.bias": 1024, + "transformer.layers.20.2.to_out.0.weight": 1048576, + "transformer.layers.20.2.to_out.0.bias": 1024, + "transformer.layers.20.3.g": 1024, + "transformer.layers.20.4.ff.0.0.weight": 4194304, + "transformer.layers.20.4.ff.0.0.bias": 4096, + "transformer.layers.20.4.ff.2.weight": 4194304, + "transformer.layers.20.4.ff.2.bias": 1024, + "transformer.layers.21.0.weight": 2097152, + "transformer.layers.21.1.g": 1024, + "transformer.layers.21.2.to_q.weight": 1048576, + "transformer.layers.21.2.to_q.bias": 1024, + "transformer.layers.21.2.to_k.weight": 1048576, + "transformer.layers.21.2.to_k.bias": 1024, + "transformer.layers.21.2.to_v.weight": 1048576, + "transformer.layers.21.2.to_v.bias": 1024, + "transformer.layers.21.2.to_out.0.weight": 1048576, + "transformer.layers.21.2.to_out.0.bias": 1024, + "transformer.layers.21.3.g": 1024, + "transformer.layers.21.4.ff.0.0.weight": 4194304, + "transformer.layers.21.4.ff.0.0.bias": 4096, + "transformer.layers.21.4.ff.2.weight": 4194304, + "transformer.layers.21.4.ff.2.bias": 1024, + "transformer.layers.22.0.weight": 2097152, + "transformer.layers.22.1.g": 1024, + "transformer.layers.22.2.to_q.weight": 1048576, + "transformer.layers.22.2.to_q.bias": 1024, + "transformer.layers.22.2.to_k.weight": 1048576, + "transformer.layers.22.2.to_k.bias": 1024, + "transformer.layers.22.2.to_v.weight": 1048576, + "transformer.layers.22.2.to_v.bias": 1024, + "transformer.layers.22.2.to_out.0.weight": 1048576, + "transformer.layers.22.2.to_out.0.bias": 1024, + "transformer.layers.22.3.g": 1024, + "transformer.layers.22.4.ff.0.0.weight": 4194304, + "transformer.layers.22.4.ff.0.0.bias": 4096, + "transformer.layers.22.4.ff.2.weight": 4194304, + "transformer.layers.22.4.ff.2.bias": 1024, + "transformer.layers.23.0.weight": 2097152, + "transformer.layers.23.1.g": 1024, + "transformer.layers.23.2.to_q.weight": 1048576, + "transformer.layers.23.2.to_q.bias": 1024, + "transformer.layers.23.2.to_k.weight": 1048576, + "transformer.layers.23.2.to_k.bias": 1024, + "transformer.layers.23.2.to_v.weight": 1048576, + "transformer.layers.23.2.to_v.bias": 1024, + "transformer.layers.23.2.to_out.0.weight": 1048576, + "transformer.layers.23.2.to_out.0.bias": 1024, + "transformer.layers.23.3.g": 1024, + "transformer.layers.23.4.ff.0.0.weight": 4194304, + "transformer.layers.23.4.ff.0.0.bias": 4096, + "transformer.layers.23.4.ff.2.weight": 4194304, + "transformer.layers.23.4.ff.2.bias": 1024, + "transformer.layers.24.0.weight": 2097152, + "transformer.layers.24.1.g": 1024, + "transformer.layers.24.2.to_q.weight": 1048576, + "transformer.layers.24.2.to_q.bias": 1024, + "transformer.layers.24.2.to_k.weight": 1048576, + "transformer.layers.24.2.to_k.bias": 1024, + "transformer.layers.24.2.to_v.weight": 1048576, + "transformer.layers.24.2.to_v.bias": 1024, + "transformer.layers.24.2.to_out.0.weight": 1048576, + "transformer.layers.24.2.to_out.0.bias": 1024, + "transformer.layers.24.3.g": 1024, + "transformer.layers.24.4.ff.0.0.weight": 4194304, + "transformer.layers.24.4.ff.0.0.bias": 4096, + "transformer.layers.24.4.ff.2.weight": 4194304, + "transformer.layers.24.4.ff.2.bias": 1024, + "transformer.layers.25.0.weight": 2097152, + "transformer.layers.25.1.g": 1024, + "transformer.layers.25.2.to_q.weight": 1048576, + "transformer.layers.25.2.to_q.bias": 1024, + "transformer.layers.25.2.to_k.weight": 1048576, + "transformer.layers.25.2.to_k.bias": 1024, + "transformer.layers.25.2.to_v.weight": 1048576, + "transformer.layers.25.2.to_v.bias": 1024, + "transformer.layers.25.2.to_out.0.weight": 1048576, + "transformer.layers.25.2.to_out.0.bias": 1024, + "transformer.layers.25.3.g": 1024, + "transformer.layers.25.4.ff.0.0.weight": 4194304, + "transformer.layers.25.4.ff.0.0.bias": 4096, + "transformer.layers.25.4.ff.2.weight": 4194304, + "transformer.layers.25.4.ff.2.bias": 1024, + "transformer.norm_out.g": 1024, + "transformer.proj_out.weight": 102400, + "transformer.proj_out.bias": 100 + }, + "important_layers": [ + "transformer.time_embed.time_mlp.0.weight", + "transformer.time_embed.time_mlp.2.weight", + "transformer.text_embed.text_embed.weight", + "transformer.input_embed.proj.weight", + "transformer.input_embed.conv_pos_embed.conv1d.0.weight", + "transformer.input_embed.conv_pos_embed.conv1d.2.weight", + "transformer.layers.0.2.to_q.weight", + "transformer.layers.0.2.to_k.weight", + "transformer.layers.0.2.to_v.weight", + "transformer.layers.0.2.to_out.0.weight", + "transformer.layers.0.4.ff.0.0.weight", + "transformer.layers.0.4.ff.2.weight", + "transformer.layers.1.2.to_q.weight", + "transformer.layers.1.2.to_k.weight", + "transformer.layers.1.2.to_v.weight", + "transformer.layers.1.2.to_out.0.weight", + "transformer.layers.1.4.ff.0.0.weight", + "transformer.layers.1.4.ff.2.weight", + "transformer.layers.2.2.to_q.weight", + "transformer.layers.2.2.to_k.weight", + "transformer.layers.2.2.to_v.weight", + "transformer.layers.2.2.to_out.0.weight", + "transformer.layers.2.4.ff.0.0.weight", + "transformer.layers.2.4.ff.2.weight", + "transformer.layers.3.2.to_q.weight", + "transformer.layers.3.2.to_k.weight", + "transformer.layers.3.2.to_v.weight", + "transformer.layers.3.2.to_out.0.weight", + "transformer.layers.3.4.ff.0.0.weight", + "transformer.layers.3.4.ff.2.weight", + "transformer.layers.4.2.to_q.weight", + "transformer.layers.4.2.to_k.weight", + "transformer.layers.4.2.to_v.weight", + "transformer.layers.4.2.to_out.0.weight", + "transformer.layers.4.4.ff.0.0.weight", + "transformer.layers.4.4.ff.2.weight", + "transformer.layers.5.2.to_q.weight", + "transformer.layers.5.2.to_k.weight", + "transformer.layers.5.2.to_v.weight", + "transformer.layers.5.2.to_out.0.weight", + "transformer.layers.5.4.ff.0.0.weight", + "transformer.layers.5.4.ff.2.weight", + "transformer.layers.6.2.to_q.weight", + "transformer.layers.6.2.to_k.weight", + "transformer.layers.6.2.to_v.weight", + "transformer.layers.6.2.to_out.0.weight", + "transformer.layers.6.4.ff.0.0.weight", + "transformer.layers.6.4.ff.2.weight", + "transformer.layers.7.2.to_q.weight", + "transformer.layers.7.2.to_k.weight", + "transformer.layers.7.2.to_v.weight", + "transformer.layers.7.2.to_out.0.weight", + "transformer.layers.7.4.ff.0.0.weight", + "transformer.layers.7.4.ff.2.weight", + "transformer.layers.8.4.ff.0.0.weight", + "transformer.layers.8.4.ff.2.weight", + "transformer.layers.9.4.ff.0.0.weight", + "transformer.layers.9.4.ff.2.weight", + "transformer.layers.10.4.ff.0.0.weight", + "transformer.layers.10.4.ff.2.weight", + "transformer.layers.11.4.ff.0.0.weight", + "transformer.layers.11.4.ff.2.weight", + "transformer.layers.12.4.ff.0.0.weight", + "transformer.layers.12.4.ff.2.weight", + "transformer.layers.13.0.weight", + "transformer.layers.13.4.ff.0.0.weight", + "transformer.layers.13.4.ff.2.weight", + "transformer.layers.14.0.weight", + "transformer.layers.14.4.ff.0.0.weight", + "transformer.layers.14.4.ff.2.weight", + "transformer.layers.15.0.weight", + "transformer.layers.15.4.ff.0.0.weight", + "transformer.layers.15.4.ff.2.weight", + "transformer.layers.16.4.ff.0.0.weight", + "transformer.layers.16.4.ff.2.weight", + "transformer.layers.17.4.ff.0.0.weight", + "transformer.layers.17.4.ff.2.weight", + "transformer.layers.18.4.ff.0.0.weight", + "transformer.layers.18.4.ff.2.weight", + "transformer.layers.19.4.ff.0.0.weight", + "transformer.layers.19.4.ff.2.weight", + "transformer.layers.20.4.ff.0.0.weight", + "transformer.layers.20.4.ff.2.weight", + "transformer.layers.21.4.ff.0.0.weight", + "transformer.layers.21.4.ff.2.weight", + "transformer.layers.22.4.ff.0.0.weight", + "transformer.layers.22.4.ff.2.weight", + "transformer.layers.23.4.ff.0.0.weight", + "transformer.layers.23.4.ff.2.weight", + "transformer.layers.24.4.ff.0.0.weight", + "transformer.layers.24.4.ff.2.weight", + "transformer.layers.25.4.ff.0.0.weight", + "transformer.layers.25.4.ff.2.weight" + ], + "bottleneck_layers": [], + "recommendations": { + "focus_layers": [ + "transformer.time_embed.time_mlp.0.weight", + "transformer.time_embed.time_mlp.2.weight", + "transformer.text_embed.text_embed.weight", + "transformer.input_embed.proj.weight", + "transformer.input_embed.conv_pos_embed.conv1d.0.weight", + "transformer.input_embed.conv_pos_embed.conv1d.2.weight", + "transformer.layers.0.2.to_q.weight", + "transformer.layers.0.2.to_k.weight", + "transformer.layers.0.2.to_v.weight", + "transformer.layers.0.2.to_out.0.weight", + "transformer.layers.0.4.ff.0.0.weight", + "transformer.layers.0.4.ff.2.weight", + "transformer.layers.1.2.to_q.weight", + "transformer.layers.1.2.to_k.weight", + "transformer.layers.1.2.to_v.weight", + "transformer.layers.1.2.to_out.0.weight", + "transformer.layers.1.4.ff.0.0.weight", + "transformer.layers.1.4.ff.2.weight", + "transformer.layers.2.2.to_q.weight", + "transformer.layers.2.2.to_k.weight", + "transformer.layers.2.2.to_v.weight", + "transformer.layers.2.2.to_out.0.weight", + "transformer.layers.2.4.ff.0.0.weight", + "transformer.layers.2.4.ff.2.weight", + "transformer.layers.3.2.to_q.weight", + "transformer.layers.3.2.to_k.weight", + "transformer.layers.3.2.to_v.weight", + "transformer.layers.3.2.to_out.0.weight", + "transformer.layers.3.4.ff.0.0.weight", + "transformer.layers.3.4.ff.2.weight", + "transformer.layers.4.2.to_q.weight", + "transformer.layers.4.2.to_k.weight", + "transformer.layers.4.2.to_v.weight", + "transformer.layers.4.2.to_out.0.weight", + "transformer.layers.4.4.ff.0.0.weight", + "transformer.layers.4.4.ff.2.weight", + "transformer.layers.5.2.to_q.weight", + "transformer.layers.5.2.to_k.weight", + "transformer.layers.5.2.to_v.weight", + "transformer.layers.5.2.to_out.0.weight", + "transformer.layers.5.4.ff.0.0.weight", + "transformer.layers.5.4.ff.2.weight", + "transformer.layers.6.2.to_q.weight", + "transformer.layers.6.2.to_k.weight", + "transformer.layers.6.2.to_v.weight", + "transformer.layers.6.2.to_out.0.weight", + "transformer.layers.6.4.ff.0.0.weight", + "transformer.layers.6.4.ff.2.weight", + "transformer.layers.7.2.to_q.weight", + "transformer.layers.7.2.to_k.weight", + "transformer.layers.7.2.to_v.weight", + "transformer.layers.7.2.to_out.0.weight", + "transformer.layers.7.4.ff.0.0.weight", + "transformer.layers.7.4.ff.2.weight", + "transformer.layers.8.4.ff.0.0.weight", + "transformer.layers.8.4.ff.2.weight", + "transformer.layers.9.4.ff.0.0.weight", + "transformer.layers.9.4.ff.2.weight", + "transformer.layers.10.4.ff.0.0.weight", + "transformer.layers.10.4.ff.2.weight", + "transformer.layers.11.4.ff.0.0.weight", + "transformer.layers.11.4.ff.2.weight", + "transformer.layers.12.4.ff.0.0.weight", + "transformer.layers.12.4.ff.2.weight", + "transformer.layers.13.0.weight", + "transformer.layers.13.4.ff.0.0.weight", + "transformer.layers.13.4.ff.2.weight", + "transformer.layers.14.0.weight", + "transformer.layers.14.4.ff.0.0.weight", + "transformer.layers.14.4.ff.2.weight", + "transformer.layers.15.0.weight", + "transformer.layers.15.4.ff.0.0.weight", + "transformer.layers.15.4.ff.2.weight", + "transformer.layers.16.4.ff.0.0.weight", + "transformer.layers.16.4.ff.2.weight", + "transformer.layers.17.4.ff.0.0.weight", + "transformer.layers.17.4.ff.2.weight", + "transformer.layers.18.4.ff.0.0.weight", + "transformer.layers.18.4.ff.2.weight", + "transformer.layers.19.4.ff.0.0.weight", + "transformer.layers.19.4.ff.2.weight", + "transformer.layers.20.4.ff.0.0.weight", + "transformer.layers.20.4.ff.2.weight", + "transformer.layers.21.4.ff.0.0.weight", + "transformer.layers.21.4.ff.2.weight", + "transformer.layers.22.4.ff.0.0.weight", + "transformer.layers.22.4.ff.2.weight", + "transformer.layers.23.4.ff.0.0.weight", + "transformer.layers.23.4.ff.2.weight", + "transformer.layers.24.4.ff.0.0.weight", + "transformer.layers.24.4.ff.2.weight", + "transformer.layers.25.4.ff.0.0.weight", + "transformer.layers.25.4.ff.2.weight" + ] + }, + "total_parameters": 391, + "total_elements": 360755948, + "param_ranges": { + "transformer.time_embed.time_mlp.0.weight": { + "min": -0.43031466007232666, + "max": 0.298143208026886, + "mean": -0.0025431362446397543, + "std": 0.042562514543533325, + "sparsity": 0.0, + "shape": [ + 1024, + 256 + ] + }, + "transformer.time_embed.time_mlp.0.bias": { + "min": -0.0631568506360054, + "max": 0.10771193355321884, + "mean": 0.0006426331819966435, + "std": 0.03407834470272064, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.time_embed.time_mlp.2.weight": { + "min": -0.4127056896686554, + "max": 0.8369137644767761, + "mean": -0.00020141302957199514, + "std": 0.024111632257699966, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.time_embed.time_mlp.2.bias": { + "min": -0.11548846960067749, + "max": 0.3221578299999237, + "mean": -0.0009410656057298183, + "std": 0.019580261781811714, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.text_embed.text_embed.weight": { + "min": -2.7946255207061768, + "max": 2.873370885848999, + "mean": -0.0003634353051893413, + "std": 0.6154844164848328, + "sparsity": 0.0, + "shape": [ + 2546, + 100 + ] + }, + "transformer.input_embed.proj.weight": { + "min": -0.2794482707977295, + "max": 0.38173243403434753, + "mean": 0.0004242636787239462, + "std": 0.042748358100652695, + "sparsity": 0.0, + "shape": [ + 1024, + 300 + ] + }, + "transformer.input_embed.proj.bias": { + "min": -0.22289495170116425, + "max": 0.21001911163330078, + "mean": -0.004489608108997345, + "std": 0.040950216352939606, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.input_embed.conv_pos_embed.conv1d.0.weight": { + "min": -0.4283224046230316, + "max": 0.4761110544204712, + "mean": 3.962942628277233e-06, + "std": 0.02451062761247158, + "sparsity": 0.0, + "shape": [ + 1024, + 64, + 31 + ] + }, + "transformer.input_embed.conv_pos_embed.conv1d.0.bias": { + "min": -0.32575708627700806, + "max": 0.1571168750524521, + "mean": -0.04673216491937637, + "std": 0.051645807921886444, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.input_embed.conv_pos_embed.conv1d.2.weight": { + "min": -0.4105567932128906, + "max": 0.3547790050506592, + "mean": -0.0001310346560785547, + "std": 0.02360442653298378, + "sparsity": 0.0, + "shape": [ + 1024, + 64, + 31 + ] + }, + "transformer.input_embed.conv_pos_embed.conv1d.2.bias": { + "min": -0.23018451035022736, + "max": 0.2630932033061981, + "mean": -0.029156308621168137, + "std": 0.04940544068813324, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.0.1.g": { + "min": 0.2545531988143921, + "max": 0.8213090300559998, + "mean": 0.5256362557411194, + "std": 0.08106369525194168, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.0.2.to_q.weight": { + "min": -0.2971626818180084, + "max": 0.26604607701301575, + "mean": -0.0004256928223185241, + "std": 0.03210251033306122, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.0.2.to_q.bias": { + "min": -0.09291917830705643, + "max": 0.1250312328338623, + "mean": 0.0006477435817942023, + "std": 0.025753259658813477, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.0.2.to_k.weight": { + "min": -0.29085373878479004, + "max": 0.28159603476524353, + "mean": -7.506589463446289e-05, + "std": 0.030931703746318817, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.0.2.to_k.bias": { + "min": -5.906967639923096, + "max": 5.821649074554443, + "mean": -0.009350163862109184, + "std": 1.296647071838379, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.0.2.to_v.weight": { + "min": -0.42530331015586853, + "max": 0.3440260589122772, + "mean": 9.807322931010276e-05, + "std": 0.02995346300303936, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.0.2.to_v.bias": { + "min": -0.029081525281071663, + "max": 0.02767445333302021, + "mean": -0.00032374687725678086, + "std": 0.012576405890285969, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.0.2.to_out.0.weight": { + "min": -0.45424115657806396, + "max": 0.4482896625995636, + "mean": 2.3885608243290335e-05, + "std": 0.02385384775698185, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.0.2.to_out.0.bias": { + "min": -0.08883396536111832, + "max": 0.09114022552967072, + "mean": 0.00228882092051208, + "std": 0.01952745020389557, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.0.3.g": { + "min": 0.26677191257476807, + "max": 1.0577468872070312, + "mean": 0.53135746717453, + "std": 0.10473316162824631, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.0.4.ff.0.0.weight": { + "min": -0.5746102333068848, + "max": 0.6084363460540771, + "mean": -0.00043127068784087896, + "std": 0.03860073536634445, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.0.4.ff.0.0.bias": { + "min": -0.18297578394412994, + "max": 0.0456179715692997, + "mean": -0.029477983713150024, + "std": 0.042657021433115005, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.0.4.ff.2.weight": { + "min": -1.1673263311386108, + "max": 1.6341116428375244, + "mean": 0.00032315164571627975, + "std": 0.02769668586552143, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.0.4.ff.2.bias": { + "min": -0.16250400245189667, + "max": 0.20589958131313324, + "mean": -0.02113456465303898, + "std": 0.027959568426012993, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.1.1.g": { + "min": 0.22410069406032562, + "max": 0.8451111912727356, + "mean": 0.48777928948402405, + "std": 0.07542530447244644, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.1.2.to_q.weight": { + "min": -0.25582441687583923, + "max": 0.30595168471336365, + "mean": -6.705071427859366e-06, + "std": 0.03347504884004593, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.1.2.to_q.bias": { + "min": -0.09550327807664871, + "max": 0.11064136773347855, + "mean": 6.668796413578093e-05, + "std": 0.026976482942700386, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.1.2.to_k.weight": { + "min": -0.2973037660121918, + "max": 0.29644775390625, + "mean": 5.341449286788702e-05, + "std": 0.032546162605285645, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.1.2.to_k.bias": { + "min": -5.17097806930542, + "max": 5.091113090515137, + "mean": -0.01462231483310461, + "std": 1.1586002111434937, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.1.2.to_v.weight": { + "min": -0.34501704573631287, + "max": 0.34340131282806396, + "mean": 7.8546792792622e-05, + "std": 0.030061908066272736, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.1.2.to_v.bias": { + "min": -0.036109186708927155, + "max": 0.03340720757842064, + "mean": -0.00014173206000123173, + "std": 0.013041709549725056, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.1.2.to_out.0.weight": { + "min": -0.3156168460845947, + "max": 0.3752053380012512, + "mean": -2.0681722162407823e-05, + "std": 0.02405940182507038, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.1.2.to_out.0.bias": { + "min": -0.10555326193571091, + "max": 0.12231862545013428, + "mean": -0.0019678983371704817, + "std": 0.028872456401586533, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.1.3.g": { + "min": 0.3113996386528015, + "max": 1.1224051713943481, + "mean": 0.6664633750915527, + "std": 0.0980152115225792, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.1.4.ff.0.0.weight": { + "min": -0.8727887272834778, + "max": 0.6275914907455444, + "mean": 0.0016750607173889875, + "std": 0.047438763082027435, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.1.4.ff.0.0.bias": { + "min": -0.27183517813682556, + "max": 0.034259725362062454, + "mean": -0.046628981828689575, + "std": 0.04063701629638672, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.1.4.ff.2.weight": { + "min": -0.9230329394340515, + "max": 0.9648618102073669, + "mean": 0.0010213888017460704, + "std": 0.04070665314793587, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.1.4.ff.2.bias": { + "min": -0.14468412101268768, + "max": 0.07505139708518982, + "mean": -0.009096229448914528, + "std": 0.025706371292471886, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.2.1.g": { + "min": 0.24036771059036255, + "max": 0.7140315771102905, + "mean": 0.4473647475242615, + "std": 0.05951203405857086, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.2.2.to_q.weight": { + "min": -0.27264565229415894, + "max": 0.29809534549713135, + "mean": 9.332510671811178e-06, + "std": 0.03546958044171333, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.2.2.to_q.bias": { + "min": -0.11950661987066269, + "max": 0.11869802325963974, + "mean": 0.0007616454968228936, + "std": 0.02764517441391945, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.2.2.to_k.weight": { + "min": -0.2813079059123993, + "max": 0.28023794293403625, + "mean": -7.719700079178438e-05, + "std": 0.0350990891456604, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.2.2.to_k.bias": { + "min": -2.5128581523895264, + "max": 2.524867296218872, + "mean": 0.026786239817738533, + "std": 0.5873143672943115, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.2.2.to_v.weight": { + "min": -0.2213059961795807, + "max": 0.2717853784561157, + "mean": 2.9610819183290005e-06, + "std": 0.030732687562704086, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.2.2.to_v.bias": { + "min": -0.03361261636018753, + "max": 0.03129349276423454, + "mean": 0.00011305588122922927, + "std": 0.012413612566888332, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.2.2.to_out.0.weight": { + "min": -0.23544403910636902, + "max": 0.23186075687408447, + "mean": 5.69116891711019e-05, + "std": 0.025696195662021637, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.2.2.to_out.0.bias": { + "min": -0.13601461052894592, + "max": 0.12754406034946442, + "mean": -0.005499254446476698, + "std": 0.03998684883117676, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.2.3.g": { + "min": 0.35436785221099854, + "max": 1.1737076044082642, + "mean": 0.7108283638954163, + "std": 0.10403098911046982, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.2.4.ff.0.0.weight": { + "min": -0.6176053881645203, + "max": 0.5545136332511902, + "mean": 0.0011602240847423673, + "std": 0.04611964151263237, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.2.4.ff.0.0.bias": { + "min": -0.18876661360263824, + "max": 0.024967461824417114, + "mean": -0.03485583886504173, + "std": 0.028641268610954285, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.2.4.ff.2.weight": { + "min": -1.1323436498641968, + "max": 0.9720706939697266, + "mean": 0.00035946519346907735, + "std": 0.042347442358732224, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.2.4.ff.2.bias": { + "min": -0.598772406578064, + "max": 0.06287988275289536, + "mean": -0.004880873020738363, + "std": 0.028635544702410698, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.3.1.g": { + "min": 0.3750710189342499, + "max": 0.9418790340423584, + "mean": 0.5926927328109741, + "std": 0.06721659004688263, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.3.2.to_q.weight": { + "min": -0.3915771543979645, + "max": 0.3692559599876404, + "mean": 7.123942486941814e-05, + "std": 0.03718866407871246, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.3.2.to_q.bias": { + "min": -0.11907870322465897, + "max": 0.13665802776813507, + "mean": 0.0009319179225713015, + "std": 0.02926611341536045, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.3.2.to_k.weight": { + "min": -0.619708240032196, + "max": 0.5092929005622864, + "mean": 1.5245183021761477e-05, + "std": 0.03644217178225517, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.3.2.to_k.bias": { + "min": -8.197783470153809, + "max": 8.800565719604492, + "mean": -0.10938873887062073, + "std": 1.7007076740264893, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.3.2.to_v.weight": { + "min": -0.2768491804599762, + "max": 0.2400088757276535, + "mean": 5.314283407642506e-05, + "std": 0.032615404576063156, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.3.2.to_v.bias": { + "min": -0.05213421210646629, + "max": 0.03957239165902138, + "mean": 9.133941057370976e-05, + "std": 0.012963276356458664, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.3.2.to_out.0.weight": { + "min": -0.23089444637298584, + "max": 0.2348451018333435, + "mean": -2.176157067879103e-05, + "std": 0.029391760006546974, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.3.2.to_out.0.bias": { + "min": -0.20456741750240326, + "max": 0.10572919249534607, + "mean": -0.00402758177369833, + "std": 0.03263704851269722, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.3.3.g": { + "min": 0.3400026261806488, + "max": 1.0141218900680542, + "mean": 0.7010252475738525, + "std": 0.09696138650178909, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.3.4.ff.0.0.weight": { + "min": -0.5654259324073792, + "max": 0.8335409760475159, + "mean": 0.0004151407047174871, + "std": 0.04230234771966934, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.3.4.ff.0.0.bias": { + "min": -0.2119237780570984, + "max": 0.030580509454011917, + "mean": -0.03220224380493164, + "std": 0.026535935699939728, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.3.4.ff.2.weight": { + "min": -0.7552511096000671, + "max": 0.7191816568374634, + "mean": -9.422379662282765e-06, + "std": 0.036842163652181625, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.3.4.ff.2.bias": { + "min": -0.26383838057518005, + "max": 0.10599514842033386, + "mean": -0.0030335707124322653, + "std": 0.028880203142762184, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.4.1.g": { + "min": 0.28429752588272095, + "max": 0.6961002945899963, + "mean": 0.49966490268707275, + "std": 0.046708256006240845, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.4.2.to_q.weight": { + "min": -0.27950623631477356, + "max": 0.23444026708602905, + "mean": -0.0001112212921725586, + "std": 0.03876311331987381, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.4.2.to_q.bias": { + "min": -0.15468573570251465, + "max": 0.12698474526405334, + "mean": -0.0022345406468957663, + "std": 0.033433251082897186, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.4.2.to_k.weight": { + "min": -0.41459208726882935, + "max": 0.6603645086288452, + "mean": -1.977803731278982e-05, + "std": 0.03910015523433685, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.4.2.to_k.bias": { + "min": -4.243562698364258, + "max": 4.728666305541992, + "mean": -0.020446542650461197, + "std": 1.0085786581039429, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.4.2.to_v.weight": { + "min": -0.24519944190979004, + "max": 0.2077825665473938, + "mean": 4.388581874081865e-05, + "std": 0.033966176211833954, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.4.2.to_v.bias": { + "min": -0.034593358635902405, + "max": 0.04485077038407326, + "mean": -1.7529440810903907e-05, + "std": 0.012629235163331032, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.4.2.to_out.0.weight": { + "min": -0.20095357298851013, + "max": 0.20613527297973633, + "mean": -2.959615085273981e-05, + "std": 0.03102371282875538, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.4.2.to_out.0.bias": { + "min": -0.20019184052944183, + "max": 0.11357004940509796, + "mean": -0.0029205437749624252, + "std": 0.034529101103544235, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.4.3.g": { + "min": 0.36704930663108826, + "max": 1.058448076248169, + "mean": 0.6707465052604675, + "std": 0.0665469765663147, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.4.4.ff.0.0.weight": { + "min": -0.3986629843711853, + "max": 0.5028019547462463, + "mean": -3.858951822621748e-05, + "std": 0.04113718494772911, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.4.4.ff.0.0.bias": { + "min": -0.12819068133831024, + "max": 0.026764869689941406, + "mean": -0.03055746480822563, + "std": 0.021891731768846512, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.4.4.ff.2.weight": { + "min": -0.44944334030151367, + "max": 0.43338072299957275, + "mean": 8.373618766199797e-05, + "std": 0.03489609435200691, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.4.4.ff.2.bias": { + "min": -0.2679402530193329, + "max": 0.07267966121435165, + "mean": -0.0011121004354208708, + "std": 0.023136794567108154, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.5.1.g": { + "min": 0.2874027192592621, + "max": 0.6862822771072388, + "mean": 0.5247019529342651, + "std": 0.047706179320812225, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.5.2.to_q.weight": { + "min": -0.22247114777565002, + "max": 0.2237931489944458, + "mean": 1.5673409507144243e-05, + "std": 0.03895280137658119, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.5.2.to_q.bias": { + "min": -0.13664273917675018, + "max": 0.10935632139444351, + "mean": 0.00023680762387812138, + "std": 0.029263831675052643, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.5.2.to_k.weight": { + "min": -0.37552782893180847, + "max": 0.43765556812286377, + "mean": -9.529509043204598e-06, + "std": 0.0392889641225338, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.5.2.to_k.bias": { + "min": -3.8507211208343506, + "max": 5.005820274353027, + "mean": 0.00975782610476017, + "std": 0.8459950685501099, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.5.2.to_v.weight": { + "min": -0.2234737128019333, + "max": 0.22026528418064117, + "mean": -2.2568747226614505e-07, + "std": 0.03441343083977699, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.5.2.to_v.bias": { + "min": -0.043700210750103, + "max": 0.0358847938477993, + "mean": -0.0002585579641163349, + "std": 0.012083812616765499, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.5.2.to_out.0.weight": { + "min": -0.21352721750736237, + "max": 0.1891147494316101, + "mean": -1.673133192525711e-05, + "std": 0.031540192663669586, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.5.2.to_out.0.bias": { + "min": -0.18098995089530945, + "max": 0.12096531689167023, + "mean": -0.0024120290763676167, + "std": 0.04128490760922432, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.5.3.g": { + "min": 0.4226054847240448, + "max": 0.9433368444442749, + "mean": 0.6629081964492798, + "std": 0.056974004954099655, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.5.4.ff.0.0.weight": { + "min": -0.37151336669921875, + "max": 0.4759024977684021, + "mean": -8.223902113968506e-05, + "std": 0.040896181017160416, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.5.4.ff.0.0.bias": { + "min": -0.20840628445148468, + "max": 0.02712824009358883, + "mean": -0.030254749581217766, + "std": 0.02136547490954399, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.5.4.ff.2.weight": { + "min": -0.3405216336250305, + "max": 0.7342746257781982, + "mean": 8.478653035126626e-05, + "std": 0.03477146103978157, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.5.4.ff.2.bias": { + "min": -0.2405085265636444, + "max": 0.05050582066178322, + "mean": -0.0011980931740254164, + "std": 0.02047325111925602, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.6.1.g": { + "min": 0.305998831987381, + "max": 0.6545577049255371, + "mean": 0.525275707244873, + "std": 0.0462840236723423, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.6.2.to_q.weight": { + "min": -0.30443698167800903, + "max": 0.2175063043832779, + "mean": 6.991640839260072e-05, + "std": 0.03949848935008049, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.6.2.to_q.bias": { + "min": -0.1496177613735199, + "max": 0.1315852701663971, + "mean": 0.00034793667146004736, + "std": 0.030498284846544266, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.6.2.to_k.weight": { + "min": -0.25779959559440613, + "max": 0.2024526447057724, + "mean": 3.095036663580686e-05, + "std": 0.039487626403570175, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.6.2.to_k.bias": { + "min": -2.3393359184265137, + "max": 2.3790037631988525, + "mean": -0.02626325562596321, + "std": 0.4501512348651886, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.6.2.to_v.weight": { + "min": -0.189274862408638, + "max": 0.2107497602701187, + "mean": 3.7229168810881674e-05, + "std": 0.03479816019535065, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.6.2.to_v.bias": { + "min": -0.03176194056868553, + "max": 0.035539623349905014, + "mean": -0.00020054224296472967, + "std": 0.012292396277189255, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.6.2.to_out.0.weight": { + "min": -0.18866902589797974, + "max": 0.17066700756549835, + "mean": -6.797373498557135e-05, + "std": 0.032174721360206604, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.6.2.to_out.0.bias": { + "min": -0.13956007361412048, + "max": 0.13746821880340576, + "mean": -0.0025175614282488823, + "std": 0.0513296015560627, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.6.3.g": { + "min": 0.4674248695373535, + "max": 0.957923948764801, + "mean": 0.6691091656684875, + "std": 0.052978649735450745, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.6.4.ff.0.0.weight": { + "min": -0.32444727420806885, + "max": 0.3098219633102417, + "mean": -1.5040723155834712e-06, + "std": 0.040952056646347046, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.6.4.ff.0.0.bias": { + "min": -0.12495888024568558, + "max": 0.025304077193140984, + "mean": -0.03072468377649784, + "std": 0.019833404570817947, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.6.4.ff.2.weight": { + "min": -0.44051459431648254, + "max": 0.44567734003067017, + "mean": 9.530649549560621e-05, + "std": 0.03512415289878845, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.6.4.ff.2.bias": { + "min": -0.2248658984899521, + "max": 0.05171418562531471, + "mean": -0.0011846581473946571, + "std": 0.018478091806173325, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.7.1.g": { + "min": 0.33937862515449524, + "max": 0.7403524518013, + "mean": 0.5588580369949341, + "std": 0.041548021137714386, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.7.2.to_q.weight": { + "min": -0.27266961336135864, + "max": 0.2785436511039734, + "mean": 1.9886707377736457e-05, + "std": 0.041062381118535995, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.7.2.to_q.bias": { + "min": -0.13720278441905975, + "max": 0.1400555521249771, + "mean": 0.0004891848657280207, + "std": 0.026654429733753204, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.7.2.to_k.weight": { + "min": -0.4912598729133606, + "max": 0.3564285337924957, + "mean": 8.880282985046506e-05, + "std": 0.040700383484363556, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.7.2.to_k.bias": { + "min": -2.3000996112823486, + "max": 1.7473976612091064, + "mean": -0.021102074533700943, + "std": 0.5005303025245667, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.7.2.to_v.weight": { + "min": -0.21771195530891418, + "max": 0.19800876080989838, + "mean": -4.054907913086936e-05, + "std": 0.03423738107085228, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.7.2.to_v.bias": { + "min": -0.04137791320681572, + "max": 0.03871942684054375, + "mean": -0.00014505225408356637, + "std": 0.012883453629910946, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.7.2.to_out.0.weight": { + "min": -0.1777161806821823, + "max": 0.1839223951101303, + "mean": 4.761077434523031e-05, + "std": 0.03156030550599098, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.7.2.to_out.0.bias": { + "min": -0.1801164597272873, + "max": 0.18409180641174316, + "mean": -0.002218745881691575, + "std": 0.05486130341887474, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.7.3.g": { + "min": 0.4742484390735626, + "max": 1.027018666267395, + "mean": 0.6454694271087646, + "std": 0.050571199506521225, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.7.4.ff.0.0.weight": { + "min": -0.27197960019111633, + "max": 0.3094431757926941, + "mean": 0.00011241070023970678, + "std": 0.0406884104013443, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.7.4.ff.0.0.bias": { + "min": -0.10593951493501663, + "max": 0.026867138221859932, + "mean": -0.02952626720070839, + "std": 0.0179454255849123, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.7.4.ff.2.weight": { + "min": -0.339232474565506, + "max": 0.32961946725845337, + "mean": 5.7173179811798036e-05, + "std": 0.03441809490323067, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.7.4.ff.2.bias": { + "min": -0.1818968504667282, + "max": 0.04209613800048828, + "mean": -0.001073765684850514, + "std": 0.017224203795194626, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.8.1.g": { + "min": 0.3253825902938843, + "max": 0.6876205801963806, + "mean": 0.5113766193389893, + "std": 0.03712678700685501, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.8.2.to_q.weight": { + "min": -0.2340041846036911, + "max": 0.22588428854942322, + "mean": -3.603727600420825e-05, + "std": 0.03918161243200302, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.8.2.to_q.bias": { + "min": -0.11547420918941498, + "max": 0.13177312910556793, + "mean": 0.00015100545715540648, + "std": 0.029211556538939476, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.8.2.to_k.weight": { + "min": -0.353280246257782, + "max": 0.28580334782600403, + "mean": 7.311312401725445e-06, + "std": 0.03925010561943054, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.8.2.to_k.bias": { + "min": -4.137877941131592, + "max": 3.5483016967773438, + "mean": -0.011621923185884953, + "std": 0.6833143830299377, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.8.2.to_v.weight": { + "min": -0.21149367094039917, + "max": 0.20919673144817352, + "mean": 3.474394543445669e-05, + "std": 0.034489404410123825, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.8.2.to_v.bias": { + "min": -0.0357508510351181, + "max": 0.048132169991731644, + "mean": 0.0007945147808641195, + "std": 0.012859269045293331, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.8.2.to_out.0.weight": { + "min": -0.21085014939308167, + "max": 0.19338075816631317, + "mean": -1.279619482374983e-06, + "std": 0.03169989585876465, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.8.2.to_out.0.bias": { + "min": -0.18688012659549713, + "max": 0.17741110920906067, + "mean": -0.0028487846720963717, + "std": 0.05866115912795067, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.8.3.g": { + "min": 0.4747392237186432, + "max": 1.0433117151260376, + "mean": 0.6515810489654541, + "std": 0.04988763853907585, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.8.4.ff.0.0.weight": { + "min": -0.2485654354095459, + "max": 0.32921651005744934, + "mean": 0.00018060754518955946, + "std": 0.04057681933045387, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.8.4.ff.0.0.bias": { + "min": -0.12461096793413162, + "max": 0.024597609415650368, + "mean": -0.030512426048517227, + "std": 0.017616724595427513, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.8.4.ff.2.weight": { + "min": -0.42169103026390076, + "max": 0.4825250208377838, + "mean": 2.1487815047294134e-06, + "std": 0.03540307283401489, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.8.4.ff.2.bias": { + "min": -0.15202857553958893, + "max": 0.04342101141810417, + "mean": 3.956547880079597e-05, + "std": 0.014885293319821358, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.9.1.g": { + "min": 0.315530002117157, + "max": 0.6829717755317688, + "mean": 0.5530707240104675, + "std": 0.04085434973239899, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.9.2.to_q.weight": { + "min": -0.20659124851226807, + "max": 0.2201390564441681, + "mean": 3.096506407018751e-05, + "std": 0.03830333426594734, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.9.2.to_q.bias": { + "min": -0.1380155086517334, + "max": 0.11290067434310913, + "mean": 2.059592225123197e-05, + "std": 0.025836361572146416, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.9.2.to_k.weight": { + "min": -0.40320274233818054, + "max": 0.37160059809684753, + "mean": 2.6222376618534327e-05, + "std": 0.03818517550826073, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.9.2.to_k.bias": { + "min": -3.775665044784546, + "max": 2.872361421585083, + "mean": 0.0011700298637151718, + "std": 0.5173272490501404, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.9.2.to_v.weight": { + "min": -0.2030554711818695, + "max": 0.19753621518611908, + "mean": 2.9474727853084914e-05, + "std": 0.03430046886205673, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.9.2.to_v.bias": { + "min": -0.05103779584169388, + "max": 0.04008523374795914, + "mean": -0.000419780844822526, + "std": 0.013429902493953705, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.9.2.to_out.0.weight": { + "min": -0.19672255218029022, + "max": 0.20196260511875153, + "mean": -1.2339524801063817e-05, + "std": 0.03180818632245064, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.9.2.to_out.0.bias": { + "min": -0.19336175918579102, + "max": 0.19535411894321442, + "mean": -0.0029691390227526426, + "std": 0.06259549409151077, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.9.3.g": { + "min": 0.34919390082359314, + "max": 1.0855821371078491, + "mean": 0.6673611998558044, + "std": 0.055458005517721176, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.9.4.ff.0.0.weight": { + "min": -0.22532346844673157, + "max": 0.2517567276954651, + "mean": 0.0003590356500353664, + "std": 0.04076584428548813, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.9.4.ff.0.0.bias": { + "min": -0.09113647788763046, + "max": 0.04372163116931915, + "mean": -0.030099857598543167, + "std": 0.01762346550822258, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.9.4.ff.2.weight": { + "min": -0.3537713587284088, + "max": 0.3043927252292633, + "mean": -4.351784446043894e-05, + "std": 0.03712814301252365, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.9.4.ff.2.bias": { + "min": -0.1622427999973297, + "max": 0.0636076033115387, + "mean": -8.386171248275787e-05, + "std": 0.019415445625782013, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.10.1.g": { + "min": 0.34875378012657166, + "max": 0.7230772972106934, + "mean": 0.542546272277832, + "std": 0.03922481834888458, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.10.2.to_q.weight": { + "min": -0.21956898272037506, + "max": 0.22326983511447906, + "mean": -1.1109572369605303e-05, + "std": 0.03923607990145683, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.10.2.to_q.bias": { + "min": -0.11858610808849335, + "max": 0.1710456758737564, + "mean": 0.00028452256810851395, + "std": 0.025138530880212784, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.10.2.to_k.weight": { + "min": -0.24716253578662872, + "max": 0.30147185921669006, + "mean": -3.647191624622792e-05, + "std": 0.03893563523888588, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.10.2.to_k.bias": { + "min": -3.5094945430755615, + "max": 3.7191741466522217, + "mean": 0.015858110040426254, + "std": 0.7832505702972412, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.10.2.to_v.weight": { + "min": -0.21879123151302338, + "max": 0.2377484291791916, + "mean": -1.353577317786403e-05, + "std": 0.03630785644054413, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.10.2.to_v.bias": { + "min": -0.04725177586078644, + "max": 0.05147033557295799, + "mean": 0.00048084836453199387, + "std": 0.01352026965469122, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.10.2.to_out.0.weight": { + "min": -0.21421915292739868, + "max": 0.21782870590686798, + "mean": 5.651723040500656e-05, + "std": 0.03361982852220535, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.10.2.to_out.0.bias": { + "min": -0.2116560935974121, + "max": 0.23178474605083466, + "mean": -0.005108034238219261, + "std": 0.06190710514783859, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.10.3.g": { + "min": 0.3619433343410492, + "max": 1.1028457880020142, + "mean": 0.6994728446006775, + "std": 0.05383099243044853, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.10.4.ff.0.0.weight": { + "min": -0.2347707897424698, + "max": 0.24507476389408112, + "mean": 0.00046346502494998276, + "std": 0.041274722665548325, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.10.4.ff.0.0.bias": { + "min": -0.098201684653759, + "max": 0.06837960332632065, + "mean": -0.031449105590581894, + "std": 0.01813678629696369, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.10.4.ff.2.weight": { + "min": -0.3019881546497345, + "max": 0.351855993270874, + "mean": -8.162805897882208e-05, + "std": 0.040280550718307495, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.10.4.ff.2.bias": { + "min": -0.1525154411792755, + "max": 0.14985136687755585, + "mean": 0.0002546610194258392, + "std": 0.02304759994149208, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.11.1.g": { + "min": 0.9988279342651367, + "max": 1.0030174255371094, + "mean": 1.0003814697265625, + "std": 0.0010646688751876354, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.11.2.to_q.weight": { + "min": -0.03128192201256752, + "max": 0.031278640031814575, + "mean": -1.9287415852886625e-05, + "std": 0.01804400235414505, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.11.2.to_q.bias": { + "min": -0.031218387186527252, + "max": 0.03101835958659649, + "mean": -0.0010843591298907995, + "std": 0.01795342192053795, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.11.2.to_k.weight": { + "min": -0.031292207539081573, + "max": 0.03128044679760933, + "mean": 3.544726496329531e-06, + "std": 0.018044408410787582, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.11.2.to_k.bias": { + "min": -0.031148849055171013, + "max": 0.031187163665890694, + "mean": 0.000333936681272462, + "std": 0.01806570589542389, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.11.2.to_v.weight": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.11.2.to_v.bias": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.11.2.to_out.0.weight": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.11.2.to_out.0.bias": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.11.3.g": { + "min": 0.9988681674003601, + "max": 1.0030490159988403, + "mean": 1.0004115104675293, + "std": 0.0010549556463956833, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.11.4.ff.0.0.weight": { + "min": -0.031293854117393494, + "max": 0.03129155561327934, + "mean": -8.391638402827084e-06, + "std": 0.018043123185634613, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.11.4.ff.0.0.bias": { + "min": -0.031248562037944794, + "max": 0.03123636171221733, + "mean": 0.00015367052401416004, + "std": 0.017994463443756104, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.11.4.ff.2.weight": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.11.4.ff.2.bias": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.12.1.g": { + "min": 0.382835328578949, + "max": 0.7205657362937927, + "mean": 0.5808628797531128, + "std": 0.03902854025363922, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.12.2.to_q.weight": { + "min": -0.23823925852775574, + "max": 0.1967414915561676, + "mean": 2.6552535928203724e-05, + "std": 0.03746962919831276, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.12.2.to_q.bias": { + "min": -0.11881034076213837, + "max": 0.16626670956611633, + "mean": 0.000991516513749957, + "std": 0.027575215324759483, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.12.2.to_k.weight": { + "min": -0.24632981419563293, + "max": 0.5012024641036987, + "mean": -5.04429881402757e-05, + "std": 0.03762752190232277, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.12.2.to_k.bias": { + "min": -3.946824312210083, + "max": 3.773773670196533, + "mean": -0.0035694693215191364, + "std": 0.6819667816162109, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.12.2.to_v.weight": { + "min": -0.22745896875858307, + "max": 0.2515793740749359, + "mean": -1.1545061170181725e-05, + "std": 0.03743903711438179, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.12.2.to_v.bias": { + "min": -0.07179750502109528, + "max": 0.0807880237698555, + "mean": -0.0005204002372920513, + "std": 0.015668606385588646, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.12.2.to_out.0.weight": { + "min": -0.22822564840316772, + "max": 0.25826144218444824, + "mean": -2.862494147848338e-05, + "std": 0.03542570024728775, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.12.2.to_out.0.bias": { + "min": -0.2006409764289856, + "max": 0.21548894047737122, + "mean": -0.005540885496884584, + "std": 0.06836719810962677, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.12.3.g": { + "min": 0.40525123476982117, + "max": 1.1910948753356934, + "mean": 0.7381879091262817, + "std": 0.05550322309136391, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.12.4.ff.0.0.weight": { + "min": -0.2213674634695053, + "max": 0.2461645007133484, + "mean": 0.0005210727686062455, + "std": 0.04134247452020645, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.12.4.ff.0.0.bias": { + "min": -0.10357673466205597, + "max": 0.02419574372470379, + "mean": -0.03268023580312729, + "std": 0.01890200562775135, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.12.4.ff.2.weight": { + "min": -0.44974827766418457, + "max": 0.42273736000061035, + "mean": -0.00043248123256489635, + "std": 0.046903859823942184, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.12.4.ff.2.bias": { + "min": -0.2517695128917694, + "max": 0.4706769287586212, + "mean": 0.003199656493961811, + "std": 0.04457153007388115, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.13.0.weight": { + "min": -0.3170992434024811, + "max": 0.333298921585083, + "mean": -2.5289473342127167e-05, + "std": 0.021290816366672516, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.13.1.g": { + "min": 0.32478415966033936, + "max": 0.6863877177238464, + "mean": 0.5711605548858643, + "std": 0.04484730586409569, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.13.2.to_q.weight": { + "min": -0.1647796630859375, + "max": 0.17416934669017792, + "mean": -4.8634105041855946e-05, + "std": 0.03318461403250694, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.13.2.to_q.bias": { + "min": -0.1870798021554947, + "max": 0.14308109879493713, + "mean": 3.898901923093945e-05, + "std": 0.02971462905406952, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.13.2.to_k.weight": { + "min": -0.38088855147361755, + "max": 0.2463647872209549, + "mean": -9.938010407495312e-06, + "std": 0.03276585787534714, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.13.2.to_k.bias": { + "min": -3.6601390838623047, + "max": 3.2940189838409424, + "mean": -0.01424746960401535, + "std": 0.9857901930809021, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.13.2.to_v.weight": { + "min": -0.2351982444524765, + "max": 0.24773260951042175, + "mean": -1.7793041479308158e-05, + "std": 0.04170281067490578, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.13.2.to_v.bias": { + "min": -0.07287801802158356, + "max": 0.15471716225147247, + "mean": 0.0006660239887423813, + "std": 0.025180837139487267, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.13.2.to_out.0.weight": { + "min": -0.26665613055229187, + "max": 0.24858269095420837, + "mean": -1.5366244042525068e-05, + "std": 0.04014318436384201, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.13.2.to_out.0.bias": { + "min": -0.18983444571495056, + "max": 0.1949683576822281, + "mean": -0.0012304731644690037, + "std": 0.06671547889709473, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.13.3.g": { + "min": 0.32925084233283997, + "max": 1.0009599924087524, + "mean": 0.7193903923034668, + "std": 0.052590519189834595, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.13.4.ff.0.0.weight": { + "min": -0.23175209760665894, + "max": 0.24594298005104065, + "mean": 0.00018278483184985816, + "std": 0.04090619832277298, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.13.4.ff.0.0.bias": { + "min": -0.11433617770671844, + "max": 0.018662281334400177, + "mean": -0.04249466210603714, + "std": 0.01887579821050167, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.13.4.ff.2.weight": { + "min": -0.3903564512729645, + "max": 0.4076610803604126, + "mean": -2.190250415878836e-05, + "std": 0.04854064807295799, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.13.4.ff.2.bias": { + "min": -0.694047212600708, + "max": 0.413125216960907, + "mean": 0.000851891003549099, + "std": 0.06033211946487427, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.14.0.weight": { + "min": 0.0, + "max": 0.9999971389770508, + "mean": 0.0004882798530161381, + "std": 0.022091632708907127, + "sparsity": 0.99951171875, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.14.1.g": { + "min": 0.9987401366233826, + "max": 1.0030049085617065, + "mean": 1.0003970861434937, + "std": 0.0010890224948525429, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.14.2.to_q.weight": { + "min": -0.03128720819950104, + "max": 0.03127748519182205, + "mean": -2.1021871361881495e-05, + "std": 0.018035341054201126, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.14.2.to_q.bias": { + "min": -0.031208951026201248, + "max": 0.0312366746366024, + "mean": -0.0006772055057808757, + "std": 0.01782999187707901, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.14.2.to_k.weight": { + "min": -0.03128505125641823, + "max": 0.0312827005982399, + "mean": -8.840423106448725e-06, + "std": 0.01803436689078808, + "sparsity": 9.5367431640625e-07, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.14.2.to_k.bias": { + "min": -0.031223762780427933, + "max": 0.031257808208465576, + "mean": -0.0007298105047084391, + "std": 0.017944179475307465, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.14.2.to_v.weight": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.14.2.to_v.bias": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.14.2.to_out.0.weight": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.14.2.to_out.0.bias": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.14.3.g": { + "min": 0.9988026022911072, + "max": 1.0031852722167969, + "mean": 1.0003986358642578, + "std": 0.0010702211875468493, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.14.4.ff.0.0.weight": { + "min": -0.03128661960363388, + "max": 0.03128815069794655, + "mean": 3.5941102396463975e-06, + "std": 0.01804072968661785, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.14.4.ff.0.0.bias": { + "min": -0.03123682737350464, + "max": 0.03124977834522724, + "mean": 0.00019563926616683602, + "std": 0.018076641485095024, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.14.4.ff.2.weight": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.14.4.ff.2.bias": { + "min": 0.0, + "max": 0.0, + "mean": 0.0, + "std": 0.0, + "sparsity": 1.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.15.0.weight": { + "min": -0.2346186488866806, + "max": 0.27259576320648193, + "mean": 6.985836080275476e-06, + "std": 0.01881217770278454, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.15.1.g": { + "min": 0.3213435411453247, + "max": 0.6945998072624207, + "mean": 0.5817909240722656, + "std": 0.04608319699764252, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.15.2.to_q.weight": { + "min": -0.18191689252853394, + "max": 0.19781433045864105, + "mean": -1.1746024938474875e-05, + "std": 0.03318719565868378, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.15.2.to_q.bias": { + "min": -0.16086804866790771, + "max": 0.1296302229166031, + "mean": -0.0010684699518606067, + "std": 0.034163739532232285, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.15.2.to_k.weight": { + "min": -0.33239439129829407, + "max": 0.31163647770881653, + "mean": -1.0337707863072865e-05, + "std": 0.03223792091012001, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.15.2.to_k.bias": { + "min": -7.812414169311523, + "max": 8.773359298706055, + "mean": 0.09355923533439636, + "std": 1.6210812330245972, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.15.2.to_v.weight": { + "min": -0.23393671214580536, + "max": 0.24211150407791138, + "mean": 4.141662793699652e-05, + "std": 0.04086197167634964, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.15.2.to_v.bias": { + "min": -0.07609452307224274, + "max": 0.06586258113384247, + "mean": 0.00047865102533251047, + "std": 0.01942458190023899, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.15.2.to_out.0.weight": { + "min": -0.24614335596561432, + "max": 0.23432280123233795, + "mean": -2.907749149017036e-06, + "std": 0.03943663462996483, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.15.2.to_out.0.bias": { + "min": -0.16305704414844513, + "max": 0.1610053926706314, + "mean": 0.0016310829669237137, + "std": 0.06529799103736877, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.15.3.g": { + "min": 0.5568323135375977, + "max": 0.9453117847442627, + "mean": 0.7130987644195557, + "std": 0.040391918271780014, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.15.4.ff.0.0.weight": { + "min": -0.2288832664489746, + "max": 0.25533148646354675, + "mean": -4.5479209802579135e-05, + "std": 0.04058132693171501, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.15.4.ff.0.0.bias": { + "min": -0.13495926558971405, + "max": 0.022289777174592018, + "mean": -0.0413689985871315, + "std": 0.018403179943561554, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.15.4.ff.2.weight": { + "min": -0.4220907390117645, + "max": 0.3925161063671112, + "mean": -4.4413791329134256e-06, + "std": 0.04779106378555298, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.15.4.ff.2.bias": { + "min": -0.6081769466400146, + "max": 0.652148425579071, + "mean": 0.001585810212418437, + "std": 0.05687166377902031, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.16.0.weight": { + "min": -0.2517467141151428, + "max": 0.32074928283691406, + "mean": -6.074779776099604e-06, + "std": 0.019615592435002327, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.16.1.g": { + "min": 0.36013174057006836, + "max": 0.6833459138870239, + "mean": 0.570884644985199, + "std": 0.04308824613690376, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.16.2.to_q.weight": { + "min": -0.22070643305778503, + "max": 0.17717598378658295, + "mean": -3.468842260190286e-05, + "std": 0.03430233895778656, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.16.2.to_q.bias": { + "min": -0.16383720934391022, + "max": 0.23332805931568146, + "mean": 0.0003637468325905502, + "std": 0.032890770584344864, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.16.2.to_k.weight": { + "min": -0.26396337151527405, + "max": 0.2400342971086502, + "mean": -5.2375002269400284e-05, + "std": 0.03390149027109146, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.16.2.to_k.bias": { + "min": -4.860640525817871, + "max": 5.097131252288818, + "mean": 0.04391013830900192, + "std": 1.2302772998809814, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.16.2.to_v.weight": { + "min": -0.24682015180587769, + "max": 0.25062263011932373, + "mean": 7.221732084872201e-05, + "std": 0.043993160128593445, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.16.2.to_v.bias": { + "min": -0.06271186470985413, + "max": 0.05459222570061684, + "mean": 0.0006507715443149209, + "std": 0.017198268324136734, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.16.2.to_out.0.weight": { + "min": -0.2868800759315491, + "max": 0.271938681602478, + "mean": -4.989939043298364e-05, + "std": 0.04299154132604599, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.16.2.to_out.0.bias": { + "min": -0.16084273159503937, + "max": 0.1707206517457962, + "mean": -0.002884692046791315, + "std": 0.059305742383003235, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.16.3.g": { + "min": 0.51964271068573, + "max": 0.9341827630996704, + "mean": 0.7137263417243958, + "std": 0.038649603724479675, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.16.4.ff.0.0.weight": { + "min": -0.23825131356716156, + "max": 0.24959467351436615, + "mean": 0.00046492042019963264, + "std": 0.04046143591403961, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.16.4.ff.0.0.bias": { + "min": -0.14443093538284302, + "max": 0.04144603759050369, + "mean": -0.039705902338027954, + "std": 0.020563002675771713, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.16.4.ff.2.weight": { + "min": -0.5333583354949951, + "max": 0.5836927890777588, + "mean": 5.9018666433985345e-06, + "std": 0.048868328332901, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.16.4.ff.2.bias": { + "min": -0.5197700262069702, + "max": 0.4940829873085022, + "mean": 0.0023609776981174946, + "std": 0.05347929149866104, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.17.0.weight": { + "min": -0.27364596724510193, + "max": 0.3152502179145813, + "mean": 1.8441196516505443e-06, + "std": 0.02005275897681713, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.17.1.g": { + "min": 0.36628827452659607, + "max": 0.7126691937446594, + "mean": 0.5933467149734497, + "std": 0.046086061745882034, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.17.2.to_q.weight": { + "min": -0.21118636429309845, + "max": 0.19975997507572174, + "mean": 3.079167436226271e-05, + "std": 0.0348685048520565, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.17.2.to_q.bias": { + "min": -0.18748052418231964, + "max": 0.2042539119720459, + "mean": 0.000956728239543736, + "std": 0.03154991194605827, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.17.2.to_k.weight": { + "min": -0.28994736075401306, + "max": 0.3401152789592743, + "mean": -4.7362642362713814e-05, + "std": 0.03458964452147484, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.17.2.to_k.bias": { + "min": -3.881758451461792, + "max": 3.3913075923919678, + "mean": 0.014463461004197598, + "std": 0.8590267896652222, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.17.2.to_v.weight": { + "min": -0.22456875443458557, + "max": 0.2500464916229248, + "mean": -3.998348802269902e-06, + "std": 0.042235810309648514, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.17.2.to_v.bias": { + "min": -0.05513551086187363, + "max": 0.046896424144506454, + "mean": -1.89729908015579e-05, + "std": 0.01585385575890541, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.17.2.to_out.0.weight": { + "min": -0.2930184602737427, + "max": 0.2910744249820709, + "mean": -7.35160028852988e-06, + "std": 0.041950810700654984, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.17.2.to_out.0.bias": { + "min": -0.12501806020736694, + "max": 0.2597162425518036, + "mean": -0.003234931267797947, + "std": 0.05317143350839615, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.17.3.g": { + "min": 0.4562249779701233, + "max": 0.8457176685333252, + "mean": 0.705817699432373, + "std": 0.035453151911497116, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.17.4.ff.0.0.weight": { + "min": -0.5114459991455078, + "max": 0.3485345244407654, + "mean": 0.0003425978356972337, + "std": 0.04020640254020691, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.17.4.ff.0.0.bias": { + "min": -0.1872977465391159, + "max": 0.039509162306785583, + "mean": -0.03940243646502495, + "std": 0.02136845327913761, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.17.4.ff.2.weight": { + "min": -0.5449100136756897, + "max": 0.5570695400238037, + "mean": -7.181215914897621e-05, + "std": 0.05074289068579674, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.17.4.ff.2.bias": { + "min": -0.5124268531799316, + "max": 0.6651233434677124, + "mean": 0.002447479637339711, + "std": 0.04955451935529709, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.18.0.weight": { + "min": -0.33246591687202454, + "max": 0.2658751308917999, + "mean": 3.69829467672389e-06, + "std": 0.019390346482396126, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.18.1.g": { + "min": 0.3222673833370209, + "max": 0.7674033641815186, + "mean": 0.6512042284011841, + "std": 0.04545491561293602, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.18.2.to_q.weight": { + "min": -0.2496984899044037, + "max": 0.21969059109687805, + "mean": -2.5450863176956773e-06, + "std": 0.03650245815515518, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.18.2.to_q.bias": { + "min": -0.32755619287490845, + "max": 0.28763604164123535, + "mean": -0.0006797901587560773, + "std": 0.03858839347958565, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.18.2.to_k.weight": { + "min": -0.3103632628917694, + "max": 0.3702820837497711, + "mean": 6.481494347099215e-05, + "std": 0.03624306991696358, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.18.2.to_k.bias": { + "min": -4.7229533195495605, + "max": 5.8144097328186035, + "mean": 0.03798435255885124, + "std": 1.4144145250320435, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.18.2.to_v.weight": { + "min": -0.2220195233821869, + "max": 0.20613467693328857, + "mean": -7.503894448745996e-05, + "std": 0.04249141365289688, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.18.2.to_v.bias": { + "min": -0.07768063247203827, + "max": 0.051408518105745316, + "mean": -0.0009253580356016755, + "std": 0.01641588658094406, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.18.2.to_out.0.weight": { + "min": -0.3309858441352844, + "max": 0.3291884660720825, + "mean": -4.9612558541412e-06, + "std": 0.04279816150665283, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.18.2.to_out.0.bias": { + "min": -0.2853319048881531, + "max": 0.11173354089260101, + "mean": -0.001206716988235712, + "std": 0.04702756926417351, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.18.3.g": { + "min": 0.48654904961586, + "max": 0.88804692029953, + "mean": 0.7376827001571655, + "std": 0.03842971473932266, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.18.4.ff.0.0.weight": { + "min": -0.3613007962703705, + "max": 0.27439025044441223, + "mean": 5.118318586028181e-05, + "std": 0.04065314307808876, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.18.4.ff.0.0.bias": { + "min": -0.2479037493467331, + "max": 0.046517688781023026, + "mean": -0.039281267672777176, + "std": 0.023276478052139282, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.18.4.ff.2.weight": { + "min": -0.6269151568412781, + "max": 0.5976049900054932, + "mean": -6.191668217070401e-05, + "std": 0.053125977516174316, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.18.4.ff.2.bias": { + "min": -0.7105785608291626, + "max": 0.26612961292266846, + "mean": 0.0009194647427648306, + "std": 0.051263753324747086, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.19.0.weight": { + "min": -0.3433726131916046, + "max": 0.3034554719924927, + "mean": 2.0521497390291188e-07, + "std": 0.019139625132083893, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.19.1.g": { + "min": 0.3501395285129547, + "max": 0.783959686756134, + "mean": 0.6390355825424194, + "std": 0.049371764063835144, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.19.2.to_q.weight": { + "min": -0.20602361857891083, + "max": 0.20698852837085724, + "mean": -5.9928101109107956e-05, + "std": 0.037698496133089066, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.19.2.to_q.bias": { + "min": -0.25897642970085144, + "max": 0.268706738948822, + "mean": -0.00040520128095522523, + "std": 0.044660814106464386, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.19.2.to_k.weight": { + "min": -0.35453060269355774, + "max": 0.3229123651981354, + "mean": -7.312092748179566e-06, + "std": 0.03720676898956299, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.19.2.to_k.bias": { + "min": -5.267762184143066, + "max": 4.20961332321167, + "mean": -0.026448804885149002, + "std": 1.0076419115066528, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.19.2.to_v.weight": { + "min": -0.23904970288276672, + "max": 0.24397821724414825, + "mean": -2.552817386458628e-05, + "std": 0.04321575164794922, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.19.2.to_v.bias": { + "min": -0.06249221786856651, + "max": 0.05668818950653076, + "mean": 0.0003517880686558783, + "std": 0.01415390707552433, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.19.2.to_out.0.weight": { + "min": -0.43751028180122375, + "max": 0.3737626075744629, + "mean": 1.4619375178881455e-05, + "std": 0.04412780702114105, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.19.2.to_out.0.bias": { + "min": -0.0962304174900055, + "max": 0.1764947772026062, + "mean": -0.0006597821484319866, + "std": 0.03515012562274933, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.19.3.g": { + "min": 0.4218544661998749, + "max": 1.0707522630691528, + "mean": 0.7486886978149414, + "std": 0.04222184792160988, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.19.4.ff.0.0.weight": { + "min": -0.2660444378852844, + "max": 0.2971097230911255, + "mean": -7.88940378697589e-05, + "std": 0.04081380367279053, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.19.4.ff.0.0.bias": { + "min": -0.18505463004112244, + "max": 0.04312760382890701, + "mean": -0.03682396560907364, + "std": 0.025607850402593613, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.19.4.ff.2.weight": { + "min": -0.4577805697917938, + "max": 0.48729538917541504, + "mean": 4.396865551825613e-05, + "std": 0.05422099307179451, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.19.4.ff.2.bias": { + "min": -0.2866191267967224, + "max": 0.5523927807807922, + "mean": -0.0008822673698887229, + "std": 0.04786074161529541, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.20.0.weight": { + "min": -0.29266098141670227, + "max": 0.3227379322052002, + "mean": 6.034013495082036e-06, + "std": 0.01997271552681923, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.20.1.g": { + "min": 0.2912082076072693, + "max": 0.7611724734306335, + "mean": 0.6509549617767334, + "std": 0.05223819240927696, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.20.2.to_q.weight": { + "min": -0.2437622845172882, + "max": 0.2617740035057068, + "mean": -5.626710844808258e-06, + "std": 0.03961407393217087, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.20.2.to_q.bias": { + "min": -0.2678508758544922, + "max": 0.20037643611431122, + "mean": -0.0008778825285844505, + "std": 0.051807109266519547, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.20.2.to_k.weight": { + "min": -0.2725064158439636, + "max": 0.2540656328201294, + "mean": 5.306316325004445e-06, + "std": 0.03871078044176102, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.20.2.to_k.bias": { + "min": -12.980466842651367, + "max": 15.965588569641113, + "mean": 0.03327019512653351, + "std": 1.9910999536514282, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.20.2.to_v.weight": { + "min": -0.20688198506832123, + "max": 0.22597242891788483, + "mean": -7.254729280248284e-05, + "std": 0.04055875167250633, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.20.2.to_v.bias": { + "min": -0.069511778652668, + "max": 0.06321422755718231, + "mean": 0.00015925483603496104, + "std": 0.01475309394299984, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.20.2.to_out.0.weight": { + "min": -0.46553534269332886, + "max": 0.32018300890922546, + "mean": 1.9559764041332528e-05, + "std": 0.040594302117824554, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.20.2.to_out.0.bias": { + "min": -0.06415701657533646, + "max": 0.11569144576787949, + "mean": 0.0011994449887424707, + "std": 0.024716829881072044, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.20.3.g": { + "min": 0.37491846084594727, + "max": 0.9332267045974731, + "mean": 0.7511833310127258, + "std": 0.04030444473028183, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.20.4.ff.0.0.weight": { + "min": -0.2793797552585602, + "max": 0.2735174894332886, + "mean": -0.00016838237934280187, + "std": 0.04100488871335983, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.20.4.ff.0.0.bias": { + "min": -0.19866259396076202, + "max": 0.05138175189495087, + "mean": -0.03203893452882767, + "std": 0.025100193917751312, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.20.4.ff.2.weight": { + "min": -0.6591871976852417, + "max": 0.5361859798431396, + "mean": -5.0474118324927986e-05, + "std": 0.0528571642935276, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.20.4.ff.2.bias": { + "min": -0.19288860261440277, + "max": 0.582888662815094, + "mean": -0.0005087298923172057, + "std": 0.0411086231470108, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.21.0.weight": { + "min": -0.41760918498039246, + "max": 0.3719828724861145, + "mean": 6.52037670079153e-06, + "std": 0.02162792719900608, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.21.1.g": { + "min": 0.21464084088802338, + "max": 0.7477675080299377, + "mean": 0.6495819687843323, + "std": 0.054441265761852264, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.21.2.to_q.weight": { + "min": -0.20966503024101257, + "max": 0.1956944614648819, + "mean": 4.008584801340476e-05, + "std": 0.039459552615880966, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.21.2.to_q.bias": { + "min": -0.32997503876686096, + "max": 0.25995907187461853, + "mean": -0.0032368863467127085, + "std": 0.05632346495985985, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.21.2.to_k.weight": { + "min": -0.20606832206249237, + "max": 0.2548881471157074, + "mean": 5.397828499553725e-05, + "std": 0.03856222704052925, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.21.2.to_k.bias": { + "min": -6.250948905944824, + "max": 6.940567493438721, + "mean": 0.048394568264484406, + "std": 1.3862435817718506, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.21.2.to_v.weight": { + "min": -0.20990008115768433, + "max": 0.23062950372695923, + "mean": -4.797322617378086e-06, + "std": 0.04131775721907616, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.21.2.to_v.bias": { + "min": -0.043879762291908264, + "max": 0.03602854162454605, + "mean": -6.735368515364826e-06, + "std": 0.012802576646208763, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.21.2.to_out.0.weight": { + "min": -0.3975800573825836, + "max": 0.3450191617012024, + "mean": -5.543587758438662e-05, + "std": 0.04239463433623314, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.21.2.to_out.0.bias": { + "min": -0.055230122059583664, + "max": 0.06288789957761765, + "mean": 0.00035758066223934293, + "std": 0.018682915717363358, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.21.3.g": { + "min": 0.35092663764953613, + "max": 1.0465692281723022, + "mean": 0.7897400856018066, + "std": 0.04884057492017746, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.21.4.ff.0.0.weight": { + "min": -0.33373889327049255, + "max": 0.3863142132759094, + "mean": -0.00016909500118345022, + "std": 0.04149040952324867, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.21.4.ff.0.0.bias": { + "min": -0.15769430994987488, + "max": 0.059132885187864304, + "mean": -0.03183465823531151, + "std": 0.025120330974459648, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.21.4.ff.2.weight": { + "min": -0.6965411305427551, + "max": 0.46967917680740356, + "mean": -8.504216384608299e-05, + "std": 0.05180637910962105, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.21.4.ff.2.bias": { + "min": -0.24813847243785858, + "max": 0.3292423188686371, + "mean": -0.00026213712408207357, + "std": 0.041475165635347366, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.22.0.weight": { + "min": -0.2870347499847412, + "max": 0.3504159152507782, + "mean": -2.7635057904262794e-06, + "std": 0.024241114035248756, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.22.1.g": { + "min": 0.1968143880367279, + "max": 0.7801634073257446, + "mean": 0.67032390832901, + "std": 0.058765437453985214, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.22.2.to_q.weight": { + "min": -0.22936369478702545, + "max": 0.23155838251113892, + "mean": -2.0868072169832885e-05, + "std": 0.0404399111866951, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.22.2.to_q.bias": { + "min": -0.22028712928295135, + "max": 0.2412400096654892, + "mean": 0.0007798401638865471, + "std": 0.05588255077600479, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.22.2.to_k.weight": { + "min": -0.21693190932273865, + "max": 0.2265695184469223, + "mean": -7.217879465315491e-05, + "std": 0.039374105632305145, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.22.2.to_k.bias": { + "min": -8.916163444519043, + "max": 9.079217910766602, + "mean": -0.0012825923040509224, + "std": 1.8500556945800781, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.22.2.to_v.weight": { + "min": -0.269673228263855, + "max": 0.2592774033546448, + "mean": 4.366856592241675e-05, + "std": 0.038410674780607224, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.22.2.to_v.bias": { + "min": -0.05804191157221794, + "max": 0.05804998800158501, + "mean": 0.0003545111685525626, + "std": 0.014721807092428207, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.22.2.to_out.0.weight": { + "min": -0.2641296982765198, + "max": 0.2882002294063568, + "mean": -6.158516043797135e-05, + "std": 0.039077457040548325, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.22.2.to_out.0.bias": { + "min": -0.044157613068819046, + "max": 0.03739722818136215, + "mean": -9.842761210165918e-05, + "std": 0.013352800160646439, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.22.3.g": { + "min": 0.3394981026649475, + "max": 1.0940546989440918, + "mean": 0.8640274405479431, + "std": 0.06395779550075531, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.22.4.ff.0.0.weight": { + "min": -0.42318135499954224, + "max": 0.41912782192230225, + "mean": 0.0003136250888928771, + "std": 0.04351290315389633, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.22.4.ff.0.0.bias": { + "min": -0.215034618973732, + "max": 0.17091527581214905, + "mean": -0.02945549227297306, + "std": 0.031898606568574905, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.22.4.ff.2.weight": { + "min": -0.5991082191467285, + "max": 0.5603575706481934, + "mean": -0.0001479926722822711, + "std": 0.05346138775348663, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.22.4.ff.2.bias": { + "min": -0.17912201583385468, + "max": 0.3778008818626404, + "mean": 0.0013520645443350077, + "std": 0.037332892417907715, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.23.0.weight": { + "min": -0.3943796157836914, + "max": 0.3688676655292511, + "mean": 3.761224070331082e-05, + "std": 0.028617393225431442, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.23.1.g": { + "min": 0.2908812463283539, + "max": 0.8286238312721252, + "mean": 0.7055914402008057, + "std": 0.06791043281555176, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.23.2.to_q.weight": { + "min": -0.9266071319580078, + "max": 1.0270264148712158, + "mean": -2.7955527912126854e-05, + "std": 0.0476437471807003, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.23.2.to_q.bias": { + "min": -0.8803294897079468, + "max": 0.8167775273323059, + "mean": -0.0002962773141916841, + "std": 0.09563106298446655, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.23.2.to_k.weight": { + "min": -0.27031898498535156, + "max": 0.24110636115074158, + "mean": -2.252469494123943e-05, + "std": 0.03894982486963272, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.23.2.to_k.bias": { + "min": -23.770000457763672, + "max": 22.87746810913086, + "mean": -0.09194529056549072, + "std": 4.074869632720947, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.23.2.to_v.weight": { + "min": -0.22796331346035004, + "max": 0.2458551675081253, + "mean": -2.5422079488635063e-05, + "std": 0.038641415536403656, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.23.2.to_v.bias": { + "min": -0.060239437967538834, + "max": 0.045478228479623795, + "mean": -0.00013640533143188804, + "std": 0.01469514612108469, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.23.2.to_out.0.weight": { + "min": -0.33809611201286316, + "max": 0.3752952516078949, + "mean": 7.530758921348024e-06, + "std": 0.040820345282554626, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.23.2.to_out.0.bias": { + "min": -0.04625128582119942, + "max": 0.1955953687429428, + "mean": 0.0002734389272518456, + "std": 0.013558450154960155, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.23.3.g": { + "min": 0.37381020188331604, + "max": 1.1318634748458862, + "mean": 0.8903213143348694, + "std": 0.0641312375664711, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.23.4.ff.0.0.weight": { + "min": -0.447549432516098, + "max": 0.5427570939064026, + "mean": 2.5110648493864574e-05, + "std": 0.04558061435818672, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.23.4.ff.0.0.bias": { + "min": -0.22403661906719208, + "max": 0.08747347444295883, + "mean": -0.03202786669135094, + "std": 0.037772756069898605, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.23.4.ff.2.weight": { + "min": -0.7269205451011658, + "max": 0.6894555687904358, + "mean": 3.6393928894540295e-05, + "std": 0.05179436132311821, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.23.4.ff.2.bias": { + "min": -0.1745767593383789, + "max": 0.21847710013389587, + "mean": 3.5673321690410376e-05, + "std": 0.03179144486784935, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.24.0.weight": { + "min": -0.339706152677536, + "max": 0.37326323986053467, + "mean": 4.3032145185861737e-05, + "std": 0.03413531556725502, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.24.1.g": { + "min": 0.3174583911895752, + "max": 1.2890191078186035, + "mean": 0.601619303226471, + "std": 0.08366930484771729, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.24.2.to_q.weight": { + "min": -0.2832256853580475, + "max": 0.26046571135520935, + "mean": -2.993364205394755e-06, + "std": 0.03598063439130783, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.24.2.to_q.bias": { + "min": -0.2360483556985855, + "max": 0.20603413879871368, + "mean": 0.00023948654416017234, + "std": 0.05606625974178314, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.24.2.to_k.weight": { + "min": -0.4355963468551636, + "max": 0.32496193051338196, + "mean": 2.4223818400059827e-05, + "std": 0.034124087542295456, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.24.2.to_k.bias": { + "min": -5.552776336669922, + "max": 7.322168350219727, + "mean": -0.00738462433218956, + "std": 0.7001185417175293, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.24.2.to_v.weight": { + "min": -0.34443002939224243, + "max": 0.3632832467556, + "mean": 0.00010313428356312215, + "std": 0.047836337238550186, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.24.2.to_v.bias": { + "min": -0.07385826855897903, + "max": 0.06043381989002228, + "mean": 0.0009369200561195612, + "std": 0.014941117726266384, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.24.2.to_out.0.weight": { + "min": -0.2565152943134308, + "max": 0.28712597489356995, + "mean": 4.846529918722808e-06, + "std": 0.041564520448446274, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.24.2.to_out.0.bias": { + "min": -0.05538159981369972, + "max": 0.06288077682256699, + "mean": 0.00012733059702441096, + "std": 0.007154808379709721, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.24.3.g": { + "min": 0.49408578872680664, + "max": 1.2223646640777588, + "mean": 1.013702154159546, + "std": 0.11764581501483917, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.24.4.ff.0.0.weight": { + "min": -1.0940581560134888, + "max": 1.0475841760635376, + "mean": -4.863579306402244e-05, + "std": 0.0524178184568882, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.24.4.ff.0.0.bias": { + "min": -0.22388966381549835, + "max": 0.1732550710439682, + "mean": -0.027240199968218803, + "std": 0.03634064644575119, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.24.4.ff.2.weight": { + "min": -0.8849446177482605, + "max": 0.9234321713447571, + "mean": -0.0001459874474676326, + "std": 0.05329861491918564, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.24.4.ff.2.bias": { + "min": -0.17124590277671814, + "max": 0.38005468249320984, + "mean": 0.0033688729163259268, + "std": 0.03990017995238304, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.25.0.weight": { + "min": -0.7780460119247437, + "max": 0.722984254360199, + "mean": 1.8001555872615427e-05, + "std": 0.046154171228408813, + "sparsity": 0.0, + "shape": [ + 1024, + 2048 + ] + }, + "transformer.layers.25.1.g": { + "min": 0.33841073513031006, + "max": 1.4301798343658447, + "mean": 0.9487167596817017, + "std": 0.20710234344005585, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.25.2.to_q.weight": { + "min": -1.7458388805389404, + "max": 1.704530119895935, + "mean": 0.000226972799282521, + "std": 0.15870548784732819, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.25.2.to_q.bias": { + "min": -1.2008311748504639, + "max": 1.1021909713745117, + "mean": -0.009556617587804794, + "std": 0.20411409437656403, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.25.2.to_k.weight": { + "min": -0.4210456311702728, + "max": 0.4282980263233185, + "mean": 6.39081554254517e-05, + "std": 0.04802015796303749, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.25.2.to_k.bias": { + "min": -19.769929885864258, + "max": 19.564817428588867, + "mean": -0.24858255684375763, + "std": 4.782279968261719, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.25.2.to_v.weight": { + "min": -0.3241115212440491, + "max": 0.43888670206069946, + "mean": -1.1728005119948648e-05, + "std": 0.04616701602935791, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.25.2.to_v.bias": { + "min": -0.03380877524614334, + "max": 0.036888398230075836, + "mean": 0.0006396375247277319, + "std": 0.012913818471133709, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.25.2.to_out.0.weight": { + "min": -0.7038182020187378, + "max": 0.6691953539848328, + "mean": 4.2681567720137537e-05, + "std": 0.05789203941822052, + "sparsity": 0.0, + "shape": [ + 1024, + 1024 + ] + }, + "transformer.layers.25.2.to_out.0.bias": { + "min": -0.07242082059383392, + "max": 0.06784311681985855, + "mean": -0.000134931382490322, + "std": 0.01290101557970047, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.25.3.g": { + "min": 0.3802341818809509, + "max": 1.39493727684021, + "mean": 1.0668972730636597, + "std": 0.21994373202323914, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.layers.25.4.ff.0.0.weight": { + "min": -0.6166523694992065, + "max": 0.7187345623970032, + "mean": 0.0001129009760916233, + "std": 0.0580277256667614, + "sparsity": 0.0, + "shape": [ + 4096, + 1024 + ] + }, + "transformer.layers.25.4.ff.0.0.bias": { + "min": -0.21905651688575745, + "max": 0.22523820400238037, + "mean": 0.006192180328071117, + "std": 0.049731798470020294, + "sparsity": 0.0, + "shape": [ + 4096 + ] + }, + "transformer.layers.25.4.ff.2.weight": { + "min": -0.6298801898956299, + "max": 0.8897404074668884, + "mean": 1.237633296113927e-05, + "std": 0.023545268923044205, + "sparsity": 0.0, + "shape": [ + 1024, + 4096 + ] + }, + "transformer.layers.25.4.ff.2.bias": { + "min": -0.5074089765548706, + "max": 0.4742584228515625, + "mean": -0.0030243899673223495, + "std": 0.06931118667125702, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.norm_out.g": { + "min": 0.5381640791893005, + "max": 1.182090163230896, + "mean": 0.7830706238746643, + "std": 0.09912356734275818, + "sparsity": 0.0, + "shape": [ + 1024 + ] + }, + "transformer.proj_out.weight": { + "min": -0.2673421800136566, + "max": 0.21319416165351868, + "mean": -0.0002236190193798393, + "std": 0.05400572717189789, + "sparsity": 0.0, + "shape": [ + 100, + 1024 + ] + }, + "transformer.proj_out.bias": { + "min": -0.23863200843334198, + "max": 0.014863962307572365, + "mean": -0.04393288493156433, + "std": 0.03432033956050873, + "sparsity": 0.0, + "shape": [ + 100 + ] + } + } +} \ No newline at end of file