| { | |
| "metadata": { | |
| "ParamSize": 278, | |
| "ParamBytes": 70289572.0, | |
| "BitsPerParam": 4.511266288677072 | |
| }, | |
| "records": [ | |
| { | |
| "dataPath": "params_shard_0.bin", | |
| "format": "raw-shard", | |
| "nbytes": 32950948, | |
| "records": [ | |
| { | |
| "name": "classifier.dense.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 0 | |
| }, | |
| { | |
| "name": "classifier.dense.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 1536 | |
| }, | |
| { | |
| "name": "classifier.dense.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 296448 | |
| }, | |
| { | |
| "name": "classifier.out_proj.bias", | |
| "shape": [ | |
| 2 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 4, | |
| "byteOffset": 333312 | |
| }, | |
| { | |
| "name": "classifier.out_proj.q_weight", | |
| "shape": [ | |
| 2, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 768, | |
| "byteOffset": 333316 | |
| }, | |
| { | |
| "name": "classifier.out_proj.q_scale", | |
| "shape": [ | |
| 2, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 96, | |
| "byteOffset": 334084 | |
| }, | |
| { | |
| "name": "roberta.embeddings.LayerNorm.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 334180 | |
| }, | |
| { | |
| "name": "roberta.embeddings.LayerNorm.weight", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 335716 | |
| }, | |
| { | |
| "name": "roberta.embeddings.position_embeddings.q_weight", | |
| "shape": [ | |
| 514, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 197376, | |
| "byteOffset": 337252 | |
| }, | |
| { | |
| "name": "roberta.embeddings.position_embeddings.q_scale", | |
| "shape": [ | |
| 514, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 24672, | |
| "byteOffset": 534628 | |
| }, | |
| { | |
| "name": "roberta.embeddings.token_type_embeddings.q_weight", | |
| "shape": [ | |
| 1, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 384, | |
| "byteOffset": 559300 | |
| }, | |
| { | |
| "name": "roberta.embeddings.token_type_embeddings.q_scale", | |
| "shape": [ | |
| 1, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 48, | |
| "byteOffset": 559684 | |
| }, | |
| { | |
| "name": "roberta.embeddings.word_embeddings.q_weight", | |
| "shape": [ | |
| 50265, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 19301760, | |
| "byteOffset": 559732 | |
| }, | |
| { | |
| "name": "roberta.embeddings.word_embeddings.q_scale", | |
| "shape": [ | |
| 50265, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 2412720, | |
| "byteOffset": 19861492 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.0.attention.output.LayerNorm.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 22274212 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.0.attention.output.LayerNorm.weight", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 22275748 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.0.attention.output.dense.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 22277284 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.0.attention.output.dense.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 22278820 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.0.attention.output.dense.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 22573732 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.0.attention.self.key.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 22610596 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.0.attention.self.key.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 22612132 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.0.attention.self.key.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 22907044 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.0.attention.self.query.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 22943908 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.0.attention.self.query.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 22945444 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.0.attention.self.query.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 23240356 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.0.attention.self.value.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 23277220 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.0.attention.self.value.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 23278756 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.0.attention.self.value.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 23573668 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.0.intermediate.dense.bias", | |
| "shape": [ | |
| 3072 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 6144, | |
| "byteOffset": 23610532 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.0.intermediate.dense.q_weight", | |
| "shape": [ | |
| 3072, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1179648, | |
| "byteOffset": 23616676 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.0.intermediate.dense.q_scale", | |
| "shape": [ | |
| 3072, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 147456, | |
| "byteOffset": 24796324 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.0.output.LayerNorm.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 24943780 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.0.output.LayerNorm.weight", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 24945316 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.0.output.dense.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 24946852 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.0.output.dense.q_weight", | |
| "shape": [ | |
| 768, | |
| 384 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1179648, | |
| "byteOffset": 24948388 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.0.output.dense.q_scale", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 147456, | |
| "byteOffset": 26128036 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.1.attention.output.LayerNorm.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 26275492 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.1.attention.output.LayerNorm.weight", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 26277028 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.1.attention.output.dense.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 26278564 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.1.attention.output.dense.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 26280100 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.1.attention.output.dense.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 26575012 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.1.attention.self.key.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 26611876 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.1.attention.self.key.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 26613412 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.1.attention.self.key.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 26908324 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.1.attention.self.query.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 26945188 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.1.attention.self.query.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 26946724 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.1.attention.self.query.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 27241636 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.1.attention.self.value.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 27278500 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.1.attention.self.value.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 27280036 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.1.attention.self.value.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 27574948 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.1.intermediate.dense.bias", | |
| "shape": [ | |
| 3072 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 6144, | |
| "byteOffset": 27611812 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.1.intermediate.dense.q_weight", | |
| "shape": [ | |
| 3072, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1179648, | |
| "byteOffset": 27617956 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.1.intermediate.dense.q_scale", | |
| "shape": [ | |
| 3072, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 147456, | |
| "byteOffset": 28797604 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.1.output.LayerNorm.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 28945060 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.1.output.LayerNorm.weight", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 28946596 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.1.output.dense.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 28948132 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.1.output.dense.q_weight", | |
| "shape": [ | |
| 768, | |
| 384 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1179648, | |
| "byteOffset": 28949668 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.1.output.dense.q_scale", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 147456, | |
| "byteOffset": 30129316 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.10.attention.output.LayerNorm.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 30276772 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.10.attention.output.LayerNorm.weight", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 30278308 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.10.attention.output.dense.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 30279844 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.10.attention.output.dense.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 30281380 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.10.attention.output.dense.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 30576292 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.10.attention.self.key.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 30613156 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.10.attention.self.key.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 30614692 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.10.attention.self.key.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 30909604 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.10.attention.self.query.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 30946468 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.10.attention.self.query.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 30948004 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.10.attention.self.query.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 31242916 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.10.attention.self.value.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 31279780 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.10.attention.self.value.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 31281316 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.10.attention.self.value.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 31576228 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.10.intermediate.dense.bias", | |
| "shape": [ | |
| 3072 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 6144, | |
| "byteOffset": 31613092 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.10.intermediate.dense.q_weight", | |
| "shape": [ | |
| 3072, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1179648, | |
| "byteOffset": 31619236 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.10.intermediate.dense.q_scale", | |
| "shape": [ | |
| 3072, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 147456, | |
| "byteOffset": 32798884 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.10.output.LayerNorm.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 32946340 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.10.output.LayerNorm.weight", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 32947876 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.10.output.dense.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 32949412 | |
| } | |
| ], | |
| "md5sum": "e8e1b5159d63050d7bbc3a3cff38c00e" | |
| }, | |
| { | |
| "dataPath": "params_shard_1.bin", | |
| "format": "raw-shard", | |
| "nbytes": 33341952, | |
| "records": [ | |
| { | |
| "name": "roberta.encoder.layer.10.output.dense.q_weight", | |
| "shape": [ | |
| 768, | |
| 384 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1179648, | |
| "byteOffset": 0 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.10.output.dense.q_scale", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 147456, | |
| "byteOffset": 1179648 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.11.attention.output.LayerNorm.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 1327104 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.11.attention.output.LayerNorm.weight", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 1328640 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.11.attention.output.dense.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 1330176 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.11.attention.output.dense.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 1331712 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.11.attention.output.dense.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 1626624 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.11.attention.self.key.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 1663488 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.11.attention.self.key.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 1665024 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.11.attention.self.key.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 1959936 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.11.attention.self.query.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 1996800 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.11.attention.self.query.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 1998336 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.11.attention.self.query.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 2293248 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.11.attention.self.value.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 2330112 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.11.attention.self.value.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 2331648 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.11.attention.self.value.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 2626560 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.11.intermediate.dense.bias", | |
| "shape": [ | |
| 3072 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 6144, | |
| "byteOffset": 2663424 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.11.intermediate.dense.q_weight", | |
| "shape": [ | |
| 3072, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1179648, | |
| "byteOffset": 2669568 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.11.intermediate.dense.q_scale", | |
| "shape": [ | |
| 3072, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 147456, | |
| "byteOffset": 3849216 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.11.output.LayerNorm.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 3996672 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.11.output.LayerNorm.weight", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 3998208 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.11.output.dense.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 3999744 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.11.output.dense.q_weight", | |
| "shape": [ | |
| 768, | |
| 384 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1179648, | |
| "byteOffset": 4001280 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.11.output.dense.q_scale", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 147456, | |
| "byteOffset": 5180928 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.2.attention.output.LayerNorm.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 5328384 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.2.attention.output.LayerNorm.weight", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 5329920 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.2.attention.output.dense.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 5331456 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.2.attention.output.dense.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 5332992 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.2.attention.output.dense.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 5627904 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.2.attention.self.key.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 5664768 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.2.attention.self.key.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 5666304 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.2.attention.self.key.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 5961216 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.2.attention.self.query.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 5998080 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.2.attention.self.query.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 5999616 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.2.attention.self.query.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 6294528 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.2.attention.self.value.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 6331392 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.2.attention.self.value.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 6332928 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.2.attention.self.value.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 6627840 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.2.intermediate.dense.bias", | |
| "shape": [ | |
| 3072 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 6144, | |
| "byteOffset": 6664704 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.2.intermediate.dense.q_weight", | |
| "shape": [ | |
| 3072, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1179648, | |
| "byteOffset": 6670848 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.2.intermediate.dense.q_scale", | |
| "shape": [ | |
| 3072, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 147456, | |
| "byteOffset": 7850496 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.2.output.LayerNorm.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 7997952 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.2.output.LayerNorm.weight", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 7999488 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.2.output.dense.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 8001024 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.2.output.dense.q_weight", | |
| "shape": [ | |
| 768, | |
| 384 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1179648, | |
| "byteOffset": 8002560 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.2.output.dense.q_scale", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 147456, | |
| "byteOffset": 9182208 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.3.attention.output.LayerNorm.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 9329664 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.3.attention.output.LayerNorm.weight", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 9331200 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.3.attention.output.dense.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 9332736 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.3.attention.output.dense.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 9334272 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.3.attention.output.dense.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 9629184 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.3.attention.self.key.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 9666048 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.3.attention.self.key.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 9667584 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.3.attention.self.key.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 9962496 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.3.attention.self.query.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 9999360 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.3.attention.self.query.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 10000896 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.3.attention.self.query.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 10295808 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.3.attention.self.value.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 10332672 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.3.attention.self.value.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 10334208 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.3.attention.self.value.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 10629120 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.3.intermediate.dense.bias", | |
| "shape": [ | |
| 3072 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 6144, | |
| "byteOffset": 10665984 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.3.intermediate.dense.q_weight", | |
| "shape": [ | |
| 3072, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1179648, | |
| "byteOffset": 10672128 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.3.intermediate.dense.q_scale", | |
| "shape": [ | |
| 3072, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 147456, | |
| "byteOffset": 11851776 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.3.output.LayerNorm.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 11999232 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.3.output.LayerNorm.weight", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 12000768 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.3.output.dense.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 12002304 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.3.output.dense.q_weight", | |
| "shape": [ | |
| 768, | |
| 384 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1179648, | |
| "byteOffset": 12003840 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.3.output.dense.q_scale", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 147456, | |
| "byteOffset": 13183488 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.4.attention.output.LayerNorm.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 13330944 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.4.attention.output.LayerNorm.weight", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 13332480 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.4.attention.output.dense.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 13334016 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.4.attention.output.dense.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 13335552 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.4.attention.output.dense.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 13630464 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.4.attention.self.key.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 13667328 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.4.attention.self.key.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 13668864 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.4.attention.self.key.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 13963776 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.4.attention.self.query.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 14000640 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.4.attention.self.query.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 14002176 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.4.attention.self.query.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 14297088 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.4.attention.self.value.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 14333952 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.4.attention.self.value.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 14335488 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.4.attention.self.value.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 14630400 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.4.intermediate.dense.bias", | |
| "shape": [ | |
| 3072 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 6144, | |
| "byteOffset": 14667264 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.4.intermediate.dense.q_weight", | |
| "shape": [ | |
| 3072, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1179648, | |
| "byteOffset": 14673408 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.4.intermediate.dense.q_scale", | |
| "shape": [ | |
| 3072, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 147456, | |
| "byteOffset": 15853056 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.4.output.LayerNorm.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 16000512 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.4.output.LayerNorm.weight", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 16002048 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.4.output.dense.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 16003584 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.4.output.dense.q_weight", | |
| "shape": [ | |
| 768, | |
| 384 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1179648, | |
| "byteOffset": 16005120 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.4.output.dense.q_scale", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 147456, | |
| "byteOffset": 17184768 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.5.attention.output.LayerNorm.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 17332224 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.5.attention.output.LayerNorm.weight", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 17333760 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.5.attention.output.dense.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 17335296 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.5.attention.output.dense.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 17336832 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.5.attention.output.dense.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 17631744 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.5.attention.self.key.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 17668608 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.5.attention.self.key.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 17670144 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.5.attention.self.key.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 17965056 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.5.attention.self.query.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 18001920 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.5.attention.self.query.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 18003456 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.5.attention.self.query.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 18298368 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.5.attention.self.value.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 18335232 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.5.attention.self.value.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 18336768 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.5.attention.self.value.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 18631680 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.5.intermediate.dense.bias", | |
| "shape": [ | |
| 3072 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 6144, | |
| "byteOffset": 18668544 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.5.intermediate.dense.q_weight", | |
| "shape": [ | |
| 3072, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1179648, | |
| "byteOffset": 18674688 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.5.intermediate.dense.q_scale", | |
| "shape": [ | |
| 3072, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 147456, | |
| "byteOffset": 19854336 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.5.output.LayerNorm.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 20001792 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.5.output.LayerNorm.weight", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 20003328 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.5.output.dense.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 20004864 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.5.output.dense.q_weight", | |
| "shape": [ | |
| 768, | |
| 384 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1179648, | |
| "byteOffset": 20006400 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.5.output.dense.q_scale", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 147456, | |
| "byteOffset": 21186048 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.6.attention.output.LayerNorm.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 21333504 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.6.attention.output.LayerNorm.weight", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 21335040 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.6.attention.output.dense.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 21336576 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.6.attention.output.dense.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 21338112 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.6.attention.output.dense.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 21633024 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.6.attention.self.key.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 21669888 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.6.attention.self.key.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 21671424 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.6.attention.self.key.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 21966336 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.6.attention.self.query.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 22003200 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.6.attention.self.query.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 22004736 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.6.attention.self.query.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 22299648 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.6.attention.self.value.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 22336512 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.6.attention.self.value.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 22338048 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.6.attention.self.value.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 22632960 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.6.intermediate.dense.bias", | |
| "shape": [ | |
| 3072 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 6144, | |
| "byteOffset": 22669824 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.6.intermediate.dense.q_weight", | |
| "shape": [ | |
| 3072, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1179648, | |
| "byteOffset": 22675968 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.6.intermediate.dense.q_scale", | |
| "shape": [ | |
| 3072, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 147456, | |
| "byteOffset": 23855616 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.6.output.LayerNorm.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 24003072 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.6.output.LayerNorm.weight", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 24004608 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.6.output.dense.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 24006144 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.6.output.dense.q_weight", | |
| "shape": [ | |
| 768, | |
| 384 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1179648, | |
| "byteOffset": 24007680 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.6.output.dense.q_scale", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 147456, | |
| "byteOffset": 25187328 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.7.attention.output.LayerNorm.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 25334784 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.7.attention.output.LayerNorm.weight", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 25336320 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.7.attention.output.dense.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 25337856 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.7.attention.output.dense.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 25339392 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.7.attention.output.dense.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 25634304 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.7.attention.self.key.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 25671168 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.7.attention.self.key.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 25672704 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.7.attention.self.key.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 25967616 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.7.attention.self.query.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 26004480 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.7.attention.self.query.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 26006016 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.7.attention.self.query.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 26300928 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.7.attention.self.value.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 26337792 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.7.attention.self.value.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 26339328 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.7.attention.self.value.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 26634240 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.7.intermediate.dense.bias", | |
| "shape": [ | |
| 3072 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 6144, | |
| "byteOffset": 26671104 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.7.intermediate.dense.q_weight", | |
| "shape": [ | |
| 3072, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1179648, | |
| "byteOffset": 26677248 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.7.intermediate.dense.q_scale", | |
| "shape": [ | |
| 3072, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 147456, | |
| "byteOffset": 27856896 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.7.output.LayerNorm.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 28004352 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.7.output.LayerNorm.weight", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 28005888 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.7.output.dense.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 28007424 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.7.output.dense.q_weight", | |
| "shape": [ | |
| 768, | |
| 384 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1179648, | |
| "byteOffset": 28008960 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.7.output.dense.q_scale", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 147456, | |
| "byteOffset": 29188608 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.8.attention.output.LayerNorm.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 29336064 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.8.attention.output.LayerNorm.weight", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 29337600 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.8.attention.output.dense.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 29339136 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.8.attention.output.dense.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 29340672 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.8.attention.output.dense.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 29635584 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.8.attention.self.key.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 29672448 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.8.attention.self.key.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 29673984 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.8.attention.self.key.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 29968896 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.8.attention.self.query.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 30005760 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.8.attention.self.query.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 30007296 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.8.attention.self.query.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 30302208 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.8.attention.self.value.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 30339072 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.8.attention.self.value.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 30340608 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.8.attention.self.value.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 30635520 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.8.intermediate.dense.bias", | |
| "shape": [ | |
| 3072 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 6144, | |
| "byteOffset": 30672384 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.8.intermediate.dense.q_weight", | |
| "shape": [ | |
| 3072, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1179648, | |
| "byteOffset": 30678528 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.8.intermediate.dense.q_scale", | |
| "shape": [ | |
| 3072, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 147456, | |
| "byteOffset": 31858176 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.8.output.LayerNorm.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 32005632 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.8.output.LayerNorm.weight", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 32007168 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.8.output.dense.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 32008704 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.8.output.dense.q_weight", | |
| "shape": [ | |
| 768, | |
| 384 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1179648, | |
| "byteOffset": 32010240 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.8.output.dense.q_scale", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 147456, | |
| "byteOffset": 33189888 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.9.attention.output.LayerNorm.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 33337344 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.9.attention.output.LayerNorm.weight", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 33338880 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.9.attention.output.dense.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 33340416 | |
| } | |
| ], | |
| "md5sum": "8b0fd20e5a0855fa66c9117ece20708e" | |
| }, | |
| { | |
| "dataPath": "params_shard_2.bin", | |
| "format": "raw-shard", | |
| "nbytes": 3996672, | |
| "records": [ | |
| { | |
| "name": "roberta.encoder.layer.9.attention.output.dense.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 0 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.9.attention.output.dense.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 294912 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.9.attention.self.key.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 331776 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.9.attention.self.key.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 333312 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.9.attention.self.key.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 628224 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.9.attention.self.query.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 665088 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.9.attention.self.query.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 666624 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.9.attention.self.query.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 961536 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.9.attention.self.value.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 998400 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.9.attention.self.value.q_weight", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 294912, | |
| "byteOffset": 999936 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.9.attention.self.value.q_scale", | |
| "shape": [ | |
| 768, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 36864, | |
| "byteOffset": 1294848 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.9.intermediate.dense.bias", | |
| "shape": [ | |
| 3072 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 6144, | |
| "byteOffset": 1331712 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.9.intermediate.dense.q_weight", | |
| "shape": [ | |
| 3072, | |
| 96 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1179648, | |
| "byteOffset": 1337856 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.9.intermediate.dense.q_scale", | |
| "shape": [ | |
| 3072, | |
| 24 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 147456, | |
| "byteOffset": 2517504 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.9.output.LayerNorm.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 2664960 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.9.output.LayerNorm.weight", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 2666496 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.9.output.dense.bias", | |
| "shape": [ | |
| 768 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1536, | |
| "byteOffset": 2668032 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.9.output.dense.q_weight", | |
| "shape": [ | |
| 768, | |
| 384 | |
| ], | |
| "dtype": "uint32", | |
| "format": "f32-to-bf16", | |
| "nbytes": 1179648, | |
| "byteOffset": 2669568 | |
| }, | |
| { | |
| "name": "roberta.encoder.layer.9.output.dense.q_scale", | |
| "shape": [ | |
| 768, | |
| 96 | |
| ], | |
| "dtype": "float16", | |
| "format": "f32-to-bf16", | |
| "nbytes": 147456, | |
| "byteOffset": 3849216 | |
| } | |
| ], | |
| "md5sum": "1866e62a76c020b89886cd93720b6256" | |
| } | |
| ] | |
| } |