| { | |
| "measurement": { | |
| "model.layers.0": { | |
| "accuracy": 0.9661248086486012, | |
| "total_bits": 1525596160.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.1": { | |
| "accuracy": 0.9785404809517786, | |
| "total_bits": 1907834880.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 64 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.2": { | |
| "accuracy": 0.9833880708320066, | |
| "total_bits": 1907834880.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 64 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.3": { | |
| "accuracy": 0.9813712932809722, | |
| "total_bits": 1907834880.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 64 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.4": { | |
| "accuracy": 0.9686759596224874, | |
| "total_bits": 1673707520.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.5": { | |
| "accuracy": 0.9793449916178361, | |
| "total_bits": 1907834880.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 64 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.6": { | |
| "accuracy": 0.970316064893268, | |
| "total_bits": 1368227840.0, | |
| "o_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.7": { | |
| "accuracy": 0.9780687727616169, | |
| "total_bits": 1368227840.0, | |
| "o_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.8": { | |
| "accuracy": 0.9721175750019029, | |
| "total_bits": 1389690880.0, | |
| "o_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 32 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.9": { | |
| "accuracy": 0.9799465201795101, | |
| "total_bits": 1210859520.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.10": { | |
| "accuracy": 0.9813145939260721, | |
| "total_bits": 1210859520.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.11": { | |
| "accuracy": 0.980103303474607, | |
| "total_bits": 1210859520.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.12": { | |
| "accuracy": 0.9790799530746881, | |
| "total_bits": 1210859520.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.13": { | |
| "accuracy": 0.9791411333135329, | |
| "total_bits": 1210859520.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.14": { | |
| "accuracy": 0.9801955314178485, | |
| "total_bits": 1210859520.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.15": { | |
| "accuracy": 0.9809209857194219, | |
| "total_bits": 1210859520.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.16": { | |
| "accuracy": 0.9818843859247863, | |
| "total_bits": 1210859520.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.17": { | |
| "accuracy": 0.9818928238819353, | |
| "total_bits": 1210859520.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.18": { | |
| "accuracy": 0.9808447815885302, | |
| "total_bits": 1210859520.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.19": { | |
| "accuracy": 0.97998259801534, | |
| "total_bits": 1210859520.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.20": { | |
| "accuracy": 0.9781280544120818, | |
| "total_bits": 1210859520.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.21": { | |
| "accuracy": 0.9754287740797736, | |
| "total_bits": 1389690880.0, | |
| "o_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 32 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.22": { | |
| "accuracy": 0.9729584318702109, | |
| "total_bits": 1389690880.0, | |
| "o_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 32 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.23": { | |
| "accuracy": 0.9742521937005222, | |
| "total_bits": 1389690880.0, | |
| "o_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 32 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.24": { | |
| "accuracy": 0.9796121049439535, | |
| "total_bits": 1525596160.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.25": { | |
| "accuracy": 0.9779687729896978, | |
| "total_bits": 1525596160.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.26": { | |
| "accuracy": 0.9758887836069334, | |
| "total_bits": 1525596160.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.27": { | |
| "accuracy": 0.9721684899996035, | |
| "total_bits": 1368227840.0, | |
| "o_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.28": { | |
| "accuracy": 0.9715180018101819, | |
| "total_bits": 1368227840.0, | |
| "o_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.29": { | |
| "accuracy": 0.9716640808619559, | |
| "total_bits": 1368227840.0, | |
| "o_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.30": { | |
| "accuracy": 0.9718001659493893, | |
| "total_bits": 1368227840.0, | |
| "o_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.31": { | |
| "accuracy": 0.9728564391261898, | |
| "total_bits": 1368227840.0, | |
| "o_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.32": { | |
| "accuracy": 0.9721448724158108, | |
| "total_bits": 1368227840.0, | |
| "o_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.33": { | |
| "accuracy": 0.9738507495494559, | |
| "total_bits": 1368227840.0, | |
| "o_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.34": { | |
| "accuracy": 0.9751215526484884, | |
| "total_bits": 1368227840.0, | |
| "o_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.35": { | |
| "accuracy": 0.9743494535796344, | |
| "total_bits": 1368227840.0, | |
| "o_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.36": { | |
| "accuracy": 0.9713627853780054, | |
| "total_bits": 1368227840.0, | |
| "o_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.37": { | |
| "accuracy": 0.9725822330219671, | |
| "total_bits": 1442283520.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.38": { | |
| "accuracy": 0.9720290648401715, | |
| "total_bits": 1442283520.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.39": { | |
| "accuracy": 0.9729969800682738, | |
| "total_bits": 1451540480.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.40": { | |
| "accuracy": 0.970999498560559, | |
| "total_bits": 1442283520.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.41": { | |
| "accuracy": 0.9725817049038596, | |
| "total_bits": 1451540480.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.42": { | |
| "accuracy": 0.9711993671371602, | |
| "total_bits": 1451540480.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.43": { | |
| "accuracy": 0.960931375680957, | |
| "total_bits": 1525596160.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.44": { | |
| "accuracy": 0.9748161096940748, | |
| "total_bits": 1442283520.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.45": { | |
| "accuracy": 0.9736216114251874, | |
| "total_bits": 1442283520.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.46": { | |
| "accuracy": 0.9729240773012862, | |
| "total_bits": 1442283520.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.47": { | |
| "accuracy": 0.9731381346937269, | |
| "total_bits": 1451540480.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.48": { | |
| "accuracy": 0.9720914633944631, | |
| "total_bits": 1516339200.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.49": { | |
| "accuracy": 0.9695252749952488, | |
| "total_bits": 1516339200.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.50": { | |
| "accuracy": 0.9713969179429114, | |
| "total_bits": 1673707520.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.51": { | |
| "accuracy": 0.9775994116789661, | |
| "total_bits": 1907834880.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 64 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.52": { | |
| "accuracy": 0.974896585161332, | |
| "total_bits": 1907834880.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 64 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.53": { | |
| "accuracy": 0.9751978002022952, | |
| "total_bits": 1907834880.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 64 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.54": { | |
| "accuracy": 0.9760590526857413, | |
| "total_bits": 1907834880.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 64 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.55": { | |
| "accuracy": 0.9755064776982181, | |
| "total_bits": 1907834880.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 64 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.56": { | |
| "accuracy": 0.9770963180344552, | |
| "total_bits": 1907834880.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 64 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.57": { | |
| "accuracy": 0.9766844679252245, | |
| "total_bits": 1907834880.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 64 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.58": { | |
| "accuracy": 0.9707656112732366, | |
| "total_bits": 1442283520.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.59": { | |
| "accuracy": 0.992732141749002, | |
| "total_bits": 1210859520.0, | |
| "o_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.60": { | |
| "accuracy": 0.9944915603846312, | |
| "total_bits": 1210859520.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.61": { | |
| "accuracy": 0.993012685328722, | |
| "total_bits": 1210859520.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.62": { | |
| "accuracy": 0.9904588051140308, | |
| "total_bits": 1210859520.0, | |
| "o_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| }, | |
| "model.layers.63": { | |
| "accuracy": 0.9784369049593806, | |
| "total_bits": 1368227840.0, | |
| "o_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "down_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "q_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "k_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "v_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "gate_proj": { | |
| "group_size": { | |
| "2": 32 | |
| }, | |
| "bits": [ | |
| 2 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| }, | |
| "up_proj": { | |
| "group_size": { | |
| "4": 128 | |
| }, | |
| "bits": [ | |
| 4 | |
| ], | |
| "bits_prop": [ | |
| 1 | |
| ], | |
| "scale_bits": 4, | |
| "scale_groups:": 32 | |
| } | |
| } | |
| } | |
| } | |