RotorQuant-ModelWeights-Runtime / artifacts /benchmark_results.json
cnmoro's picture
Upload 29 files
18f4d80 verified
{
"model_id": "Qwen/Qwen2.5-0.5B-Instruct",
"dtype": "float32",
"max_new_tokens": 64,
"scenarios": {
"baseline_fp": {
"scenario": "baseline_fp",
"kind": "baseline",
"load_s": 6.473804672998085,
"tokenize_s": 0.0005482277483679354,
"prefill_forward_s": 0.201041800000894,
"first_token_latency_s": 0.15509662850126915,
"generate_s": 6.624326677499994,
"decode_tokens_per_s": 9.662278657664311,
"token_match_vs_baseline": 1.0,
"rss_before_load_gb": 0.4110565185546875,
"rss_after_load_gb": 2.2805099487304688,
"rss_after_bench_gb": 2.3700904846191406,
"meta": null,
"delta_vs_baseline": {
"load_s": 0.0,
"tokenize_s": 0.0,
"prefill_forward_s": 0.0,
"first_token_latency_s": 0.0,
"generate_s": 0.0,
"decode_tokens_per_s": 0.0,
"rss_after_load_gb": 0.0,
"rss_after_bench_gb": 0.0
}
},
"qwen2.5-0.5b-3bit-norotor-skipemb": {
"scenario": "qwen2.5-0.5b-3bit-norotor-skipemb",
"kind": "quantized",
"load_s": 5.841964585000824,
"tokenize_s": 0.0004919829989376012,
"prefill_forward_s": 0.1902955715013377,
"first_token_latency_s": 0.1538466910005809,
"generate_s": 6.760918115751338,
"decode_tokens_per_s": 9.467052691535141,
"token_match_vs_baseline": 0.015625,
"rss_before_load_gb": 0.6937255859375,
"rss_after_load_gb": 3.087230682373047,
"rss_after_bench_gb": 3.1112403869628906,
"meta": {
"path": "artifacts/qwen2.5-0.5b-3bit-norotor-skipemb.pt",
"bits": 3,
"block_size": 128,
"lowrank_rank": 0,
"rotor_angle_scale": 0.0,
"rowwise": false,
"num_quantized": 168,
"num_passthrough": 123
},
"delta_vs_baseline": {
"load_s": -0.6318400879972614,
"tokenize_s": -5.6244749430334195e-05,
"prefill_forward_s": -0.01074622849955631,
"first_token_latency_s": -0.0012499375006882474,
"generate_s": 0.1365914382513438,
"decode_tokens_per_s": -0.1952259661291702,
"rss_after_load_gb": 0.8067207336425781,
"rss_after_bench_gb": 0.74114990234375
}
},
"qwen2.5-0.5b-rotorq3-centers-skipemb": {
"scenario": "qwen2.5-0.5b-rotorq3-centers-skipemb",
"kind": "quantized",
"load_s": 6.22768382600043,
"tokenize_s": 0.000522547246873728,
"prefill_forward_s": 0.19800069149823685,
"first_token_latency_s": 0.16200276899871824,
"generate_s": 6.969514832997447,
"decode_tokens_per_s": 9.195669356282204,
"token_match_vs_baseline": 0.01953125,
"rss_before_load_gb": 2.2186622619628906,
"rss_after_load_gb": 3.4447555541992188,
"rss_after_bench_gb": 3.4447555541992188,
"meta": {
"path": "artifacts/qwen2.5-0.5b-rotorq3-centers-skipemb.pt",
"bits": 3,
"block_size": 128,
"lowrank_rank": 0,
"rotor_angle_scale": 1.0,
"rowwise": false,
"num_quantized": 168,
"num_passthrough": 123
},
"delta_vs_baseline": {
"load_s": -0.24612084699765546,
"tokenize_s": -2.568050149420742e-05,
"prefill_forward_s": -0.0030411085026571527,
"first_token_latency_s": 0.006906140497449087,
"generate_s": 0.3451881554974534,
"decode_tokens_per_s": -0.4666093013821069,
"rss_after_load_gb": 1.16424560546875,
"rss_after_bench_gb": 1.0746650695800781
}
},
"qwen2.5-0.5b-rotorq3-flat-skipemb-r8": {
"scenario": "qwen2.5-0.5b-rotorq3-flat-skipemb-r8",
"kind": "quantized",
"load_s": 6.1000825989976875,
"tokenize_s": 0.0005129745004524011,
"prefill_forward_s": 0.19054085675088572,
"first_token_latency_s": 0.15325999474953278,
"generate_s": 6.9399479319999955,
"decode_tokens_per_s": 9.237942900430912,
"token_match_vs_baseline": 0.00390625,
"rss_before_load_gb": 2.5487518310546875,
"rss_after_load_gb": 3.445331573486328,
"rss_after_bench_gb": 3.445331573486328,
"meta": {
"path": "artifacts/qwen2.5-0.5b-rotorq3-flat-skipemb-r8.pt",
"bits": 3,
"block_size": 128,
"lowrank_rank": 8,
"rotor_angle_scale": 1.0,
"rowwise": false,
"num_quantized": 168,
"num_passthrough": 123
},
"delta_vs_baseline": {
"load_s": -0.3737220740003977,
"tokenize_s": -3.52532479155343e-05,
"prefill_forward_s": -0.010500943250008277,
"first_token_latency_s": -0.0018366337517363718,
"generate_s": 0.31562125450000167,
"decode_tokens_per_s": -0.4243357572333988,
"rss_after_load_gb": 1.1648216247558594,
"rss_after_bench_gb": 1.0752410888671875
}
},
"qwen2.5-0.5b-rotorq3-mlp-only": {
"scenario": "qwen2.5-0.5b-rotorq3-mlp-only",
"kind": "quantized",
"load_s": 6.21272536500328,
"tokenize_s": 0.0005316745027812431,
"prefill_forward_s": 0.1856292709999252,
"first_token_latency_s": 0.15140363250066002,
"generate_s": 6.670889891249317,
"decode_tokens_per_s": 9.598262406082874,
"token_match_vs_baseline": 0.08203125,
"rss_before_load_gb": 2.576690673828125,
"rss_after_load_gb": 3.74371337890625,
"rss_after_bench_gb": 3.7438125610351562,
"meta": {
"path": "artifacts/qwen2.5-0.5b-rotorq3-mlp-only.pt",
"bits": 3,
"block_size": 128,
"lowrank_rank": 0,
"rotor_angle_scale": 1.0,
"rowwise": false,
"num_quantized": 72,
"num_passthrough": 219
},
"delta_vs_baseline": {
"load_s": -0.2610793079948053,
"tokenize_s": -1.6553245586692356e-05,
"prefill_forward_s": -0.015412529000968789,
"first_token_latency_s": -0.003692996000609128,
"generate_s": 0.04656321374932304,
"decode_tokens_per_s": -0.06401625158143709,
"rss_after_load_gb": 1.4632034301757812,
"rss_after_bench_gb": 1.3737220764160156
}
},
"qwen2.5-0.5b-rotorq3-rowwise-skipemb": {
"scenario": "qwen2.5-0.5b-rotorq3-rowwise-skipemb",
"kind": "quantized",
"load_s": 5.950046132005809,
"tokenize_s": 0.0005396470023697475,
"prefill_forward_s": 0.1872971802495158,
"first_token_latency_s": 0.15000393424816139,
"generate_s": 6.734860859251057,
"decode_tokens_per_s": 9.505182046546704,
"token_match_vs_baseline": 0.0234375,
"rss_before_load_gb": 2.6743202209472656,
"rss_after_load_gb": 3.7621498107910156,
"rss_after_bench_gb": 3.7620086669921875,
"meta": {
"path": "artifacts/qwen2.5-0.5b-rotorq3-rowwise-skipemb.pt",
"bits": 3,
"block_size": 128,
"lowrank_rank": 0,
"rotor_angle_scale": 1.0,
"rowwise": false,
"num_quantized": 168,
"num_passthrough": 123
},
"delta_vs_baseline": {
"load_s": -0.5237585409922758,
"tokenize_s": -8.580745998187922e-06,
"prefill_forward_s": -0.013744619751378195,
"first_token_latency_s": -0.005092694253107766,
"generate_s": 0.11053418175106344,
"decode_tokens_per_s": -0.1570966111176073,
"rss_after_load_gb": 1.4816398620605469,
"rss_after_bench_gb": 1.3919181823730469
}
},
"qwen2.5-0.5b-rotorq3": {
"scenario": "qwen2.5-0.5b-rotorq3",
"kind": "quantized",
"load_s": 8.498550903997966,
"tokenize_s": 0.0005228027494013077,
"prefill_forward_s": 0.19032409149986051,
"first_token_latency_s": 0.1559712282505643,
"generate_s": 7.009008278000692,
"decode_tokens_per_s": 9.14007514579254,
"token_match_vs_baseline": 0.015625,
"rss_before_load_gb": 2.47454833984375,
"rss_after_load_gb": 3.1861610412597656,
"rss_after_bench_gb": 3.18621826171875,
"meta": {
"path": "artifacts/qwen2.5-0.5b-rotorq3.pt",
"bits": 3,
"block_size": 128,
"lowrank_rank": 0,
"rotor_angle_scale": 1.0,
"rowwise": false,
"num_quantized": 170,
"num_passthrough": 121
},
"delta_vs_baseline": {
"load_s": 2.024746230999881,
"tokenize_s": -2.5424998966627754e-05,
"prefill_forward_s": -0.010717708501033485,
"first_token_latency_s": 0.0008745997492951574,
"generate_s": 0.38468160050069855,
"decode_tokens_per_s": -0.5222035118717709,
"rss_after_load_gb": 0.9056510925292969,
"rss_after_bench_gb": 0.8161277770996094
}
}
}
}