{ "model_id": "Qwen/Qwen2.5-0.5B-Instruct", "dtype": "float32", "max_new_tokens": 64, "scenarios": { "baseline_fp": { "scenario": "baseline_fp", "kind": "baseline", "load_s": 6.473804672998085, "tokenize_s": 0.0005482277483679354, "prefill_forward_s": 0.201041800000894, "first_token_latency_s": 0.15509662850126915, "generate_s": 6.624326677499994, "decode_tokens_per_s": 9.662278657664311, "token_match_vs_baseline": 1.0, "rss_before_load_gb": 0.4110565185546875, "rss_after_load_gb": 2.2805099487304688, "rss_after_bench_gb": 2.3700904846191406, "meta": null, "delta_vs_baseline": { "load_s": 0.0, "tokenize_s": 0.0, "prefill_forward_s": 0.0, "first_token_latency_s": 0.0, "generate_s": 0.0, "decode_tokens_per_s": 0.0, "rss_after_load_gb": 0.0, "rss_after_bench_gb": 0.0 } }, "qwen2.5-0.5b-3bit-norotor-skipemb": { "scenario": "qwen2.5-0.5b-3bit-norotor-skipemb", "kind": "quantized", "load_s": 5.841964585000824, "tokenize_s": 0.0004919829989376012, "prefill_forward_s": 0.1902955715013377, "first_token_latency_s": 0.1538466910005809, "generate_s": 6.760918115751338, "decode_tokens_per_s": 9.467052691535141, "token_match_vs_baseline": 0.015625, "rss_before_load_gb": 0.6937255859375, "rss_after_load_gb": 3.087230682373047, "rss_after_bench_gb": 3.1112403869628906, "meta": { "path": "artifacts/qwen2.5-0.5b-3bit-norotor-skipemb.pt", "bits": 3, "block_size": 128, "lowrank_rank": 0, "rotor_angle_scale": 0.0, "rowwise": false, "num_quantized": 168, "num_passthrough": 123 }, "delta_vs_baseline": { "load_s": -0.6318400879972614, "tokenize_s": -5.6244749430334195e-05, "prefill_forward_s": -0.01074622849955631, "first_token_latency_s": -0.0012499375006882474, "generate_s": 0.1365914382513438, "decode_tokens_per_s": -0.1952259661291702, "rss_after_load_gb": 0.8067207336425781, "rss_after_bench_gb": 0.74114990234375 } }, "qwen2.5-0.5b-rotorq3-centers-skipemb": { "scenario": "qwen2.5-0.5b-rotorq3-centers-skipemb", "kind": "quantized", "load_s": 6.22768382600043, "tokenize_s": 0.000522547246873728, "prefill_forward_s": 0.19800069149823685, "first_token_latency_s": 0.16200276899871824, "generate_s": 6.969514832997447, "decode_tokens_per_s": 9.195669356282204, "token_match_vs_baseline": 0.01953125, "rss_before_load_gb": 2.2186622619628906, "rss_after_load_gb": 3.4447555541992188, "rss_after_bench_gb": 3.4447555541992188, "meta": { "path": "artifacts/qwen2.5-0.5b-rotorq3-centers-skipemb.pt", "bits": 3, "block_size": 128, "lowrank_rank": 0, "rotor_angle_scale": 1.0, "rowwise": false, "num_quantized": 168, "num_passthrough": 123 }, "delta_vs_baseline": { "load_s": -0.24612084699765546, "tokenize_s": -2.568050149420742e-05, "prefill_forward_s": -0.0030411085026571527, "first_token_latency_s": 0.006906140497449087, "generate_s": 0.3451881554974534, "decode_tokens_per_s": -0.4666093013821069, "rss_after_load_gb": 1.16424560546875, "rss_after_bench_gb": 1.0746650695800781 } }, "qwen2.5-0.5b-rotorq3-flat-skipemb-r8": { "scenario": "qwen2.5-0.5b-rotorq3-flat-skipemb-r8", "kind": "quantized", "load_s": 6.1000825989976875, "tokenize_s": 0.0005129745004524011, "prefill_forward_s": 0.19054085675088572, "first_token_latency_s": 0.15325999474953278, "generate_s": 6.9399479319999955, "decode_tokens_per_s": 9.237942900430912, "token_match_vs_baseline": 0.00390625, "rss_before_load_gb": 2.5487518310546875, "rss_after_load_gb": 3.445331573486328, "rss_after_bench_gb": 3.445331573486328, "meta": { "path": "artifacts/qwen2.5-0.5b-rotorq3-flat-skipemb-r8.pt", "bits": 3, "block_size": 128, "lowrank_rank": 8, "rotor_angle_scale": 1.0, "rowwise": false, "num_quantized": 168, "num_passthrough": 123 }, "delta_vs_baseline": { "load_s": -0.3737220740003977, "tokenize_s": -3.52532479155343e-05, "prefill_forward_s": -0.010500943250008277, "first_token_latency_s": -0.0018366337517363718, "generate_s": 0.31562125450000167, "decode_tokens_per_s": -0.4243357572333988, "rss_after_load_gb": 1.1648216247558594, "rss_after_bench_gb": 1.0752410888671875 } }, "qwen2.5-0.5b-rotorq3-mlp-only": { "scenario": "qwen2.5-0.5b-rotorq3-mlp-only", "kind": "quantized", "load_s": 6.21272536500328, "tokenize_s": 0.0005316745027812431, "prefill_forward_s": 0.1856292709999252, "first_token_latency_s": 0.15140363250066002, "generate_s": 6.670889891249317, "decode_tokens_per_s": 9.598262406082874, "token_match_vs_baseline": 0.08203125, "rss_before_load_gb": 2.576690673828125, "rss_after_load_gb": 3.74371337890625, "rss_after_bench_gb": 3.7438125610351562, "meta": { "path": "artifacts/qwen2.5-0.5b-rotorq3-mlp-only.pt", "bits": 3, "block_size": 128, "lowrank_rank": 0, "rotor_angle_scale": 1.0, "rowwise": false, "num_quantized": 72, "num_passthrough": 219 }, "delta_vs_baseline": { "load_s": -0.2610793079948053, "tokenize_s": -1.6553245586692356e-05, "prefill_forward_s": -0.015412529000968789, "first_token_latency_s": -0.003692996000609128, "generate_s": 0.04656321374932304, "decode_tokens_per_s": -0.06401625158143709, "rss_after_load_gb": 1.4632034301757812, "rss_after_bench_gb": 1.3737220764160156 } }, "qwen2.5-0.5b-rotorq3-rowwise-skipemb": { "scenario": "qwen2.5-0.5b-rotorq3-rowwise-skipemb", "kind": "quantized", "load_s": 5.950046132005809, "tokenize_s": 0.0005396470023697475, "prefill_forward_s": 0.1872971802495158, "first_token_latency_s": 0.15000393424816139, "generate_s": 6.734860859251057, "decode_tokens_per_s": 9.505182046546704, "token_match_vs_baseline": 0.0234375, "rss_before_load_gb": 2.6743202209472656, "rss_after_load_gb": 3.7621498107910156, "rss_after_bench_gb": 3.7620086669921875, "meta": { "path": "artifacts/qwen2.5-0.5b-rotorq3-rowwise-skipemb.pt", "bits": 3, "block_size": 128, "lowrank_rank": 0, "rotor_angle_scale": 1.0, "rowwise": false, "num_quantized": 168, "num_passthrough": 123 }, "delta_vs_baseline": { "load_s": -0.5237585409922758, "tokenize_s": -8.580745998187922e-06, "prefill_forward_s": -0.013744619751378195, "first_token_latency_s": -0.005092694253107766, "generate_s": 0.11053418175106344, "decode_tokens_per_s": -0.1570966111176073, "rss_after_load_gb": 1.4816398620605469, "rss_after_bench_gb": 1.3919181823730469 } }, "qwen2.5-0.5b-rotorq3": { "scenario": "qwen2.5-0.5b-rotorq3", "kind": "quantized", "load_s": 8.498550903997966, "tokenize_s": 0.0005228027494013077, "prefill_forward_s": 0.19032409149986051, "first_token_latency_s": 0.1559712282505643, "generate_s": 7.009008278000692, "decode_tokens_per_s": 9.14007514579254, "token_match_vs_baseline": 0.015625, "rss_before_load_gb": 2.47454833984375, "rss_after_load_gb": 3.1861610412597656, "rss_after_bench_gb": 3.18621826171875, "meta": { "path": "artifacts/qwen2.5-0.5b-rotorq3.pt", "bits": 3, "block_size": 128, "lowrank_rank": 0, "rotor_angle_scale": 1.0, "rowwise": false, "num_quantized": 170, "num_passthrough": 121 }, "delta_vs_baseline": { "load_s": 2.024746230999881, "tokenize_s": -2.5424998966627754e-05, "prefill_forward_s": -0.010717708501033485, "first_token_latency_s": 0.0008745997492951574, "generate_s": 0.38468160050069855, "decode_tokens_per_s": -0.5222035118717709, "rss_after_load_gb": 0.9056510925292969, "rss_after_bench_gb": 0.8161277770996094 } } } }