RotorQuant-ModelWeights-Runtime / artifacts /runtime_benchmark_with_fused.json
cnmoro's picture
Upload 29 files
18f4d80 verified
{
"baseline_fp32": {
"scenario": "baseline_fp32",
"load_s": 6.4945597410041955,
"tokenize_s": 0.0005106902499392163,
"prefill_forward_s": 0.19786394450056832,
"first_token_latency_s": 0.1586934420010948,
"generate_s": 6.546512500754034,
"decode_tokens_per_s": 9.776255744739366,
"token_match_vs_baseline": 1.0,
"rss_before_load_gb": 0.4108467102050781,
"rss_after_load_gb": 2.280529022216797,
"rss_after_bench_gb": 2.3896942138671875
},
"rotorquant_pkg": {
"scenario": "rotorquant_pkg",
"load_s": 6.5327220170001965,
"tokenize_s": 0.0006985757481743349,
"prefill_forward_s": 0.18398665499989875,
"first_token_latency_s": 0.15238651200161257,
"generate_s": 6.707383360248059,
"decode_tokens_per_s": 9.551668113447072,
"token_match_vs_baseline": 0.08203125,
"rss_before_load_gb": 0.7133522033691406,
"rss_after_load_gb": 2.70635986328125,
"rss_after_bench_gb": 2.70635986328125,
"delta_vs_baseline": {
"load_s": 0.03816227599600097,
"prefill_forward_s": -0.013877289500669576,
"first_token_latency_s": -0.006306929999482236,
"generate_s": 0.16087085949402535,
"decode_tokens_per_s": -0.22458763129229453,
"rss_after_load_gb": 0.4258308410644531
}
},
"rotorquant_fused_runtime": {
"scenario": "rotorquant_fused_runtime",
"load_s": 3.895400952002092,
"tokenize_s": 0.0004967142504028743,
"prefill_forward_s": 0.5016050927497417,
"first_token_latency_s": 0.1620842227475805,
"generate_s": 6.19019198299975,
"decode_tokens_per_s": 10.339034218331195,
"token_match_vs_baseline": 0.00390625,
"rss_before_load_gb": 1.5660324096679688,
"rss_after_load_gb": 2.888378143310547,
"rss_after_bench_gb": 3.717845916748047,
"delta_vs_baseline": {
"load_s": -2.5991587890021037,
"prefill_forward_s": 0.3037411482491734,
"first_token_latency_s": 0.0033907807464856887,
"generate_s": -0.3563205177542841,
"decode_tokens_per_s": 0.5627784735918286,
"rss_after_load_gb": 0.60784912109375
}
},
"runtime_dynamic_int8": {
"scenario": "runtime_dynamic_int8",
"load_s": 4.646019098996476,
"tokenize_s": 0.0004673119983635843,
"prefill_forward_s": 0.1323147195016645,
"first_token_latency_s": 0.09710656300012488,
"generate_s": 4.059147370002393,
"decode_tokens_per_s": 15.767303236288763,
"token_match_vs_baseline": 0.015625,
"rss_before_load_gb": 1.7176094055175781,
"rss_after_load_gb": 2.645465850830078,
"rss_after_bench_gb": 2.6685028076171875,
"delta_vs_baseline": {
"load_s": -1.8485406420077197,
"prefill_forward_s": -0.06554922499890381,
"first_token_latency_s": -0.06158687900096993,
"generate_s": -2.487365130751641,
"decode_tokens_per_s": 5.991047491549397,
"rss_after_load_gb": 0.36493682861328125
}
}
}