File size: 2,097 Bytes
18f4d80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
{
  "baseline_fp32": {
    "scenario": "baseline_fp32",
    "load_s": 6.827268551001907,
    "tokenize_s": 0.0005446634986583376,
    "prefill_forward_s": 0.20426781075184408,
    "first_token_latency_s": 0.15614239850037848,
    "generate_s": 6.644134370999382,
    "decode_tokens_per_s": 9.63279738845269,
    "token_match_vs_baseline": 1.0,
    "rss_before_load_gb": 0.41101837158203125,
    "rss_after_load_gb": 2.2806396484375,
    "rss_after_bench_gb": 2.3895835876464844
  },
  "rotorquant_pkg": {
    "scenario": "rotorquant_pkg",
    "load_s": 6.679943737995927,
    "tokenize_s": 0.0004972177503077546,
    "prefill_forward_s": 0.189673415499783,
    "first_token_latency_s": 0.15492356824870512,
    "generate_s": 6.788896262753042,
    "decode_tokens_per_s": 9.428512414252518,
    "token_match_vs_baseline": 0.08203125,
    "rss_before_load_gb": 0.7132225036621094,
    "rss_after_load_gb": 2.7602615356445312,
    "rss_after_bench_gb": 2.7602615356445312,
    "delta_vs_baseline": {
      "load_s": -0.1473248130059801,
      "prefill_forward_s": -0.014594395252061076,
      "first_token_latency_s": -0.001218830251673353,
      "generate_s": 0.14476189175366017,
      "decode_tokens_per_s": -0.20428497420017244,
      "rss_after_load_gb": 0.47962188720703125
    }
  },
  "runtime_dynamic_int8": {
    "scenario": "runtime_dynamic_int8",
    "load_s": 5.673944287002087,
    "tokenize_s": 0.0005328417501004878,
    "prefill_forward_s": 0.08282363574653573,
    "first_token_latency_s": 0.07344392174854875,
    "generate_s": 2.5142489557511,
    "decode_tokens_per_s": 25.45832190426116,
    "token_match_vs_baseline": 0.00390625,
    "rss_before_load_gb": 1.6898918151855469,
    "rss_after_load_gb": 2.797016143798828,
    "rss_after_bench_gb": 2.798816680908203,
    "delta_vs_baseline": {
      "load_s": -1.1533242639998207,
      "prefill_forward_s": -0.12144417500530835,
      "first_token_latency_s": -0.08269847675182973,
      "generate_s": -4.129885415248282,
      "decode_tokens_per_s": 15.825524515808471,
      "rss_after_load_gb": 0.5163764953613281
    }
  }
}