File size: 2,101 Bytes
18f4d80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
{
  "baseline_fp32": {
    "scenario": "baseline_fp32",
    "load_s": 6.543875241004571,
    "tokenize_s": 0.0005066652502137003,
    "prefill_forward_s": 0.20371673600129725,
    "first_token_latency_s": 0.15924113799883344,
    "generate_s": 6.533192098997461,
    "decode_tokens_per_s": 9.796191854460172,
    "token_match_vs_baseline": 1.0,
    "rss_before_load_gb": 0.41039276123046875,
    "rss_after_load_gb": 2.2797927856445312,
    "rss_after_bench_gb": 2.3897438049316406
  },
  "rotorquant_pkg": {
    "scenario": "rotorquant_pkg",
    "load_s": 6.701765562000219,
    "tokenize_s": 0.0004997224987164373,
    "prefill_forward_s": 0.194320453751061,
    "first_token_latency_s": 0.15327360125047562,
    "generate_s": 6.670602966249135,
    "decode_tokens_per_s": 9.597136662718697,
    "token_match_vs_baseline": 0.08203125,
    "rss_before_load_gb": 0.7134361267089844,
    "rss_after_load_gb": 2.7142677307128906,
    "rss_after_bench_gb": 2.7142677307128906,
    "delta_vs_baseline": {
      "load_s": 0.15789032099564793,
      "prefill_forward_s": -0.009396282250236254,
      "first_token_latency_s": -0.005967536748357816,
      "generate_s": 0.13741086725167406,
      "decode_tokens_per_s": -0.19905519174147557,
      "rss_after_load_gb": 0.4344749450683594
    }
  },
  "runtime_dynamic_int8": {
    "scenario": "runtime_dynamic_int8",
    "load_s": 4.91570828499971,
    "tokenize_s": 0.0004622100004780805,
    "prefill_forward_s": 0.13502329399852897,
    "first_token_latency_s": 0.09539279499949771,
    "generate_s": 3.9486844472521625,
    "decode_tokens_per_s": 16.208229137967894,
    "token_match_vs_baseline": 0.015625,
    "rss_before_load_gb": 1.4759712219238281,
    "rss_after_load_gb": 2.553119659423828,
    "rss_after_bench_gb": 2.5759239196777344,
    "delta_vs_baseline": {
      "load_s": -1.6281669560048613,
      "prefill_forward_s": -0.06869344200276828,
      "first_token_latency_s": -0.06384834299933573,
      "generate_s": -2.584507651745298,
      "decode_tokens_per_s": 6.412037283507722,
      "rss_after_load_gb": 0.2733268737792969
    }
  }
}