File size: 2,913 Bytes
18f4d80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
{
  "baseline_fp32": {
    "scenario": "baseline_fp32",
    "load_s": 6.4945597410041955,
    "tokenize_s": 0.0005106902499392163,
    "prefill_forward_s": 0.19786394450056832,
    "first_token_latency_s": 0.1586934420010948,
    "generate_s": 6.546512500754034,
    "decode_tokens_per_s": 9.776255744739366,
    "token_match_vs_baseline": 1.0,
    "rss_before_load_gb": 0.4108467102050781,
    "rss_after_load_gb": 2.280529022216797,
    "rss_after_bench_gb": 2.3896942138671875
  },
  "rotorquant_pkg": {
    "scenario": "rotorquant_pkg",
    "load_s": 6.5327220170001965,
    "tokenize_s": 0.0006985757481743349,
    "prefill_forward_s": 0.18398665499989875,
    "first_token_latency_s": 0.15238651200161257,
    "generate_s": 6.707383360248059,
    "decode_tokens_per_s": 9.551668113447072,
    "token_match_vs_baseline": 0.08203125,
    "rss_before_load_gb": 0.7133522033691406,
    "rss_after_load_gb": 2.70635986328125,
    "rss_after_bench_gb": 2.70635986328125,
    "delta_vs_baseline": {
      "load_s": 0.03816227599600097,
      "prefill_forward_s": -0.013877289500669576,
      "first_token_latency_s": -0.006306929999482236,
      "generate_s": 0.16087085949402535,
      "decode_tokens_per_s": -0.22458763129229453,
      "rss_after_load_gb": 0.4258308410644531
    }
  },
  "rotorquant_fused_runtime": {
    "scenario": "rotorquant_fused_runtime",
    "load_s": 3.895400952002092,
    "tokenize_s": 0.0004967142504028743,
    "prefill_forward_s": 0.5016050927497417,
    "first_token_latency_s": 0.1620842227475805,
    "generate_s": 6.19019198299975,
    "decode_tokens_per_s": 10.339034218331195,
    "token_match_vs_baseline": 0.00390625,
    "rss_before_load_gb": 1.5660324096679688,
    "rss_after_load_gb": 2.888378143310547,
    "rss_after_bench_gb": 3.717845916748047,
    "delta_vs_baseline": {
      "load_s": -2.5991587890021037,
      "prefill_forward_s": 0.3037411482491734,
      "first_token_latency_s": 0.0033907807464856887,
      "generate_s": -0.3563205177542841,
      "decode_tokens_per_s": 0.5627784735918286,
      "rss_after_load_gb": 0.60784912109375
    }
  },
  "runtime_dynamic_int8": {
    "scenario": "runtime_dynamic_int8",
    "load_s": 4.646019098996476,
    "tokenize_s": 0.0004673119983635843,
    "prefill_forward_s": 0.1323147195016645,
    "first_token_latency_s": 0.09710656300012488,
    "generate_s": 4.059147370002393,
    "decode_tokens_per_s": 15.767303236288763,
    "token_match_vs_baseline": 0.015625,
    "rss_before_load_gb": 1.7176094055175781,
    "rss_after_load_gb": 2.645465850830078,
    "rss_after_bench_gb": 2.6685028076171875,
    "delta_vs_baseline": {
      "load_s": -1.8485406420077197,
      "prefill_forward_s": -0.06554922499890381,
      "first_token_latency_s": -0.06158687900096993,
      "generate_s": -2.487365130751641,
      "decode_tokens_per_s": 5.991047491549397,
      "rss_after_load_gb": 0.36493682861328125
    }
  }
}