xiaosuhu1986 commited on
Commit
4a73369
·
verified ·
1 Parent(s): d8bcd5b

Add GPU selection and token calc method (#1)

Browse files

- Add GPU selection and token calc method (bddf37c66b35b663b56fce480f564fe30959e5c8)

Files changed (3) hide show
  1. README.md +9 -23
  2. app.py +101 -110
  3. requirements.txt +2 -2
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: LLM Cost & Capacity Calculator
3
  emoji: 🧮
4
  colorFrom: blue
5
  colorTo: green
@@ -9,28 +9,14 @@ app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- # LLM Cost & Capacity Calculator (Gradio)
13
 
14
- A simple calculator to compare **Managed API (per-token)** vs **Self-hosted GPU (per GPU-hour)** costs.
15
 
16
- ## Use on Hugging Face Spaces
17
- 1. Create a new **Space**.
18
- 2. Choose **Gradio** as the SDK.
19
- 3. Upload `app.py` and `requirements.txt`.
20
- 4. The Space will build and launch automatically.
21
 
22
- ## What this app does
23
- - Calculates total tokens from: **patients × (intake tokens + review tokens)**.
24
- - Estimates **Managed API cost** given a price range ($/1K tokens).
25
- - Estimates **Self-hosted GPU cost** via two views:
26
- - **Busy-time only** (idealized, pay only when generating)
27
- - **Fixed daily uptime** (more realistic: GPU is on N hours/day regardless of load)
28
- - Shows **per-patient costs** for easy comparison.
29
-
30
- ## Suggested defaults
31
- - Intake tokens per patient: 2,000
32
- - Review tokens per patient: 5,000
33
- - API price range: $0.002–$0.01 per 1K tokens
34
- - GPU throughput: 200 tokens/sec (start conservative; tune after load tests)
35
- - GPU price: $2.50/hr (A100-80GB ballpark; adjust to your provider)
36
- - Utilization: 0.60 (60%)
 
1
  ---
2
+ title: LLM Cost, Capacity & Latency Estimator
3
  emoji: 🧮
4
  colorFrom: blue
5
  colorTo: green
 
9
  pinned: false
10
  ---
11
 
12
+ # LLM Cost, Capacity & Latency Estimator
13
 
14
+ Compare API vs GPU costs and estimate latency.
15
 
16
+ ## Utilization
17
+ Fraction of time GPU is busy while powered on (0–1).
 
 
 
18
 
19
+ ## How to deploy
20
+ 1. Create an HF Space using the Gradio SDK.
21
+ 2. Upload `app.py`, `requirements.txt`, `README.md`.
22
+ 3. Launch.
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -1,23 +1,22 @@
1
 
2
  import gradio as gr
3
- import math
4
  import pandas as pd
5
 
6
- DESCRIPTION = """
7
- # LLM Cost & Capacity Calculator
8
-
9
- Use this tool to estimate costs for **Managed API (per-token)** vs **Self-hosted GPU (per GPU-hour)** for your workload.
10
-
11
- **How it works**:
12
- - **Tokens per patient** = Intake tokens + Clinician review tokens
13
- - **Managed API cost** = Total tokens / 1000 × Price per 1K tokens
14
- - **Self-hosted GPU (two views)**:
15
- 1) **Busy-time only**: assumes you pay only for time spent generating (idealized)
16
- 2) **Fixed daily uptime**: assumes GPUs are up N hours/day regardless of traffic (more realistic)
17
-
18
- Notes:
19
- One token is roughly ¾ of an English word.
20
- - Throughput depends on model, quantization, batch size, and hardware. Start with a conservative guess and refine with load tests.
21
  """
22
 
23
  def calc_tokens_per_patient(intake_tokens, review_tokens):
@@ -27,71 +26,44 @@ def managed_api_cost(total_tokens, price_per_1k_tokens):
27
  return (total_tokens / 1000.0) * price_per_1k_tokens
28
 
29
  def gpu_busy_hours(total_tokens, toks_per_sec):
30
- if toks_per_sec <= 0:
31
- return 0.0
32
- return total_tokens / toks_per_sec / 3600.0
33
 
34
  def gpu_costs(total_tokens, toks_per_sec, gpu_price_per_hr, daily_uptime_hours, days_per_month, utilization):
35
- # Busy-time (idealized) cost
36
  busy_hours = gpu_busy_hours(total_tokens, toks_per_sec)
37
- # Adjust busy hours for utilization (how well we keep the GPU busy while it's up)
38
- # If utilization < 1, we need more clock hours to complete the same work.
39
  effective_busy_hours = busy_hours / max(utilization, 1e-6)
40
  busy_time_cost = effective_busy_hours * gpu_price_per_hr
41
-
42
- # Fixed daily uptime model
43
  billed_hours = daily_uptime_hours * days_per_month
44
  fixed_uptime_cost = billed_hours * gpu_price_per_hr
45
-
46
  return busy_hours, effective_busy_hours, busy_time_cost, billed_hours, fixed_uptime_cost
47
 
48
  def per_patient(total_cost, num_patients):
49
- return (total_cost / num_patients) if num_patients > 0 else 0.0
50
-
51
- def format_currency(x):
52
- return f"${x:,.2f}"
53
-
54
- def calculate(
55
- num_patients,
56
- intake_tokens,
57
- review_tokens,
58
- price_per_1k_low,
59
- price_per_1k_high,
60
- toks_per_sec,
61
- gpu_price_per_hr,
62
- daily_uptime_hours,
63
- days_per_month,
64
- utilization
65
- ):
66
  tokens_per_patient = calc_tokens_per_patient(intake_tokens, review_tokens)
67
  total_tokens = tokens_per_patient * num_patients
68
-
69
- # Managed API costs (range)
70
  api_cost_low = managed_api_cost(total_tokens, price_per_1k_low)
71
  api_cost_high = managed_api_cost(total_tokens, price_per_1k_high)
72
-
73
- # GPU costs
74
  busy_hours, effective_busy_hours, busy_time_cost, billed_hours, fixed_uptime_cost = gpu_costs(
75
- total_tokens, toks_per_sec, gpu_price_per_hr, daily_uptime_hours, days_per_month, utilization
76
- )
77
-
78
- # Per-patient costs
79
  api_pp_low = per_patient(api_cost_low, num_patients)
80
  api_pp_high = per_patient(api_cost_high, num_patients)
81
  gpu_pp_busy = per_patient(busy_time_cost, num_patients)
82
  gpu_pp_fixed = per_patient(fixed_uptime_cost, num_patients)
83
 
84
- # Summary table
85
  df = pd.DataFrame({
86
  "Metric": [
87
  "Patients",
88
- "Tokens per Patient (intake+review)",
89
  "Total Tokens",
90
  "API Cost (low)",
91
  "API Cost (high)",
92
  "GPU Busy Hours (idealized)",
93
- "GPU Clock Hours Needed @ Utilization",
94
- "GPU Cost (busy-time model)",
95
  "GPU Billed Hours (fixed uptime)",
96
  "GPU Cost (fixed uptime)",
97
  "Per-Patient API Cost (low)",
@@ -103,67 +75,86 @@ def calculate(
103
  num_patients,
104
  tokens_per_patient,
105
  f"{total_tokens:,}",
106
- format_currency(api_cost_low),
107
- format_currency(api_cost_high),
108
  f"{busy_hours:,.2f}",
109
  f"{effective_busy_hours:,.2f}",
110
- format_currency(busy_time_cost),
111
  f"{billed_hours:,.2f}",
112
- format_currency(fixed_uptime_cost),
113
- format_currency(api_pp_low),
114
- format_currency(api_pp_high),
115
- format_currency(gpu_pp_busy),
116
- format_currency(gpu_pp_fixed),
117
  ]
118
  })
119
-
120
- return df, f"Total tokens: {total_tokens:,}"
121
-
122
- with gr.Blocks(title="LLM Cost & Capacity Calculator") as demo:
123
- gr.Markdown(DESCRIPTION)
124
-
125
- with gr.Row():
126
- with gr.Column():
127
- gr.Markdown("### Workload")
128
- num_patients = gr.Number(value=500, label="Number of patients")
129
- intake_tokens = gr.Number(value=2000, label="Intake tokens per patient")
130
- review_tokens = gr.Number(value=5000, label="Clinician review tokens per patient")
131
-
132
- with gr.Column():
133
- gr.Markdown("### Managed API Pricing (per 1K tokens)")
134
- price_per_1k_low = gr.Number(value=0.002, label="Price (low) $ / 1K tokens")
135
- price_per_1k_high = gr.Number(value=0.010, label="Price (high) $ / 1K tokens")
136
-
137
- with gr.Row():
138
- with gr.Column():
139
- gr.Markdown("### Self-Hosted GPU Assumptions")
140
- toks_per_sec = gr.Number(value=200, label="Throughput (tokens/sec per GPU)")
141
- gpu_price_per_hr = gr.Number(value=2.50, label="GPU price ($/hour)")
142
- with gr.Column():
143
- daily_uptime_hours = gr.Number(value=8, label="Billed uptime (hours/day)")
144
- days_per_month = gr.Number(value=30, label="Days per month")
145
- utilization = gr.Slider(value=0.6, minimum=0.1, maximum=1.0, step=0.05, label="Utilization (0–1)")
146
-
147
- calc_btn = gr.Button("Calculate")
148
- out_table = gr.Dataframe(label="Summary", interactive=False)
149
- out_text = gr.Textbox(label="Notes", interactive=False)
150
-
151
- calc_btn.click(
152
- calculate,
153
- inputs=[
154
- num_patients,
155
- intake_tokens,
156
- review_tokens,
157
- price_per_1k_low,
158
- price_per_1k_high,
159
- toks_per_sec,
160
- gpu_price_per_hr,
161
- daily_uptime_hours,
162
- days_per_month,
163
- utilization
164
- ],
165
- outputs=[out_table, out_text]
166
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
 
168
  if __name__ == "__main__":
169
  demo.launch()
 
1
 
2
  import gradio as gr
 
3
  import pandas as pd
4
 
5
# Title shown in the browser tab and used for the Blocks window.
APP_TITLE = "LLM Cost, Capacity & Latency Estimator"

# Markdown rendered at the top of the app.
# NOTE(review): the "→" arrows in the utilization bullets were dropped by an
# earlier encoding pass, leaving the sentences ungrammatical; restored here.
INTRO = """
# LLM Cost, Capacity & Latency Estimator

Estimate and compare **Managed API costs (per-token)** vs **Self-hosted GPU costs (per GPU-hour)**,
plus a **Latency Estimator** for prompt+generation timing.

**Utilization (0–1)** = fraction of time GPU is busy while powered on.
- Example: 0.6 → GPU is busy 60% of the time and idle 40%.
- Used to convert busy-hours → clock-hours: busy_hours / utilization.

**Latency Estimator**:
- Prefill (read prompt) is usually faster than decode (generate output).
- Apply a queue/burst factor to estimate p95 latency under load.
"""
21
 
22
  def calc_tokens_per_patient(intake_tokens, review_tokens):
 
26
  return (total_tokens / 1000.0) * price_per_1k_tokens
27
 
28
def gpu_busy_hours(total_tokens, toks_per_sec):
    """Return the GPU hours needed to generate *total_tokens* at *toks_per_sec*.

    A tiny floor on the throughput keeps the division well-defined when the
    caller passes zero (or a non-positive value).
    """
    SECONDS_PER_HOUR = 3600.0
    safe_tps = toks_per_sec if toks_per_sec > 1e-9 else 1e-9
    return (total_tokens / safe_tps) / SECONDS_PER_HOUR
 
 
30
 
31
def gpu_costs(total_tokens, toks_per_sec, gpu_price_per_hr, daily_uptime_hours, days_per_month, utilization):
    """Cost the workload under two GPU billing models.

    Returns a 5-tuple:
    (busy_hours, effective_busy_hours, busy_time_cost, billed_hours, fixed_uptime_cost).
    """
    # Busy-time model: pay only while generating; a utilization < 1 means the
    # same work needs more clock hours, so scale busy hours up by 1/utilization.
    busy = gpu_busy_hours(total_tokens, toks_per_sec)
    util = utilization if utilization > 1e-6 else 1e-6
    clock_hours = busy / util
    busy_cost = clock_hours * gpu_price_per_hr
    # Fixed-uptime model: the GPU is billed N hours/day regardless of load.
    uptime_hours = daily_uptime_hours * days_per_month
    uptime_cost = uptime_hours * gpu_price_per_hr
    return busy, clock_hours, busy_cost, uptime_hours, uptime_cost
38
 
39
def per_patient(total_cost, num_patients):
    """Average *total_cost* over *num_patients*; 0.0 when there are no patients."""
    if not num_patients:
        return 0.0
    return total_cost / num_patients
41
+
42
+ def calculate_costs(num_patients, intake_tokens, review_tokens,
43
+ price_per_1k_low, price_per_1k_high,
44
+ toks_per_sec, gpu_price_per_hr,
45
+ daily_uptime_hours, days_per_month, utilization):
 
 
 
 
 
 
 
 
 
 
 
46
  tokens_per_patient = calc_tokens_per_patient(intake_tokens, review_tokens)
47
  total_tokens = tokens_per_patient * num_patients
 
 
48
  api_cost_low = managed_api_cost(total_tokens, price_per_1k_low)
49
  api_cost_high = managed_api_cost(total_tokens, price_per_1k_high)
 
 
50
  busy_hours, effective_busy_hours, busy_time_cost, billed_hours, fixed_uptime_cost = gpu_costs(
51
+ total_tokens, toks_per_sec, gpu_price_per_hr, daily_uptime_hours, days_per_month, utilization)
 
 
 
52
  api_pp_low = per_patient(api_cost_low, num_patients)
53
  api_pp_high = per_patient(api_cost_high, num_patients)
54
  gpu_pp_busy = per_patient(busy_time_cost, num_patients)
55
  gpu_pp_fixed = per_patient(fixed_uptime_cost, num_patients)
56
 
 
57
  df = pd.DataFrame({
58
  "Metric": [
59
  "Patients",
60
+ "Tokens per Patient",
61
  "Total Tokens",
62
  "API Cost (low)",
63
  "API Cost (high)",
64
  "GPU Busy Hours (idealized)",
65
+ "GPU Clock Hours @ Utilization",
66
+ "GPU Cost (busy-time)",
67
  "GPU Billed Hours (fixed uptime)",
68
  "GPU Cost (fixed uptime)",
69
  "Per-Patient API Cost (low)",
 
75
  num_patients,
76
  tokens_per_patient,
77
  f"{total_tokens:,}",
78
+ f"${api_cost_low:,.2f}",
79
+ f"${api_cost_high:,.2f}",
80
  f"{busy_hours:,.2f}",
81
  f"{effective_busy_hours:,.2f}",
82
+ f"${busy_time_cost:,.2f}",
83
  f"{billed_hours:,.2f}",
84
+ f"${fixed_uptime_cost:,.2f}",
85
+ f"${api_pp_low:,.2f}",
86
+ f"${api_pp_high:,.2f}",
87
+ f"${gpu_pp_busy:,.2f}",
88
+ f"${gpu_pp_fixed:,.2f}",
89
  ]
90
  })
91
+ notes = f"Total tokens: {total_tokens:,}. Utilization = fraction of time GPU is busy while powered on."
92
+ return df, notes
93
+
94
def latency_estimator(prompt_tokens, output_tokens,
                      prefill_tps, decode_tps, overhead_ms, queue_factor):
    """Break one request's latency into prefill + decode + fixed overhead.

    p95 is approximated by scaling the base latency with a queue/burst factor
    (clamped to at least 1.0). Returns (DataFrame breakdown, summary string).
    """
    # Falsy speed (0/None) means "skip this phase" rather than divide by zero.
    t_prefill = (prompt_tokens / max(prefill_tps, 1e-9)) if prefill_tps else 0.0
    t_decode = (output_tokens / max(decode_tps, 1e-9)) if decode_tps else 0.0
    t_overhead = overhead_ms / 1000.0
    t_base = t_prefill + t_decode + t_overhead
    t_p95 = t_base * max(queue_factor, 1.0)
    rows = [
        ("Prefill time (s)", t_prefill),
        ("Decode time (s)", t_decode),
        ("Overhead (s)", t_overhead),
        ("Base total (s)", t_base),
        ("Estimated p95 (s)", t_p95),
    ]
    df = pd.DataFrame([[label, f"{seconds:,.3f}"] for label, seconds in rows],
                      columns=["Component", "Seconds"])
    msg = f"Base: {t_base:,.3f}s, p95 (×{queue_factor}): {t_p95:,.3f}s"
    return df, msg
109
+
110
# Gradio UI: two tabs sharing one Blocks app.
#   Tab 1 "Cost & Capacity" -> calculate_costs (API vs GPU cost comparison)
#   Tab 2 "Latency Estimator" -> latency_estimator (prefill/decode timing)
with gr.Blocks(title=APP_TITLE) as demo:
    gr.Markdown(INTRO)
    with gr.Tabs():
        with gr.Tab("Cost & Capacity"):
            # Workload size (left) and managed-API pricing range (right).
            with gr.Row():
                with gr.Column():
                    num_patients = gr.Number(value=500, label="Number of patients")
                    intake_tokens = gr.Number(value=2000, label="Intake tokens")
                    review_tokens = gr.Number(value=5000, label="Review tokens")
                with gr.Column():
                    price_per_1k_low = gr.Number(value=0.002, label="API Price Low ($/1K tok)")
                    price_per_1k_high = gr.Number(value=0.01, label="API Price High ($/1K tok)")
            # Self-hosted GPU assumptions: throughput/price (left), billing
            # schedule and utilization (right).
            with gr.Row():
                with gr.Column():
                    toks_per_sec = gr.Number(value=200, label="GPU Throughput (tok/s)")
                    gpu_price_per_hr = gr.Number(value=2.50, label="GPU Price ($/hr)")
                with gr.Column():
                    daily_uptime_hours = gr.Number(value=8, label="Billed uptime (hr/day)")
                    days_per_month = gr.Number(value=30, label="Days/month")
                    utilization = gr.Slider(value=0.6, minimum=0.1, maximum=1.0, step=0.05,
                                            label="Utilization (0–1) = busy time fraction")
            calc_btn = gr.Button("Calculate Costs")
            out_table = gr.Dataframe(label="Summary")
            out_notes = gr.Textbox(label="Notes")
            # Input order must match calculate_costs' parameter order.
            calc_btn.click(calculate_costs,
                           [num_patients, intake_tokens, review_tokens,
                            price_per_1k_low, price_per_1k_high,
                            toks_per_sec, gpu_price_per_hr,
                            daily_uptime_hours, days_per_month, utilization],
                           [out_table, out_notes])

        with gr.Tab("Latency Estimator"):
            # Request shape (left) and per-phase throughput assumptions (right).
            with gr.Row():
                with gr.Column():
                    prompt_tokens = gr.Number(value=8000, label="Prompt tokens")
                    output_tokens = gr.Number(value=300, label="Output tokens")
                with gr.Column():
                    prefill_tps = gr.Number(value=1000, label="Prefill speed (tok/s)")
                    decode_tps = gr.Number(value=400, label="Decode speed (tok/s)")
            with gr.Row():
                overhead_ms = gr.Number(value=200, label="Overhead (ms)")
                queue_factor = gr.Slider(value=1.3, minimum=1.0, maximum=3.0, step=0.05, label="Queue/Burst Factor ×")
            lat_btn = gr.Button("Estimate Latency")
            lat_table = gr.Dataframe(label="Latency Breakdown")
            lat_notes = gr.Textbox(label="Notes")
            # Input order must match latency_estimator's parameter order.
            lat_btn.click(latency_estimator,
                          [prompt_tokens, output_tokens, prefill_tps, decode_tps, overhead_ms, queue_factor],
                          [lat_table, lat_notes])

# Launch the app when run as a script (HF Spaces imports and launches it too).
if __name__ == "__main__":
    demo.launch()
requirements.txt CHANGED
@@ -1,2 +1,2 @@
1
- gradio>=4.18.0
2
- pandas>=2.2.0
 
1
+ gradio>=4.31.0
2
+ pandas>=2.2.0