Add GPU selection and token calc method (#1)
Browse files- Add GPU selection and token calc method (bddf37c66b35b663b56fce480f564fe30959e5c8)
- README.md +9 -23
- app.py +101 -110
- requirements.txt +2 -2
README.md
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
---
|
| 2 |
-
title: LLM Cost &
|
| 3 |
emoji: 🧮
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: green
|
|
@@ -9,28 +9,14 @@ app_file: app.py
|
|
| 9 |
pinned: false
|
| 10 |
---
|
| 11 |
|
| 12 |
-
# LLM Cost
|
| 13 |
|
| 14 |
-
|
| 15 |
|
| 16 |
-
##
|
| 17 |
-
|
| 18 |
-
2. Choose **Gradio** as the SDK.
|
| 19 |
-
3. Upload `app.py` and `requirements.txt`.
|
| 20 |
-
4. The Space will build and launch automatically.
|
| 21 |
|
| 22 |
-
##
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
- **Busy-time only** (idealized, pay only when generating)
|
| 27 |
-
- **Fixed daily uptime** (more realistic: GPU is on N hours/day regardless of load)
|
| 28 |
-
- Shows **per-patient costs** for easy comparison.
|
| 29 |
-
|
| 30 |
-
## Suggested defaults
|
| 31 |
-
- Intake tokens per patient: 2,000
|
| 32 |
-
- Review tokens per patient: 5,000
|
| 33 |
-
- API price range: $0.002–$0.01 per 1K tokens
|
| 34 |
-
- GPU throughput: 200 tokens/sec (start conservative; tune after load tests)
|
| 35 |
-
- GPU price: $2.50/hr (A100-80GB ballpark; adjust to your provider)
|
| 36 |
-
- Utilization: 0.60 (60%)
|
|
|
|
| 1 |
---
|
| 2 |
+
title: LLM Cost, Capacity & Latency Estimator
|
| 3 |
emoji: 🧮
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: green
|
|
|
|
| 9 |
pinned: false
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# LLM Cost, Capacity & Latency Estimator
|
| 13 |
|
| 14 |
+
Compare API vs GPU costs and estimate latency.
|
| 15 |
|
| 16 |
+
## Utilization
|
| 17 |
+
Fraction of time GPU is busy while powered on (0–1).
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
+
## How to deploy
|
| 20 |
+
1. Create HF Space → Gradio.
|
| 21 |
+
2. Upload `app.py`, `requirements.txt`, `README.md`.
|
| 22 |
+
3. Launch.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
CHANGED
|
@@ -1,23 +1,22 @@
|
|
| 1 |
|
| 2 |
import gradio as gr
|
| 3 |
-
import math
|
| 4 |
import pandas as pd
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
**
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
-
|
| 20 |
-
-
|
| 21 |
"""
|
| 22 |
|
| 23 |
def calc_tokens_per_patient(intake_tokens, review_tokens):
|
|
@@ -27,71 +26,44 @@ def managed_api_cost(total_tokens, price_per_1k_tokens):
|
|
| 27 |
return (total_tokens / 1000.0) * price_per_1k_tokens
|
| 28 |
|
| 29 |
def gpu_busy_hours(total_tokens, toks_per_sec):
|
| 30 |
-
|
| 31 |
-
return 0.0
|
| 32 |
-
return total_tokens / toks_per_sec / 3600.0
|
| 33 |
|
| 34 |
def gpu_costs(total_tokens, toks_per_sec, gpu_price_per_hr, daily_uptime_hours, days_per_month, utilization):
|
| 35 |
-
# Busy-time (idealized) cost
|
| 36 |
busy_hours = gpu_busy_hours(total_tokens, toks_per_sec)
|
| 37 |
-
# Adjust busy hours for utilization (how well we keep the GPU busy while it's up)
|
| 38 |
-
# If utilization < 1, we need more clock hours to complete the same work.
|
| 39 |
effective_busy_hours = busy_hours / max(utilization, 1e-6)
|
| 40 |
busy_time_cost = effective_busy_hours * gpu_price_per_hr
|
| 41 |
-
|
| 42 |
-
# Fixed daily uptime model
|
| 43 |
billed_hours = daily_uptime_hours * days_per_month
|
| 44 |
fixed_uptime_cost = billed_hours * gpu_price_per_hr
|
| 45 |
-
|
| 46 |
return busy_hours, effective_busy_hours, busy_time_cost, billed_hours, fixed_uptime_cost
|
| 47 |
|
| 48 |
def per_patient(total_cost, num_patients):
|
| 49 |
-
return (total_cost / num_patients) if num_patients
|
| 50 |
-
|
| 51 |
-
def
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
num_patients,
|
| 56 |
-
intake_tokens,
|
| 57 |
-
review_tokens,
|
| 58 |
-
price_per_1k_low,
|
| 59 |
-
price_per_1k_high,
|
| 60 |
-
toks_per_sec,
|
| 61 |
-
gpu_price_per_hr,
|
| 62 |
-
daily_uptime_hours,
|
| 63 |
-
days_per_month,
|
| 64 |
-
utilization
|
| 65 |
-
):
|
| 66 |
tokens_per_patient = calc_tokens_per_patient(intake_tokens, review_tokens)
|
| 67 |
total_tokens = tokens_per_patient * num_patients
|
| 68 |
-
|
| 69 |
-
# Managed API costs (range)
|
| 70 |
api_cost_low = managed_api_cost(total_tokens, price_per_1k_low)
|
| 71 |
api_cost_high = managed_api_cost(total_tokens, price_per_1k_high)
|
| 72 |
-
|
| 73 |
-
# GPU costs
|
| 74 |
busy_hours, effective_busy_hours, busy_time_cost, billed_hours, fixed_uptime_cost = gpu_costs(
|
| 75 |
-
total_tokens, toks_per_sec, gpu_price_per_hr, daily_uptime_hours, days_per_month, utilization
|
| 76 |
-
)
|
| 77 |
-
|
| 78 |
-
# Per-patient costs
|
| 79 |
api_pp_low = per_patient(api_cost_low, num_patients)
|
| 80 |
api_pp_high = per_patient(api_cost_high, num_patients)
|
| 81 |
gpu_pp_busy = per_patient(busy_time_cost, num_patients)
|
| 82 |
gpu_pp_fixed = per_patient(fixed_uptime_cost, num_patients)
|
| 83 |
|
| 84 |
-
# Summary table
|
| 85 |
df = pd.DataFrame({
|
| 86 |
"Metric": [
|
| 87 |
"Patients",
|
| 88 |
-
"Tokens per Patient
|
| 89 |
"Total Tokens",
|
| 90 |
"API Cost (low)",
|
| 91 |
"API Cost (high)",
|
| 92 |
"GPU Busy Hours (idealized)",
|
| 93 |
-
"GPU Clock Hours
|
| 94 |
-
"GPU Cost (busy-time
|
| 95 |
"GPU Billed Hours (fixed uptime)",
|
| 96 |
"GPU Cost (fixed uptime)",
|
| 97 |
"Per-Patient API Cost (low)",
|
|
@@ -103,67 +75,86 @@ def calculate(
|
|
| 103 |
num_patients,
|
| 104 |
tokens_per_patient,
|
| 105 |
f"{total_tokens:,}",
|
| 106 |
-
|
| 107 |
-
|
| 108 |
f"{busy_hours:,.2f}",
|
| 109 |
f"{effective_busy_hours:,.2f}",
|
| 110 |
-
|
| 111 |
f"{billed_hours:,.2f}",
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
]
|
| 118 |
})
|
| 119 |
-
|
| 120 |
-
return df,
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
|
| 168 |
if __name__ == "__main__":
|
| 169 |
demo.launch()
|
|
|
|
| 1 |
|
| 2 |
import gradio as gr
|
|
|
|
| 3 |
import pandas as pd
|
| 4 |
|
| 5 |
+
# Title shown in the browser tab and used for the gr.Blocks app.
APP_TITLE = "LLM Cost, Capacity & Latency Estimator"

# Markdown rendered at the top of the UI: explains the two cost models
# (managed API per-token vs self-hosted GPU per-hour) and the latency tab.
INTRO = """
# LLM Cost, Capacity & Latency Estimator

Estimate and compare **Managed API costs (per-token)** vs **Self-hosted GPU costs (per GPU-hour)**,
plus a **Latency Estimator** for prompt+generation timing.

**Utilization (0–1)** = fraction of time GPU is busy while powered on.
- Example: 0.6 → GPU is busy 60% of the time and idle 40%.
- Used to convert busy-hours → clock-hours: busy_hours / utilization.

**Latency Estimator**:
- Prefill (read prompt) is usually faster than decode (generate output).
- Apply a queue/burst factor to estimate p95 latency under load.
"""
|
| 21 |
|
| 22 |
def calc_tokens_per_patient(intake_tokens, review_tokens):
|
|
|
|
| 26 |
return (total_tokens / 1000.0) * price_per_1k_tokens
|
| 27 |
|
| 28 |
def gpu_busy_hours(total_tokens, toks_per_sec):
    """Return the hours of pure busy time to generate ``total_tokens``.

    The throughput is floored at a tiny positive value so a zero or
    invalid ``toks_per_sec`` cannot cause a division by zero.
    """
    seconds = total_tokens / max(toks_per_sec, 1e-9)
    return seconds / 3600.0
|
|
|
|
|
|
|
| 30 |
|
| 31 |
def gpu_costs(total_tokens, toks_per_sec, gpu_price_per_hr, daily_uptime_hours, days_per_month, utilization):
    """Return GPU cost figures under two billing models.

    Returns a 5-tuple:
        (busy_hours, effective_busy_hours, busy_time_cost,
         billed_hours, fixed_uptime_cost)

    ``busy_hours`` is the idealized generation time; ``effective_busy_hours``
    scales it up by utilization (busy fraction of powered-on time), since a
    partially idle GPU needs more clock hours for the same work. The fixed
    model simply bills ``daily_uptime_hours`` every day of the month.
    """
    # Idealized busy time: tokens / throughput (throughput floored to
    # avoid division by zero) — same formula as gpu_busy_hours, inlined.
    busy_hours = total_tokens / max(toks_per_sec, 1e-9) / 3600.0

    # Clock hours and pay-per-busy-time cost; utilization floored so a
    # zero slider value cannot divide by zero.
    effective_busy_hours = busy_hours / max(utilization, 1e-6)
    busy_time_cost = effective_busy_hours * gpu_price_per_hr

    # Fixed-uptime model: the GPU is billed whenever it is powered on.
    billed_hours = daily_uptime_hours * days_per_month
    fixed_uptime_cost = billed_hours * gpu_price_per_hr

    return busy_hours, effective_busy_hours, busy_time_cost, billed_hours, fixed_uptime_cost
|
| 38 |
|
| 39 |
def per_patient(total_cost, num_patients):
    """Average cost per patient; 0.0 when there are no patients (avoids /0)."""
    if not num_patients:
        return 0.0
    return total_cost / num_patients
|
| 41 |
+
|
| 42 |
+
def calculate_costs(num_patients, intake_tokens, review_tokens,
|
| 43 |
+
price_per_1k_low, price_per_1k_high,
|
| 44 |
+
toks_per_sec, gpu_price_per_hr,
|
| 45 |
+
daily_uptime_hours, days_per_month, utilization):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
tokens_per_patient = calc_tokens_per_patient(intake_tokens, review_tokens)
|
| 47 |
total_tokens = tokens_per_patient * num_patients
|
|
|
|
|
|
|
| 48 |
api_cost_low = managed_api_cost(total_tokens, price_per_1k_low)
|
| 49 |
api_cost_high = managed_api_cost(total_tokens, price_per_1k_high)
|
|
|
|
|
|
|
| 50 |
busy_hours, effective_busy_hours, busy_time_cost, billed_hours, fixed_uptime_cost = gpu_costs(
|
| 51 |
+
total_tokens, toks_per_sec, gpu_price_per_hr, daily_uptime_hours, days_per_month, utilization)
|
|
|
|
|
|
|
|
|
|
| 52 |
api_pp_low = per_patient(api_cost_low, num_patients)
|
| 53 |
api_pp_high = per_patient(api_cost_high, num_patients)
|
| 54 |
gpu_pp_busy = per_patient(busy_time_cost, num_patients)
|
| 55 |
gpu_pp_fixed = per_patient(fixed_uptime_cost, num_patients)
|
| 56 |
|
|
|
|
| 57 |
df = pd.DataFrame({
|
| 58 |
"Metric": [
|
| 59 |
"Patients",
|
| 60 |
+
"Tokens per Patient",
|
| 61 |
"Total Tokens",
|
| 62 |
"API Cost (low)",
|
| 63 |
"API Cost (high)",
|
| 64 |
"GPU Busy Hours (idealized)",
|
| 65 |
+
"GPU Clock Hours @ Utilization",
|
| 66 |
+
"GPU Cost (busy-time)",
|
| 67 |
"GPU Billed Hours (fixed uptime)",
|
| 68 |
"GPU Cost (fixed uptime)",
|
| 69 |
"Per-Patient API Cost (low)",
|
|
|
|
| 75 |
num_patients,
|
| 76 |
tokens_per_patient,
|
| 77 |
f"{total_tokens:,}",
|
| 78 |
+
f"${api_cost_low:,.2f}",
|
| 79 |
+
f"${api_cost_high:,.2f}",
|
| 80 |
f"{busy_hours:,.2f}",
|
| 81 |
f"{effective_busy_hours:,.2f}",
|
| 82 |
+
f"${busy_time_cost:,.2f}",
|
| 83 |
f"{billed_hours:,.2f}",
|
| 84 |
+
f"${fixed_uptime_cost:,.2f}",
|
| 85 |
+
f"${api_pp_low:,.2f}",
|
| 86 |
+
f"${api_pp_high:,.2f}",
|
| 87 |
+
f"${gpu_pp_busy:,.2f}",
|
| 88 |
+
f"${gpu_pp_fixed:,.2f}",
|
| 89 |
]
|
| 90 |
})
|
| 91 |
+
notes = f"Total tokens: {total_tokens:,}. Utilization = fraction of time GPU is busy while powered on."
|
| 92 |
+
return df, notes
|
| 93 |
+
|
| 94 |
+
def latency_estimator(prompt_tokens, output_tokens,
                      prefill_tps, decode_tps, overhead_ms, queue_factor):
    """Estimate single-request latency from prefill/decode throughputs.

    Returns (DataFrame with a per-component breakdown, summary string).
    A falsy (0/None) speed skips that phase entirely; otherwise the
    denominator is floored at 1e-9 so near-zero speeds cannot blow up.
    """
    prefill_sec = 0.0
    if prefill_tps:
        prefill_sec = prompt_tokens / max(prefill_tps, 1e-9)
    decode_sec = 0.0
    if decode_tps:
        decode_sec = output_tokens / max(decode_tps, 1e-9)

    base_sec = prefill_sec + decode_sec + (overhead_ms / 1000.0)
    # A queue factor below 1 is treated as "no queueing" (factor of 1).
    p95_sec = base_sec * max(queue_factor, 1.0)

    rows = [
        ["Prefill time (s)", f"{prefill_sec:,.3f}"],
        ["Decode time (s)", f"{decode_sec:,.3f}"],
        ["Overhead (s)", f"{overhead_ms/1000.0:,.3f}"],
        ["Base total (s)", f"{base_sec:,.3f}"],
        ["Estimated p95 (s)", f"{p95_sec:,.3f}"],
    ]
    df = pd.DataFrame(rows, columns=["Component", "Seconds"])
    msg = f"Base: {base_sec:,.3f}s, p95 (×{queue_factor}): {p95_sec:,.3f}s"
    return df, msg
|
| 109 |
+
|
| 110 |
+
# Build the two-tab Gradio UI: cost/capacity comparison and latency estimator.
# NOTE(review): the exact Row/Column nesting below is reconstructed from a
# diff view with no reliable indentation — confirm layout against the live app.
with gr.Blocks(title=APP_TITLE) as demo:
    gr.Markdown(INTRO)
    with gr.Tabs():
        with gr.Tab("Cost & Capacity"):
            with gr.Row():
                with gr.Column():
                    num_patients = gr.Number(value=500, label="Number of patients")
                    intake_tokens = gr.Number(value=2000, label="Intake tokens")
                    review_tokens = gr.Number(value=5000, label="Review tokens")
                with gr.Column():
                    price_per_1k_low = gr.Number(value=0.002, label="API Price Low ($/1K tok)")
                    price_per_1k_high = gr.Number(value=0.01, label="API Price High ($/1K tok)")
            with gr.Row():
                with gr.Column():
                    toks_per_sec = gr.Number(value=200, label="GPU Throughput (tok/s)")
                    gpu_price_per_hr = gr.Number(value=2.50, label="GPU Price ($/hr)")
                with gr.Column():
                    daily_uptime_hours = gr.Number(value=8, label="Billed uptime (hr/day)")
                    days_per_month = gr.Number(value=30, label="Days/month")
            utilization = gr.Slider(value=0.6, minimum=0.1, maximum=1.0, step=0.05,
                                    label="Utilization (0–1) = busy time fraction")
            calc_btn = gr.Button("Calculate Costs")
            out_table = gr.Dataframe(label="Summary")
            out_notes = gr.Textbox(label="Notes")
            # Input order must match calculate_costs' parameter order.
            cost_inputs = [num_patients, intake_tokens, review_tokens,
                           price_per_1k_low, price_per_1k_high,
                           toks_per_sec, gpu_price_per_hr,
                           daily_uptime_hours, days_per_month, utilization]
            calc_btn.click(calculate_costs, cost_inputs, [out_table, out_notes])

        with gr.Tab("Latency Estimator"):
            with gr.Row():
                with gr.Column():
                    prompt_tokens = gr.Number(value=8000, label="Prompt tokens")
                    output_tokens = gr.Number(value=300, label="Output tokens")
                with gr.Column():
                    prefill_tps = gr.Number(value=1000, label="Prefill speed (tok/s)")
                    decode_tps = gr.Number(value=400, label="Decode speed (tok/s)")
            with gr.Row():
                overhead_ms = gr.Number(value=200, label="Overhead (ms)")
                queue_factor = gr.Slider(value=1.3, minimum=1.0, maximum=3.0, step=0.05, label="Queue/Burst Factor ×")
            lat_btn = gr.Button("Estimate Latency")
            lat_table = gr.Dataframe(label="Latency Breakdown")
            lat_notes = gr.Textbox(label="Notes")
            lat_inputs = [prompt_tokens, output_tokens, prefill_tps, decode_tps, overhead_ms, queue_factor]
            lat_btn.click(latency_estimator, lat_inputs, [lat_table, lat_notes])

if __name__ == "__main__":
    demo.launch()
|
requirements.txt
CHANGED
|
@@ -1,2 +1,2 @@
|
|
| 1 |
-
gradio>=4.
|
| 2 |
-
pandas>=2.2.0
|
|
|
|
| 1 |
+
gradio>=4.31.0
|
| 2 |
+
pandas>=2.2.0
|