Spaces:
Runtime error
Runtime error
File size: 5,880 Bytes
"""
LLM API Cost Optimizer
Estimate model-serving cost and identify caching, batching, and routing savings.
"""
from pathlib import Path
import pandas as pd
import plotly.express as px
import streamlit as st
# Configure the page title, icon, and wide layout for the app.
st.set_page_config(page_title="LLM API Cost Optimizer", page_icon="💰", layout="wide")
def load_shared_css() -> None:
    """Inject the shared stylesheet into the page, if one can be found.

    Looks for ``shared/styles.css`` next to this file and then one directory
    up, and embeds the first match in a ``<style>`` tag. When neither
    candidate is a file, a warning is logged and the function returns without
    rendering anything, so the app still starts (unstyled) instead of
    aborting with ``StopIteration``.
    """
    current_dir = Path(__file__).resolve().parent
    candidates = (
        current_dir / "shared" / "styles.css",
        current_dir.parent / "shared" / "styles.css",
    )
    # The None default keeps next() from raising StopIteration when no
    # candidate matches.
    css_path = next((path for path in candidates if path.is_file()), None)
    if css_path is None:
        import logging

        logging.getLogger(__name__).warning(
            "Shared stylesheet not found; tried: %s",
            ", ".join(str(path) for path in candidates),
        )
        return
    st.markdown(
        f"<style>{css_path.read_text(encoding='utf-8')}</style>",
        unsafe_allow_html=True,
    )
# Apply the shared stylesheet before any page content is rendered.
load_shared_css()
# Pricing presets offered in the sidebar. Each entry maps a preset label to
# its "input" and "output" prices, which seed the "price / 1M tokens ($)"
# fields. Insertion order is the order shown in the preset selector.
MODEL_PRESETS = {
    label: {"input": input_rate, "output": output_rate}
    for label, input_rate, output_rate in (
        ("Small open model endpoint", 0.15, 0.20),
        ("Mid-size instruction model", 0.60, 0.90),
        ("Frontier API baseline", 5.00, 15.00),
        ("Custom", 1.00, 2.00),
    )
}
def monthly_cost(calls, input_tokens, output_tokens, input_price, output_price):
    """Return the total spend for ``calls`` requests.

    Args:
        calls: Number of requests (may be fractional after scaling).
        input_tokens: Average input tokens per request.
        output_tokens: Average output tokens per request.
        input_price: Price per 1,000,000 input tokens.
        output_price: Price per 1,000,000 output tokens.

    Returns:
        Input spend plus output spend, in the same currency as the prices.
    """
    tokens_per_price_unit = 1_000_000

    def _spend(tokens_per_call, unit_price):
        # Same operation order as a direct expression, so float results match.
        return calls * tokens_per_call / tokens_per_price_unit * unit_price

    return _spend(input_tokens, input_price) + _spend(output_tokens, output_price)
def optimization_report(
    calls,
    input_tokens,
    output_tokens,
    input_price,
    output_price,
    cache_hit,
    batch_gain,
    route_share,
    *,
    routed_price_ratio=0.65,
):
    """Build a per-strategy cost table for the given traffic profile.

    Args:
        calls: Number of requests in the period.
        input_tokens: Average input tokens per request.
        output_tokens: Average output tokens per request.
        input_price: Price per 1,000,000 input tokens.
        output_price: Price per 1,000,000 output tokens.
        cache_hit: Fraction of requests served from cache (0-1).
        batch_gain: Fractional cost reduction from batching (0-1).
        route_share: Fraction of requests sent to the cheaper model (0-1).
        routed_price_ratio: Price of the cheaper model relative to the
            primary model; routed traffic is billed at this fraction of
            both prices. Defaults to 0.65.

    Returns:
        A ``pandas.DataFrame`` with columns ``strategy``, ``monthly_cost``
        and ``savings`` (relative to the baseline), one row each for
        Baseline, Semantic cache, Batching/window packing, Model routing and
        Combined plan, in that order.

    Notes:
        The batching row is cumulative: it applies ``batch_gain`` on top of
        the cached cost. The routing row is standalone (no cache, no
        batching). The combined row applies all three.
    """
    baseline = monthly_cost(calls, input_tokens, output_tokens, input_price, output_price)

    # Cache hits are modelled as requests that never reach the model.
    after_cache = monthly_cost(
        calls * (1 - cache_hit), input_tokens, output_tokens, input_price, output_price
    )
    after_batch = after_cache * (1 - batch_gain)

    # Routing alone: the routed share pays the discounted price, the rest
    # pays full price.
    routed_cost = monthly_cost(
        calls * route_share,
        input_tokens,
        output_tokens,
        input_price * routed_price_ratio,
        output_price * routed_price_ratio,
    )
    unrouted_cost = monthly_cost(
        calls * (1 - route_share), input_tokens, output_tokens, input_price, output_price
    )
    routing_total = routed_cost + unrouted_cost

    # Derive the routed discount from the ratio so both numbers stay in sync.
    routed_discount = 1 - routed_price_ratio
    combined = after_batch * (1 - route_share * routed_discount)

    rows = [
        {"strategy": "Baseline", "monthly_cost": baseline, "savings": 0.0},
        {"strategy": "Semantic cache", "monthly_cost": after_cache, "savings": baseline - after_cache},
        {"strategy": "Batching/window packing", "monthly_cost": after_batch, "savings": baseline - after_batch},
        {"strategy": "Model routing", "monthly_cost": routing_total, "savings": baseline - routing_total},
        {"strategy": "Combined plan", "monthly_cost": combined, "savings": baseline - combined},
    ]
    return pd.DataFrame(rows)
# Hero banner: title, one-line pitch, and topic chips rendered as raw HTML.
hero_html = """
<div class="hero">
<div class="hf-badge">Model Economics</div>
<h1>💰 LLM API Cost Optimizer</h1>
<p>Model the cost of LLM traffic and quantify savings from semantic caching, batching, and routing.</p>
<div class="pill-row">
<span class="hf-chip">Token economics</span>
<span class="hf-chip">Caching strategy</span>
<span class="hf-chip">Deployment planning</span>
</div>
</div>
"""
st.markdown(hero_html, unsafe_allow_html=True)
# Sidebar, part 1: choose a pricing preset, then allow the prices it seeds
# to be overridden.
st.sidebar.markdown("### Model Pricing")
preset = st.sidebar.selectbox("Preset", list(MODEL_PRESETS), index=1)
default = MODEL_PRESETS[preset]
input_price = st.sidebar.number_input(
    "Input price / 1M tokens ($)",
    min_value=0.0,
    value=float(default["input"]),
    step=0.05,
)
output_price = st.sidebar.number_input(
    "Output price / 1M tokens ($)",
    min_value=0.0,
    value=float(default["output"]),
    step=0.05,
)

# Sidebar, part 2: optimization assumptions, each expressed as a fraction.
st.sidebar.markdown("### Optimization Assumptions")
cache_hit = st.sidebar.slider("Semantic cache hit rate", 0.0, 0.9, 0.28, 0.01)
batch_gain = st.sidebar.slider("Batching efficiency gain", 0.0, 0.5, 0.12, 0.01)
route_share = st.sidebar.slider("Traffic routable to smaller model", 0.0, 0.9, 0.35, 0.01)
# Two equal-width columns: traffic inputs on the left, framing text on the right.
left, right = st.columns([1, 1])

calls = left.number_input("Monthly requests", min_value=1_000, value=1_000_000, step=50_000)
input_tokens = left.number_input("Average input tokens", min_value=1, value=850, step=50)
output_tokens = left.number_input("Average output tokens", min_value=1, value=260, step=25)

right.markdown("### Professional Framing")
right.markdown("""
This is the kind of utility ML teams actually need before deploying a Space, API, or agent. It connects model choice, token volume, caching, and routing into an engineering decision instead of a vague cost guess.
""")
# Compute the strategy table once; the metrics and tabs below read from it.
report = optimization_report(
    calls,
    input_tokens,
    output_tokens,
    input_price,
    output_price,
    cache_hit,
    batch_gain,
    route_share,
)

# The first row of the report is the baseline; the last row is the combined plan.
baseline = float(report["monthly_cost"].iloc[0])
combined = float(report["monthly_cost"].iloc[-1])
savings = baseline - combined
if baseline:
    savings_pct = savings / baseline * 100
else:
    # Avoid dividing by zero when the baseline cost is zero.
    savings_pct = 0

metric_cols = st.columns(3)
headline_metrics = (
    ("Baseline monthly cost", f"${baseline:,.2f}"),
    ("Combined plan", f"${combined:,.2f}"),
    ("Estimated savings", f"{savings_pct:.1f}%"),
)
for metric_col, (metric_label, metric_value) in zip(metric_cols, headline_metrics):
    metric_col.metric(metric_label, metric_value)
tab1, tab2, tab3 = st.tabs(["Cost Curve", "Strategy Table", "Architecture Notes"])

# Tab 1: one bar per strategy, coloured by strategy.
fig = px.bar(
    report,
    x="strategy",
    y="monthly_cost",
    color="strategy",
    title="Monthly cost by optimization strategy",
    color_discrete_sequence=["#b8a9d9", "#ffad7a", "#7accff", "#e8935c", "#4b5563"],
)
tab1.plotly_chart(fig, use_container_width=True)

# Tab 2: the same table with money columns formatted as dollar strings.
# Work on a copy so the numeric report is left untouched.
display = report.copy()
for money_column in ("monthly_cost", "savings"):
    display[money_column] = display[money_column].map(lambda value: f"${value:,.2f}")
tab2.dataframe(display, use_container_width=True, hide_index=True)

# Tab 3: static notes on how the tool could be extended.
tab3.markdown("""
### HF-Native Extension Path
- Deploy a smaller model as a Hugging Face Inference Endpoint for routable traffic.
- Store prompt embeddings in a vector database for semantic caching.
- Add request logs as a private Hugging Face Dataset for offline evaluation.
- Compare quality against a frontier baseline before routing production traffic.
""")
|