"""
LLM API Cost Optimizer
Estimate model-serving cost and identify caching, batching, and routing savings.
"""
from pathlib import Path
import pandas as pd
import plotly.express as px
import streamlit as st
# Must run before any other Streamlit call; configures the browser tab and layout.
st.set_page_config(page_title="LLM API Cost Optimizer", page_icon="💰", layout="wide")
def load_shared_css() -> None:
    """Locate and inject the shared stylesheet into the page.

    Searches the app directory first, then its parent, for
    ``shared/styles.css``. Missing stylesheets are ignored so the app
    still renders unstyled instead of crashing.
    """
    current_dir = Path(__file__).resolve().parent
    candidates = [
        current_dir / "shared" / "styles.css",
        current_dir.parent / "shared" / "styles.css",
    ]
    # BUG FIX: the original emitted an empty f-string, so the CSS found above
    # was never actually injected; it also raised StopIteration when neither
    # candidate existed. Read the file and wrap it in a <style> tag instead.
    css_path = next((path for path in candidates if path.exists()), None)
    if css_path is None:
        return  # best-effort: absence of the shared stylesheet is non-fatal
    st.markdown(f"<style>{css_path.read_text()}</style>", unsafe_allow_html=True)
load_shared_css()
# Reference prices in dollars per one million tokens, used to seed the sidebar
# pricing inputs. "Custom" is a neutral starting point the user overrides.
MODEL_PRESETS = {
    "Small open model endpoint": {"input": 0.15, "output": 0.20},
    "Mid-size instruction model": {"input": 0.60, "output": 0.90},
    "Frontier API baseline": {"input": 5.00, "output": 15.00},
    "Custom": {"input": 1.00, "output": 2.00},
}
def monthly_cost(calls, input_tokens, output_tokens, input_price, output_price):
    """Return the monthly dollar cost of serving ``calls`` requests.

    Prices are dollars per one million tokens; token counts are
    per-request averages.
    """
    per_million = 1_000_000
    prompt_cost = calls * input_tokens / per_million * input_price
    completion_cost = calls * output_tokens / per_million * output_price
    return prompt_cost + completion_cost


def optimization_report(calls, input_tokens, output_tokens, input_price, output_price, cache_hit, batch_gain, route_share):
    """Build a per-strategy cost comparison table.

    Each row carries the strategy name, its estimated monthly cost, and
    the dollar savings relative to the unoptimized baseline.
    """
    baseline = monthly_cost(calls, input_tokens, output_tokens, input_price, output_price)
    # Cache hits remove whole requests from the bill.
    cached = monthly_cost(calls * (1 - cache_hit), input_tokens, output_tokens, input_price, output_price)
    # Batching is modeled as a flat efficiency discount applied after caching.
    batched = cached * (1 - batch_gain)
    # Routed traffic is assumed to cost 65% of the primary model's prices.
    routed_cost = monthly_cost(calls * route_share, input_tokens, output_tokens, input_price * 0.65, output_price * 0.65)
    unrouted_cost = monthly_cost(calls * (1 - route_share), input_tokens, output_tokens, input_price, output_price)
    routing_total = routed_cost + unrouted_cost
    # Combined plan: heuristic stacking of caching, batching, and routing.
    combined = batched * (1 - route_share * 0.35)
    labelled_costs = [
        ("Baseline", baseline),
        ("Semantic cache", cached),
        ("Batching/window packing", batched),
        ("Model routing", routing_total),
        ("Combined plan", combined),
    ]
    return pd.DataFrame(
        [{"strategy": label, "monthly_cost": cost, "savings": baseline - cost}
         for label, cost in labelled_costs]
    )
# Page header / hero copy.
# NOTE(review): this reads like HTML whose tags were stripped (badge, title,
# subtitle, pill labels) — confirm against the original template; as-is it
# renders as plain markdown lines despite unsafe_allow_html=True.
st.markdown("""
Model Economics
💰 LLM API Cost Optimizer
Model the cost of LLM traffic and quantify savings from semantic caching, batching, and routing.
Token economics
Caching strategy
Deployment planning
""", unsafe_allow_html=True)
# Sidebar: pricing and optimization assumptions consumed by the report below.
with st.sidebar:
    st.markdown("### Model Pricing")
    # index=1 defaults the preset to "Mid-size instruction model".
    preset = st.selectbox("Preset", list(MODEL_PRESETS.keys()), index=1)
    default = MODEL_PRESETS[preset]
    # Prices are dollars per one million tokens; presets only seed the values,
    # the user can still edit them freely.
    input_price = st.number_input("Input price / 1M tokens ($)", min_value=0.0, value=float(default["input"]), step=0.05)
    output_price = st.number_input("Output price / 1M tokens ($)", min_value=0.0, value=float(default["output"]), step=0.05)
    st.markdown("### Optimization Assumptions")
    # Fractions: share of requests served from cache, cost reduction from
    # batching, and share of traffic routable to a cheaper model.
    cache_hit = st.slider("Semantic cache hit rate", 0.0, 0.9, 0.28, 0.01)
    batch_gain = st.slider("Batching efficiency gain", 0.0, 0.5, 0.12, 0.01)
    route_share = st.slider("Traffic routable to smaller model", 0.0, 0.9, 0.35, 0.01)
# Main page: traffic profile inputs on the left, framing copy on the right.
left, right = st.columns([1, 1])
with left:
    # Monthly traffic shape; these feed optimization_report() below.
    calls = st.number_input("Monthly requests", min_value=1_000, value=1_000_000, step=50_000)
    input_tokens = st.number_input("Average input tokens", min_value=1, value=850, step=50)
    output_tokens = st.number_input("Average output tokens", min_value=1, value=260, step=25)
with right:
    st.markdown("### Professional Framing")
    st.markdown("""
This is the kind of utility ML teams actually need before deploying a Space, API, or agent. It connects model choice, token volume, caching, and routing into an engineering decision instead of a vague cost guess.
""")
# Run the cost model once and surface the headline numbers as metrics.
report = optimization_report(
    calls, input_tokens, output_tokens, input_price, output_price, cache_hit, batch_gain, route_share
)
baseline_cost = float(report["monthly_cost"].iloc[0])
combined_cost = float(report["monthly_cost"].iloc[-1])
dollars_saved = baseline_cost - combined_cost
# Guard against a zero baseline (e.g. zero prices) to avoid division by zero.
pct_saved = dollars_saved / baseline_cost * 100 if baseline_cost else 0
col_baseline, col_combined, col_savings = st.columns(3)
col_baseline.metric("Baseline monthly cost", f"${baseline_cost:,.2f}")
col_combined.metric("Combined plan", f"${combined_cost:,.2f}")
col_savings.metric("Estimated savings", f"{pct_saved:.1f}%")
tab1, tab2, tab3 = st.tabs(["Cost Curve", "Strategy Table", "Architecture Notes"])
with tab1:
    # One bar per strategy, colored from a fixed palette for visual stability.
    palette = ["#b8a9d9", "#ffad7a", "#7accff", "#e8935c", "#4b5563"]
    cost_chart = px.bar(
        report,
        x="strategy",
        y="monthly_cost",
        color="strategy",
        title="Monthly cost by optimization strategy",
        color_discrete_sequence=palette,
    )
    st.plotly_chart(cost_chart, use_container_width=True)
with tab2:
    # Render the report with dollar formatting; work on a copy so the raw
    # numeric frame used by the chart tab is left untouched.
    formatted = report.copy()
    for column in ("monthly_cost", "savings"):
        formatted[column] = formatted[column].map("${:,.2f}".format)
    st.dataframe(formatted, use_container_width=True, hide_index=True)
with tab3:
    # Static guidance for hardening the demo into a production HF deployment.
    st.markdown("""
### HF-Native Extension Path
- Deploy a smaller model as a Hugging Face Inference Endpoint for routable traffic.
- Store prompt embeddings in a vector database for semantic caching.
- Add request logs as a private Hugging Face Dataset for offline evaluation.
- Compare quality against a frontier baseline before routing production traffic.
""")