Spaces:

sammoftah
/

llm-api-cost-optimizer

Runtime error

File size: 5,880 Bytes

7fd7a32

"""
LLM API Cost Optimizer
Estimate model-serving cost and identify caching, batching, and routing savings.
"""

from pathlib import Path

import pandas as pd
import plotly.express as px
import streamlit as st


# Page-level settings: browser-tab title and icon, full-width layout.
st.set_page_config(
    page_title="LLM API Cost Optimizer",
    page_icon="💰",
    layout="wide",
)


def load_shared_css() -> None:
    """Inject the shared stylesheet into the page, if one can be found.

    Looks for ``shared/styles.css`` next to this file first, then one
    directory up, and embeds the first match in a ``<style>`` tag.

    When neither candidate is a readable file the function returns without
    rendering anything, so the app still starts (unstyled) instead of failing
    with ``StopIteration`` at import time.
    """
    current_dir = Path(__file__).resolve().parent
    candidates = (
        current_dir / "shared" / "styles.css",
        current_dir.parent / "shared" / "styles.css",
    )
    # is_file() rather than exists(): a directory that happens to be named
    # styles.css must not reach read_text().
    css_path = next((path for path in candidates if path.is_file()), None)
    if css_path is None:
        # The stylesheet is cosmetic; missing CSS must not take the app down.
        return
    st.markdown(f"<style>{css_path.read_text(encoding='utf-8')}</style>", unsafe_allow_html=True)


# Inject the shared stylesheet before any page content is rendered.
load_shared_css()


# Preset prices in USD per one million tokens. Insertion order matters: the
# sidebar "Preset" selectbox lists the keys in this order.
MODEL_PRESETS = {
    preset_name: {"input": price_in, "output": price_out}
    for preset_name, price_in, price_out in (
        ("Small open model endpoint", 0.15, 0.20),
        ("Mid-size instruction model", 0.60, 0.90),
        ("Frontier API baseline", 5.00, 15.00),
        ("Custom", 1.00, 2.00),
    )
}


def monthly_cost(
    calls: float,
    input_tokens: float,
    output_tokens: float,
    input_price: float,
    output_price: float,
) -> float:
    """Return the total monthly spend for a given traffic volume.

    Args:
        calls: Number of requests in the month.
        input_tokens: Tokens sent per request.
        output_tokens: Tokens generated per request.
        input_price: Price per one million input tokens.
        output_price: Price per one million output tokens.

    Returns:
        Input spend plus output spend, in the same currency as the prices.
    """
    tokens_per_price_unit = 1_000_000  # prices are quoted per million tokens
    prompt_spend = calls * input_tokens / tokens_per_price_unit * input_price
    completion_spend = calls * output_tokens / tokens_per_price_unit * output_price
    return prompt_spend + completion_spend


def optimization_report(
    calls,
    input_tokens,
    output_tokens,
    input_price,
    output_price,
    cache_hit,
    batch_gain,
    route_share,
    *,
    routed_price_ratio=0.65,
):
    """Build a per-strategy table of monthly cost and savings versus baseline.

    Rows, in order:

    * ``Baseline`` - every call priced at the given input/output prices.
    * ``Semantic cache`` - a ``cache_hit`` fraction of calls costs nothing.
    * ``Batching/window packing`` - the cached cost reduced by ``batch_gain``.
      Note this row is cumulative with caching, not batching on its own.
    * ``Model routing`` - standalone: a ``route_share`` fraction of calls is
      priced at ``routed_price_ratio`` times the given prices.
    * ``Combined plan`` - caching, batching and routing applied together.

    Args:
        calls: Number of requests in the month.
        input_tokens: Input tokens per request.
        output_tokens: Output tokens per request.
        input_price: Price per one million input tokens.
        output_price: Price per one million output tokens.
        cache_hit: Fraction of calls answered from cache (0-1).
        batch_gain: Fractional cost reduction from batching (0-1).
        route_share: Fraction of calls sent to the cheaper model (0-1).
        routed_price_ratio: Price of the cheaper model relative to the primary
            one. The default of 0.65 matches the value that used to be
            hard-coded here.

    Returns:
        A ``pandas.DataFrame`` with columns ``strategy``, ``monthly_cost`` and
        ``savings`` (baseline cost minus that row's cost).
    """
    baseline = monthly_cost(calls, input_tokens, output_tokens, input_price, output_price)

    after_cache = monthly_cost(calls * (1 - cache_hit), input_tokens, output_tokens, input_price, output_price)
    after_batch = after_cache * (1 - batch_gain)

    # Routing on its own: split traffic between the cheaper and primary model.
    routed_calls = calls * route_share
    routed_cost = monthly_cost(
        routed_calls,
        input_tokens,
        output_tokens,
        input_price * routed_price_ratio,
        output_price * routed_price_ratio,
    )
    unrouted_cost = monthly_cost(calls * (1 - route_share), input_tokens, output_tokens, input_price, output_price)
    routing_total = routed_cost + unrouted_cost

    # Derive the discount from the same ratio used above so the standalone and
    # combined routing figures cannot drift apart.
    routing_discount = 1 - routed_price_ratio
    combined = after_batch * (1 - route_share * routing_discount)

    optimized = (
        ("Semantic cache", after_cache),
        ("Batching/window packing", after_batch),
        ("Model routing", routing_total),
        ("Combined plan", combined),
    )
    rows = [{"strategy": "Baseline", "monthly_cost": baseline, "savings": 0.0}]
    rows.extend(
        {"strategy": label, "monthly_cost": cost, "savings": baseline - cost}
        for label, cost in optimized
    )
    return pd.DataFrame(rows)


# Landing banner: badge, title, tagline and three topic chips. It is passed
# with unsafe_allow_html=True because it is raw HTML using custom CSS classes.
HERO_HTML = """
<div class="hero">
  <div class="hf-badge">Model Economics</div>
  <h1>💰 LLM API Cost Optimizer</h1>
  <p>Model the cost of LLM traffic and quantify savings from semantic caching, batching, and routing.</p>
  <div class="pill-row">
    <span class="hf-chip">Token economics</span>
    <span class="hf-chip">Caching strategy</span>
    <span class="hf-chip">Deployment planning</span>
  </div>
</div>
"""
st.markdown(HERO_HTML, unsafe_allow_html=True)

with st.sidebar:
    # Pricing: the selected preset seeds the two price fields below.
    st.markdown("### Model Pricing")
    preset = st.selectbox("Preset", list(MODEL_PRESETS), index=1)
    preset_prices = MODEL_PRESETS[preset]
    input_price = st.number_input(
        "Input price / 1M tokens ($)",
        min_value=0.0,
        value=float(preset_prices["input"]),
        step=0.05,
    )
    output_price = st.number_input(
        "Output price / 1M tokens ($)",
        min_value=0.0,
        value=float(preset_prices["output"]),
        step=0.05,
    )

    # Optimization levers, each expressed as a fraction.
    st.markdown("### Optimization Assumptions")
    cache_hit = st.slider(
        "Semantic cache hit rate",
        min_value=0.0,
        max_value=0.9,
        value=0.28,
        step=0.01,
    )
    batch_gain = st.slider(
        "Batching efficiency gain",
        min_value=0.0,
        max_value=0.5,
        value=0.12,
        step=0.01,
    )
    route_share = st.slider(
        "Traffic routable to smaller model",
        min_value=0.0,
        max_value=0.9,
        value=0.35,
        step=0.01,
    )

# Two equal-width columns: traffic inputs on the left, context on the right.
left, right = st.columns([1, 1])

with left:
    calls = st.number_input(
        "Monthly requests",
        min_value=1_000,
        value=1_000_000,
        step=50_000,
    )
    input_tokens = st.number_input(
        "Average input tokens",
        min_value=1,
        value=850,
        step=50,
    )
    output_tokens = st.number_input(
        "Average output tokens",
        min_value=1,
        value=260,
        step=25,
    )

with right:
    st.markdown("### Professional Framing")
    framing_text = """
This is the kind of utility ML teams actually need before deploying a Space, API, or agent. It connects model choice, token volume, caching, and routing into an engineering decision instead of a vague cost guess.
"""
    st.markdown(framing_text)

report = optimization_report(
    calls,
    input_tokens,
    output_tokens,
    input_price,
    output_price,
    cache_hit,
    batch_gain,
    route_share,
)

# Headline numbers: the first row is the baseline, the last the combined plan.
monthly_costs = report["monthly_cost"]
baseline = float(monthly_costs.iloc[0])
combined = float(monthly_costs.iloc[-1])
savings = baseline - combined
if baseline:
    savings_pct = savings / baseline * 100
else:
    # A zero baseline would divide by zero; report no savings instead.
    savings_pct = 0

# Three headline tiles: cost before, cost after, and percentage saved.
baseline_tile, combined_tile, savings_tile = st.columns(3)
baseline_tile.metric("Baseline monthly cost", f"${baseline:,.2f}")
combined_tile.metric("Combined plan", f"${combined:,.2f}")
savings_tile.metric("Estimated savings", f"{savings_pct:.1f}%")

def _format_usd(value):
    """Render a number as dollars with thousands separators and two decimals."""
    return f"${value:,.2f}"


chart_tab, table_tab, notes_tab = st.tabs(["Cost Curve", "Strategy Table", "Architecture Notes"])

with chart_tab:
    # One bar per strategy, each with its own colour from the fixed palette.
    strategy_palette = ["#b8a9d9", "#ffad7a", "#7accff", "#e8935c", "#4b5563"]
    fig = px.bar(
        report,
        x="strategy",
        y="monthly_cost",
        color="strategy",
        title="Monthly cost by optimization strategy",
        color_discrete_sequence=strategy_palette,
    )
    st.plotly_chart(fig, use_container_width=True)

with table_tab:
    # Format the money columns on a new frame so `report` keeps its numeric values.
    formatted_report = report.assign(
        monthly_cost=report["monthly_cost"].map(_format_usd),
        savings=report["savings"].map(_format_usd),
    )
    st.dataframe(formatted_report, use_container_width=True, hide_index=True)

with notes_tab:
    extension_notes = """
### HF-Native Extension Path

- Deploy a smaller model as a Hugging Face Inference Endpoint for routable traffic.
- Store prompt embeddings in a vector database for semantic caching.
- Add request logs as a private Hugging Face Dataset for offline evaluation.
- Compare quality against a frontier baseline before routing production traffic.
"""
    st.markdown(extension_notes)