File size: 5,880 Bytes
7fd7a32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
"""
LLM API Cost Optimizer
Estimate model-serving cost and identify caching, batching, and routing savings.
"""

from pathlib import Path

import pandas as pd
import plotly.express as px
import streamlit as st


# Page-level settings for the app: title, icon, and the wide layout.
st.set_page_config(page_title="LLM API Cost Optimizer", page_icon="💰", layout="wide")


def load_shared_css() -> None:
    """Inject the shared stylesheet into the page when one is available.

    Looks for ``shared/styles.css`` next to this file first, then in the
    parent directory. The first candidate that exists is embedded in a
    ``<style>`` tag through ``st.markdown``.

    If neither candidate exists the function returns without rendering
    anything, so the app still starts with Streamlit's default styling
    instead of aborting with ``StopIteration``.
    """
    current_dir = Path(__file__).resolve().parent
    candidates = (
        current_dir / "shared" / "styles.css",
        current_dir.parent / "shared" / "styles.css",
    )
    # is_file() rather than exists(): a directory with that name cannot be
    # read as text. The None default keeps next() from raising when no
    # stylesheet is found.
    css_path = next((path for path in candidates if path.is_file()), None)
    if css_path is None:
        return
    st.markdown(f"<style>{css_path.read_text(encoding='utf-8')}</style>", unsafe_allow_html=True)


# Apply the shared stylesheet before any of the page content below is rendered.
load_shared_css()


# Pricing presets offered in the sidebar. Each label maps to an "input" and an
# "output" price; the sidebar labels these as dollars per 1M tokens.
# Insertion order matters: the preset selectbox chooses its default by index.
MODEL_PRESETS = {
    label: {"input": input_rate, "output": output_rate}
    for label, input_rate, output_rate in (
        ("Small open model endpoint", 0.15, 0.20),
        ("Mid-size instruction model", 0.60, 0.90),
        ("Frontier API baseline", 5.00, 15.00),
        ("Custom", 1.00, 2.00),
    )
}


def monthly_cost(calls, input_tokens, output_tokens, input_price, output_price):
    """Return the total monthly spend for a given traffic profile.

    Each side (input and output) is costed as
    ``calls * tokens_per_call / 1_000_000 * price``, i.e. prices are quoted
    per one million tokens. The result is the sum of both sides.
    """
    tokens_per_price_unit = 1_000_000
    prompt_spend, completion_spend = (
        calls * tokens / tokens_per_price_unit * price
        for tokens, price in (
            (input_tokens, input_price),
            (output_tokens, output_price),
        )
    )
    return prompt_spend + completion_spend


def optimization_report(
    calls,
    input_tokens,
    output_tokens,
    input_price,
    output_price,
    cache_hit,
    batch_gain,
    route_share,
    *,
    routed_price_ratio=0.65,
):
    """Build a per-strategy table of monthly cost and savings versus baseline.

    Args:
        calls: Requests per month.
        input_tokens: Average input tokens per request.
        output_tokens: Average output tokens per request.
        input_price: Price per 1M input tokens.
        output_price: Price per 1M output tokens.
        cache_hit: Fraction of requests removed from billing by the cache.
        batch_gain: Fractional cost reduction applied for batching.
        route_share: Fraction of requests sent to the cheaper model.
        routed_price_ratio: Price of the cheaper model relative to the
            primary model, applied to both input and output prices.
            Keyword-only; the default of 0.65 reproduces the previously
            hard-coded discount.

    Returns:
        A ``pandas.DataFrame`` with columns ``strategy``, ``monthly_cost``
        and ``savings`` and five rows in fixed order: Baseline, Semantic
        cache, Batching/window packing, Model routing, Combined plan.
        Callers read the first row as the baseline and the last row as the
        combined plan.

    Notes:
        The rows are not all measured the same way. "Semantic cache" and
        "Model routing" are each applied to the baseline on their own,
        while "Batching/window packing" is applied on top of the cached
        cost, and "Combined plan" applies routing on top of that.
    """
    baseline = monthly_cost(calls, input_tokens, output_tokens, input_price, output_price)

    # Cache hits are modelled as requests that are never billed.
    after_cache = monthly_cost(calls * (1 - cache_hit), input_tokens, output_tokens, input_price, output_price)
    after_batch = after_cache * (1 - batch_gain)

    # Routing on its own: split traffic between the discounted and the primary model.
    routed_cost = monthly_cost(
        calls * route_share,
        input_tokens,
        output_tokens,
        input_price * routed_price_ratio,
        output_price * routed_price_ratio,
    )
    unrouted_cost = monthly_cost(calls * (1 - route_share), input_tokens, output_tokens, input_price, output_price)
    routing_total = routed_cost + unrouted_cost

    # Routing scales any cost by (1 - route_share) + route_share * ratio,
    # which simplifies to 1 - route_share * (1 - ratio). Deriving the factor
    # from the same ratio keeps this row consistent with the routing row.
    combined = after_batch * (1 - route_share * (1 - routed_price_ratio))

    rows = [
        {"strategy": "Baseline", "monthly_cost": baseline, "savings": 0.0},
        {"strategy": "Semantic cache", "monthly_cost": after_cache, "savings": baseline - after_cache},
        {"strategy": "Batching/window packing", "monthly_cost": after_batch, "savings": baseline - after_batch},
        {"strategy": "Model routing", "monthly_cost": routing_total, "savings": baseline - routing_total},
        {"strategy": "Combined plan", "monthly_cost": combined, "savings": baseline - combined},
    ]
    return pd.DataFrame(rows)


# Hero banner rendered as raw HTML; the CSS classes used here (hero, hf-badge,
# pill-row, hf-chip) are not defined in this file — presumably they come from
# the shared stylesheet.
st.markdown("""
<div class="hero">
  <div class="hf-badge">Model Economics</div>
  <h1>💰 LLM API Cost Optimizer</h1>
  <p>Model the cost of LLM traffic and quantify savings from semantic caching, batching, and routing.</p>
  <div class="pill-row">
    <span class="hf-chip">Token economics</span>
    <span class="hf-chip">Caching strategy</span>
    <span class="hf-chip">Deployment planning</span>
  </div>
</div>
""", unsafe_allow_html=True)

# Sidebar: pricing inputs and the three optimization assumptions.
with st.sidebar:
    st.markdown("### Model Pricing")
    preset = st.selectbox("Preset", list(MODEL_PRESETS), index=1)
    # The chosen preset only seeds the two price fields; both stay editable.
    default = MODEL_PRESETS[preset]
    input_price = st.number_input(
        "Input price / 1M tokens ($)",
        min_value=0.0,
        value=float(default["input"]),
        step=0.05,
    )
    output_price = st.number_input(
        "Output price / 1M tokens ($)",
        min_value=0.0,
        value=float(default["output"]),
        step=0.05,
    )

    st.markdown("### Optimization Assumptions")
    # All three sliders are fractions, not percentages.
    cache_hit = st.slider(
        "Semantic cache hit rate",
        min_value=0.0,
        max_value=0.9,
        value=0.28,
        step=0.01,
    )
    batch_gain = st.slider(
        "Batching efficiency gain",
        min_value=0.0,
        max_value=0.5,
        value=0.12,
        step=0.01,
    )
    route_share = st.slider(
        "Traffic routable to smaller model",
        min_value=0.0,
        max_value=0.9,
        value=0.35,
        step=0.01,
    )

# Two equal-width columns: traffic inputs on the left, explanatory text on the right.
left, right = st.columns([1, 1])
with left:
    # Integer inputs describing the monthly traffic profile fed into the report.
    calls = st.number_input("Monthly requests", min_value=1_000, value=1_000_000, step=50_000)
    input_tokens = st.number_input("Average input tokens", min_value=1, value=850, step=50)
    output_tokens = st.number_input("Average output tokens", min_value=1, value=260, step=25)
with right:
    st.markdown("### Professional Framing")
    st.markdown("""
This is the kind of utility ML teams actually need before deploying a Space, API, or agent. It connects model choice, token volume, caching, and routing into an engineering decision instead of a vague cost guess.
""")

# Compute the strategy table once; every view below reads from it.
report = optimization_report(
    calls,
    input_tokens,
    output_tokens,
    input_price,
    output_price,
    cache_hit,
    batch_gain,
    route_share,
)

# The first row is the baseline and the last row is the combined plan.
cost_column = report["monthly_cost"]
baseline = float(cost_column.iloc[0])
combined = float(cost_column.iloc[-1])
savings = baseline - combined
# Guard against a zero baseline (e.g. both prices set to 0) before dividing.
if baseline:
    savings_pct = savings / baseline * 100
else:
    savings_pct = 0

# Headline figures, one per column, left to right.
metric_cols = st.columns(3)
headline_metrics = (
    ("Baseline monthly cost", f"${baseline:,.2f}"),
    ("Combined plan", f"${combined:,.2f}"),
    ("Estimated savings", f"{savings_pct:.1f}%"),
)
for metric_col, (metric_label, metric_value) in zip(metric_cols, headline_metrics):
    metric_col.metric(metric_label, metric_value)

# Three views over the same report: chart, table, and static notes.
tab1, tab2, tab3 = st.tabs(["Cost Curve", "Strategy Table", "Architecture Notes"])

with tab1:
    # One bar per strategy row; the colour list has one entry per row of the report.
    fig = px.bar(
        report,
        x="strategy",
        y="monthly_cost",
        color="strategy",
        title="Monthly cost by optimization strategy",
        color_discrete_sequence=["#b8a9d9", "#ffad7a", "#7accff", "#e8935c", "#4b5563"],
    )
    st.plotly_chart(fig, use_container_width=True)

with tab2:
    # Format a copy so the numeric report used elsewhere stays untouched.
    display = report.copy()
    for money_column in ("monthly_cost", "savings"):
        display[money_column] = display[money_column].map("${:,.2f}".format)
    st.dataframe(display, use_container_width=True, hide_index=True)

with tab3:
    # Static guidance text; nothing here depends on the inputs above.
    st.markdown("""
### HF-Native Extension Path

- Deploy a smaller model as a Hugging Face Inference Endpoint for routable traffic.
- Store prompt embeddings in a vector database for semantic caching.
- Add request logs as a private Hugging Face Dataset for offline evaluation.
- Compare quality against a frontier baseline before routing production traffic.
""")