"""
LLM API Cost Optimizer

Estimate model-serving cost and identify caching, batching, and routing savings.
"""

from pathlib import Path

import pandas as pd
import plotly.express as px
import streamlit as st

st.set_page_config(page_title="LLM API Cost Optimizer", page_icon="💰", layout="wide")


def load_shared_css() -> None:
    """Inject the shared stylesheet into the page when one is available.

    Looks for ``shared/styles.css`` next to this file and then one directory
    up. The first match is read and emitted inside a ``<style>`` element. When
    neither location holds the file the function returns without rendering
    anything, so the app still starts (unstyled) instead of crashing.
    """
    current_dir = Path(__file__).resolve().parent
    candidates = [
        current_dir / "shared" / "styles.css",
        current_dir.parent / "shared" / "styles.css",
    ]
    # A default is required here: a bare next() on an exhausted generator
    # raises StopIteration, which would abort the whole script at import time.
    css_path = next((path for path in candidates if path.is_file()), None)
    if css_path is None:
        return
    css = css_path.read_text(encoding="utf-8")
    st.markdown(f"<style>{css}</style>", unsafe_allow_html=True)


load_shared_css()

# Default sidebar prices in USD per one million tokens, keyed by preset label.
MODEL_PRESETS = {
    "Small open model endpoint": {"input": 0.15, "output": 0.20},
    "Mid-size instruction model": {"input": 0.60, "output": 0.90},
    "Frontier API baseline": {"input": 5.00, "output": 15.00},
    "Custom": {"input": 1.00, "output": 2.00},
}

# Price multiplier applied to traffic that is routed to the smaller model.
ROUTED_PRICE_FACTOR = 0.65


def monthly_cost(
    calls: float,
    input_tokens: float,
    output_tokens: float,
    input_price: float,
    output_price: float,
) -> float:
    """Return the monthly spend for a traffic profile.

    Args:
        calls: Number of requests per month.
        input_tokens: Average input tokens per request.
        output_tokens: Average output tokens per request.
        input_price: Price per one million input tokens.
        output_price: Price per one million output tokens.

    Returns:
        Input cost plus output cost, in the same currency as the prices.
    """
    input_cost = calls * input_tokens / 1_000_000 * input_price
    output_cost = calls * output_tokens / 1_000_000 * output_price
    return input_cost + output_cost


def optimization_report(
    calls: float,
    input_tokens: float,
    output_tokens: float,
    input_price: float,
    output_price: float,
    cache_hit: float,
    batch_gain: float,
    route_share: float,
    *,
    routed_price_factor: float = ROUTED_PRICE_FACTOR,
) -> pd.DataFrame:
    """Build a per-strategy cost table for the given traffic profile.

    Args:
        calls: Number of requests per month.
        input_tokens: Average input tokens per request.
        output_tokens: Average output tokens per request.
        input_price: Price per one million input tokens.
        output_price: Price per one million output tokens.
        cache_hit: Fraction of requests served from cache (not billed).
        batch_gain: Fractional cost reduction attributed to batching.
        route_share: Fraction of requests routed to the cheaper model.
        routed_price_factor: Multiplier applied to both prices for routed
            requests. Defaults to ``ROUTED_PRICE_FACTOR``.

    Returns:
        A DataFrame with columns ``strategy``, ``monthly_cost`` and
        ``savings`` (relative to the baseline row), one row each for:
        the baseline, caching alone, caching followed by batching,
        routing alone, and all three combined.
    """
    baseline = monthly_cost(calls, input_tokens, output_tokens, input_price, output_price)

    # Cached requests are never sent, so they are removed from the call count.
    after_cache = monthly_cost(
        calls * (1 - cache_hit), input_tokens, output_tokens, input_price, output_price
    )
    # The batching figure is applied on top of the post-cache cost.
    after_batch = after_cache * (1 - batch_gain)

    # Routing on its own: routed traffic is billed at the discounted prices,
    # the remainder at full price.
    routed_cost = monthly_cost(
        calls * route_share,
        input_tokens,
        output_tokens,
        input_price * routed_price_factor,
        output_price * routed_price_factor,
    )
    unrouted_cost = monthly_cost(
        calls * (1 - route_share), input_tokens, output_tokens, input_price, output_price
    )
    routing_total = routed_cost + unrouted_cost

    # Combined plan: the same routing discount applied to the cached + batched cost.
    routing_discount = 1 - routed_price_factor
    combined = after_batch * (1 - route_share * routing_discount)

    rows = [
        {"strategy": "Baseline", "monthly_cost": baseline, "savings": 0.0},
        {"strategy": "Semantic cache", "monthly_cost": after_cache, "savings": baseline - after_cache},
        {"strategy": "Batching/window packing", "monthly_cost": after_batch, "savings": baseline - after_batch},
        {"strategy": "Model routing", "monthly_cost": routing_total, "savings": baseline - routing_total},
        {"strategy": "Combined plan", "monthly_cost": combined, "savings": baseline - combined},
    ]
    return pd.DataFrame(rows)


# NOTE(review): unsafe_allow_html is set but this body contains no HTML —
# presumably wrapper markup styled by shared/styles.css was lost; confirm
# against the original source before relying on the hero styling.
st.markdown("""

Model Economics

💰 LLM API Cost Optimizer

Model the cost of LLM traffic and quantify savings from semantic caching, batching, and routing.

Token economics Caching strategy Deployment planning

""", unsafe_allow_html=True)

with st.sidebar:
    st.markdown("### Model Pricing")
    preset = st.selectbox("Preset", list(MODEL_PRESETS.keys()), index=1)
    default = MODEL_PRESETS[preset]
    input_price = st.number_input(
        "Input price / 1M tokens ($)",
        min_value=0.0,
        value=float(default["input"]),
        step=0.05,
    )
    output_price = st.number_input(
        "Output price / 1M tokens ($)",
        min_value=0.0,
        value=float(default["output"]),
        step=0.05,
    )

    st.markdown("### Optimization Assumptions")
    cache_hit = st.slider("Semantic cache hit rate", 0.0, 0.9, 0.28, 0.01)
    batch_gain = st.slider("Batching efficiency gain", 0.0, 0.5, 0.12, 0.01)
    route_share = st.slider("Traffic routable to smaller model", 0.0, 0.9, 0.35, 0.01)

left, right = st.columns([1, 1])

with left:
    calls = st.number_input("Monthly requests", min_value=1_000, value=1_000_000, step=50_000)
    input_tokens = st.number_input("Average input tokens", min_value=1, value=850, step=50)
    output_tokens = st.number_input("Average output tokens", min_value=1, value=260, step=25)

with right:
    st.markdown("### Professional Framing")
    st.markdown(
        "This is the kind of utility ML teams actually need before deploying a Space, API, or agent. "
        "It connects model choice, token volume, caching, and routing into an engineering decision "
        "instead of a vague cost guess."
    )

report = optimization_report(
    calls,
    input_tokens,
    output_tokens,
    input_price,
    output_price,
    cache_hit,
    batch_gain,
    route_share,
)

# The report's first row is the baseline and its last row is the combined plan.
baseline = float(report.iloc[0]["monthly_cost"])
combined = float(report.iloc[-1]["monthly_cost"])
savings = baseline - combined
# Both prices may be set to 0, which makes the baseline 0; avoid dividing by it.
savings_pct = savings / baseline * 100 if baseline else 0

metric_cols = st.columns(3)
metric_cols[0].metric("Baseline monthly cost", f"${baseline:,.2f}")
metric_cols[1].metric("Combined plan", f"${combined:,.2f}")
metric_cols[2].metric("Estimated savings", f"{savings_pct:.1f}%")

tab1, tab2, tab3 = st.tabs(["Cost Curve", "Strategy Table", "Architecture Notes"])

with tab1:
    fig = px.bar(
        report,
        x="strategy",
        y="monthly_cost",
        color="strategy",
        title="Monthly cost by optimization strategy",
        color_discrete_sequence=["#b8a9d9", "#ffad7a", "#7accff", "#e8935c", "#4b5563"],
    )
    st.plotly_chart(fig, use_container_width=True)

with tab2:
    # Format a copy so the numeric report stays usable for the chart above.
    display = report.copy()
    for money_column in ("monthly_cost", "savings"):
        display[money_column] = display[money_column].map("${:,.2f}".format)
    st.dataframe(display, use_container_width=True, hide_index=True)

with tab3:
    st.markdown(
        "### HF-Native Extension Path\n"
        "\n"
        "- Deploy a smaller model as a Hugging Face Inference Endpoint for routable traffic.\n"
        "- Store prompt embeddings in a vector database for semantic caching.\n"
        "- Add request logs as a private Hugging Face Dataset for offline evaluation.\n"
        "- Compare quality against a frontier baseline before routing production traffic.\n"
    )