"""
LLM API Cost Optimizer

Estimate model-serving cost and identify caching, batching, and routing savings.
"""
from pathlib import Path

import pandas as pd
import plotly.express as px
import streamlit as st
| st.set_page_config(page_title="LLM API Cost Optimizer", page_icon="💰", layout="wide") | |
| def load_shared_css() -> None: | |
| current_dir = Path(__file__).resolve().parent | |
| candidates = [ | |
| current_dir / "shared" / "styles.css", | |
| current_dir.parent / "shared" / "styles.css", | |
| ] | |
| css_path = next(path for path in candidates if path.exists()) | |
| st.markdown(f"<style>{css_path.read_text(encoding='utf-8')}</style>", unsafe_allow_html=True) | |
| load_shared_css() | |
| MODEL_PRESETS = { | |
| "Small open model endpoint": {"input": 0.15, "output": 0.20}, | |
| "Mid-size instruction model": {"input": 0.60, "output": 0.90}, | |
| "Frontier API baseline": {"input": 5.00, "output": 15.00}, | |
| "Custom": {"input": 1.00, "output": 2.00}, | |
| } | |
| def monthly_cost(calls, input_tokens, output_tokens, input_price, output_price): | |
| input_cost = calls * input_tokens / 1_000_000 * input_price | |
| output_cost = calls * output_tokens / 1_000_000 * output_price | |
| return input_cost + output_cost | |
| def optimization_report(calls, input_tokens, output_tokens, input_price, output_price, cache_hit, batch_gain, route_share): | |
| baseline = monthly_cost(calls, input_tokens, output_tokens, input_price, output_price) | |
| after_cache = monthly_cost(calls * (1 - cache_hit), input_tokens, output_tokens, input_price, output_price) | |
| after_batch = after_cache * (1 - batch_gain) | |
| routed_calls = calls * route_share | |
| routed_savings = monthly_cost(routed_calls, input_tokens, output_tokens, input_price * 0.65, output_price * 0.65) | |
| unrouted_cost = monthly_cost(calls * (1 - route_share), input_tokens, output_tokens, input_price, output_price) | |
| routing_total = routed_savings + unrouted_cost | |
| combined = after_batch * (1 - route_share * 0.35) | |
| rows = [ | |
| {"strategy": "Baseline", "monthly_cost": baseline, "savings": 0.0}, | |
| {"strategy": "Semantic cache", "monthly_cost": after_cache, "savings": baseline - after_cache}, | |
| {"strategy": "Batching/window packing", "monthly_cost": after_batch, "savings": baseline - after_batch}, | |
| {"strategy": "Model routing", "monthly_cost": routing_total, "savings": baseline - routing_total}, | |
| {"strategy": "Combined plan", "monthly_cost": combined, "savings": baseline - combined}, | |
| ] | |
| return pd.DataFrame(rows) | |
| st.markdown(""" | |
| <div class="hero"> | |
| <div class="hf-badge">Model Economics</div> | |
| <h1>💰 LLM API Cost Optimizer</h1> | |
| <p>Model the cost of LLM traffic and quantify savings from semantic caching, batching, and routing.</p> | |
| <div class="pill-row"> | |
| <span class="hf-chip">Token economics</span> | |
| <span class="hf-chip">Caching strategy</span> | |
| <span class="hf-chip">Deployment planning</span> | |
| </div> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| with st.sidebar: | |
| st.markdown("### Model Pricing") | |
| preset = st.selectbox("Preset", list(MODEL_PRESETS.keys()), index=1) | |
| default = MODEL_PRESETS[preset] | |
| input_price = st.number_input("Input price / 1M tokens ($)", min_value=0.0, value=float(default["input"]), step=0.05) | |
| output_price = st.number_input("Output price / 1M tokens ($)", min_value=0.0, value=float(default["output"]), step=0.05) | |
| st.markdown("### Optimization Assumptions") | |
| cache_hit = st.slider("Semantic cache hit rate", 0.0, 0.9, 0.28, 0.01) | |
| batch_gain = st.slider("Batching efficiency gain", 0.0, 0.5, 0.12, 0.01) | |
| route_share = st.slider("Traffic routable to smaller model", 0.0, 0.9, 0.35, 0.01) | |
| left, right = st.columns([1, 1]) | |
| with left: | |
| calls = st.number_input("Monthly requests", min_value=1_000, value=1_000_000, step=50_000) | |
| input_tokens = st.number_input("Average input tokens", min_value=1, value=850, step=50) | |
| output_tokens = st.number_input("Average output tokens", min_value=1, value=260, step=25) | |
| with right: | |
| st.markdown("### Professional Framing") | |
| st.markdown(""" | |
| This is the kind of utility ML teams actually need before deploying a Space, API, or agent. It connects model choice, token volume, caching, and routing into an engineering decision instead of a vague cost guess. | |
| """) | |
| report = optimization_report(calls, input_tokens, output_tokens, input_price, output_price, cache_hit, batch_gain, route_share) | |
| baseline = float(report.iloc[0]["monthly_cost"]) | |
| combined = float(report.iloc[-1]["monthly_cost"]) | |
| savings = baseline - combined | |
| savings_pct = savings / baseline * 100 if baseline else 0 | |
| metric_cols = st.columns(3) | |
| metric_cols[0].metric("Baseline monthly cost", f"${baseline:,.2f}") | |
| metric_cols[1].metric("Combined plan", f"${combined:,.2f}") | |
| metric_cols[2].metric("Estimated savings", f"{savings_pct:.1f}%") | |
| tab1, tab2, tab3 = st.tabs(["Cost Curve", "Strategy Table", "Architecture Notes"]) | |
| with tab1: | |
| fig = px.bar( | |
| report, | |
| x="strategy", | |
| y="monthly_cost", | |
| color="strategy", | |
| title="Monthly cost by optimization strategy", | |
| color_discrete_sequence=["#b8a9d9", "#ffad7a", "#7accff", "#e8935c", "#4b5563"], | |
| ) | |
| st.plotly_chart(fig, use_container_width=True) | |
| with tab2: | |
| display = report.copy() | |
| display["monthly_cost"] = display["monthly_cost"].map(lambda value: f"${value:,.2f}") | |
| display["savings"] = display["savings"].map(lambda value: f"${value:,.2f}") | |
| st.dataframe(display, use_container_width=True, hide_index=True) | |
| with tab3: | |
| st.markdown(""" | |
| ### HF-Native Extension Path | |
| - Deploy a smaller model as a Hugging Face Inference Endpoint for routable traffic. | |
| - Store prompt embeddings in a vector database for semantic caching. | |
| - Add request logs as a private Hugging Face Dataset for offline evaluation. | |
| - Compare quality against a frontier baseline before routing production traffic. | |
| """) | |