"""
LLM API Cost Optimizer

Estimate model-serving cost and identify caching, batching, and routing savings.
"""
from pathlib import Path

import pandas as pd
import plotly.express as px
import streamlit as st
| st.set_page_config(page_title="LLM API Cost Optimizer", page_icon="💰", layout="wide") | |
| def load_shared_css() -> None: | |
| current_dir = Path(__file__).resolve().parent | |
| candidates = [ | |
| current_dir / "shared" / "styles.css", | |
| current_dir.parent / "shared" / "styles.css", | |
| ] | |
| css_path = next(path for path in candidates if path.exists()) | |
| st.markdown(f"<style>{css_path.read_text(encoding='utf-8')}</style>", unsafe_allow_html=True) | |
| load_shared_css() | |
| MODEL_PRESETS = { | |
| "Small open model endpoint": {"input": 0.15, "output": 0.20}, | |
| "Mid-size instruction model": {"input": 0.60, "output": 0.90}, | |
| "Frontier API baseline": {"input": 5.00, "output": 15.00}, | |
| "Custom": {"input": 1.00, "output": 2.00}, | |
| } | |
| def monthly_cost(calls, input_tokens, output_tokens, input_price, output_price): | |
| input_cost = calls * input_tokens / 1_000_000 * input_price | |
| output_cost = calls * output_tokens / 1_000_000 * output_price | |
| return input_cost + output_cost | |
| def optimization_report(calls, input_tokens, output_tokens, input_price, output_price, cache_hit, batch_gain, route_share): | |
| baseline = monthly_cost(calls, input_tokens, output_tokens, input_price, output_price) | |
| after_cache = monthly_cost(calls * (1 - cache_hit), input_tokens, output_tokens, input_price, output_price) | |
| after_batch = after_cache * (1 - batch_gain) | |
| routed_calls = calls * route_share | |
| routed_savings = monthly_cost(routed_calls, input_tokens, output_tokens, input_price * 0.65, output_price * 0.65) | |
| unrouted_cost = monthly_cost(calls * (1 - route_share), input_tokens, output_tokens, input_price, output_price) | |
| routing_total = routed_savings + unrouted_cost | |
| combined = after_batch * (1 - route_share * 0.35) | |
| rows = [ | |
| {"strategy": "Baseline", "monthly_cost": baseline, "savings": 0.0}, | |
| {"strategy": "Semantic cache", "monthly_cost": after_cache, "savings": baseline - after_cache}, | |
| {"strategy": "Batching/window packing", "monthly_cost": after_batch, "savings": baseline - after_batch}, | |
| {"strategy": "Model routing", "monthly_cost": routing_total, "savings": baseline - routing_total}, | |
| {"strategy": "Combined plan", "monthly_cost": combined, "savings": baseline - combined}, | |
| ] | |
| return pd.DataFrame(rows) | |
| st.markdown(""" | |
| <div class="hero"> | |
| <div class="hf-badge">Model Economics</div> | |
| <h1>💰 LLM API Cost Optimizer</h1> | |
| <p>Model the cost of LLM traffic and quantify savings from semantic caching, batching, and routing.</p> | |
| <div class="pill-row"> | |
| <span class="hf-chip">Token economics</span> | |
| <span class="hf-chip">Caching strategy</span> | |
| <span class="hf-chip">Deployment planning</span> | |
| </div> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| with st.sidebar: | |
| st.markdown("### Model Pricing") | |
| preset = st.selectbox("Preset", list(MODEL_PRESETS.keys()), index=1) | |
| default = MODEL_PRESETS[preset] | |
| input_price = st.number_input("Input price / 1M tokens ($)", min_value=0.0, value=float(default["input"]), step=0.05) | |
| output_price = st.number_input("Output price / 1M tokens ($)", min_value=0.0, value=float(default["output"]), step=0.05) | |
| st.markdown("### Optimization Assumptions") | |
| cache_hit = st.slider("Semantic cache hit rate", 0.0, 0.9, 0.28, 0.01) | |
| batch_gain = st.slider("Batching efficiency gain", 0.0, 0.5, 0.12, 0.01) | |
| route_share = st.slider("Traffic routable to smaller model", 0.0, 0.9, 0.35, 0.01) | |
| left, right = st.columns([1, 1]) | |
| with left: | |
| calls = st.number_input("Monthly requests", min_value=1_000, value=1_000_000, step=50_000) | |
| input_tokens = st.number_input("Average input tokens", min_value=1, value=850, step=50) | |
| output_tokens = st.number_input("Average output tokens", min_value=1, value=260, step=25) | |
| with right: | |
| st.markdown("### Professional Framing") | |
| st.markdown(""" | |
| This is the kind of utility ML teams actually need before deploying a Space, API, or agent. It connects model choice, token volume, caching, and routing into an engineering decision instead of a vague cost guess. | |
| """) | |
| report = optimization_report(calls, input_tokens, output_tokens, input_price, output_price, cache_hit, batch_gain, route_share) | |
| baseline = float(report.iloc[0]["monthly_cost"]) | |
| combined = float(report.iloc[-1]["monthly_cost"]) | |
| savings = baseline - combined | |
| savings_pct = savings / baseline * 100 if baseline else 0 | |
| metric_cols = st.columns(3) | |
| metric_cols[0].metric("Baseline monthly cost", f"${baseline:,.2f}") | |
| metric_cols[1].metric("Combined plan", f"${combined:,.2f}") | |
| metric_cols[2].metric("Estimated savings", f"{savings_pct:.1f}%") | |
| tab1, tab2, tab3 = st.tabs(["Cost Curve", "Strategy Table", "Architecture Notes"]) | |
| with tab1: | |
| fig = px.bar( | |
| report, | |
| x="strategy", | |
| y="monthly_cost", | |
| color="strategy", | |
| title="Monthly cost by optimization strategy", | |
| color_discrete_sequence=["#b8a9d9", "#ffad7a", "#7accff", "#e8935c", "#4b5563"], | |
| ) | |
| st.plotly_chart(fig, use_container_width=True) | |
| with tab2: | |
| display = report.copy() | |
| display["monthly_cost"] = display["monthly_cost"].map(lambda value: f"${value:,.2f}") | |
| display["savings"] = display["savings"].map(lambda value: f"${value:,.2f}") | |
| st.dataframe(display, use_container_width=True, hide_index=True) | |
| with tab3: | |
| st.markdown(""" | |
| ### HF-Native Extension Path | |
| - Deploy a smaller model as a Hugging Face Inference Endpoint for routable traffic. | |
| - Store prompt embeddings in a vector database for semantic caching. | |
| - Add request logs as a private Hugging Face Dataset for offline evaluation. | |
| - Compare quality against a frontier baseline before routing production traffic. | |
| """) | |