# agent-nexus / main.py
"""
NEXUS — Model Router & Inference Gateway
OpenAI-compatible API with intelligent routing across 3 providers.
Providers (priority order):
1. ki_fusion — ki-fusion-labs.de/v1 (primary, LM Studio via PHP)
2. hf_api — HuggingFace Serverless Inference (HF_TOKEN env)
3. local_cpu — transformers, Qwen2.5-0.5B on CPU (always-on fallback)
Routing decisions based on:
task_type : simple / reasoning / planning / code / vision / embedding
complexity : 1-10 score from message analysis
cost_mode : cheap / balanced / best
provider : explicit override
MCP tools: nexus_chat, nexus_route_info, nexus_stats, nexus_models, nexus_health
"""
import os, uuid, json, asyncio, time, re, logging
from pathlib import Path
import httpx
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse, HTMLResponse, StreamingResponse
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("nexus")
BASE = Path(__file__).parent
STATS_FILE = BASE / "stats.json"
# ── Env ───────────────────────────────────────────────────────────
KF_BASE = os.environ.get("KI_FUSION_URL", "https://ki-fusion-labs.de/v1")
KF_KEY = os.environ.get("KI_FUSION_KEY", "")
HF_TOKEN = os.environ.get("HF_TOKEN", "")
HF_BASE = "https://api-inference.huggingface.co/models"
HF_OAI = "https://api-inference.huggingface.co/v1"
# ── Model catalogue ───────────────────────────────────────────────
# ki_fusion uses whatever LM Studio has loaded — model name is configurable
KF_DEFAULT_MODEL = os.environ.get("KF_MODEL", "lm-studio")
HF_MODELS = {
"simple": "Qwen/Qwen2.5-7B-Instruct",
"reasoning": "meta-llama/Llama-3.1-8B-Instruct",
"planning": "mistralai/Mistral-7B-Instruct-v0.3",
"code": "Qwen/Qwen2.5-Coder-7B-Instruct",
"vision": "Qwen/Qwen2.5-VL-7B-Instruct",
"fast": "Qwen/Qwen2.5-0.5B-Instruct",
}
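# Note: serverless availability of these ids shifts over time on the HF free
# tier; treat them as defaults to adjust, not guarantees.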
LOCAL_MODEL_ID = os.environ.get("LOCAL_MODEL", "Qwen/Qwen2.5-0.5B-Instruct")
# ── Stats ─────────────────────────────────────────────────────────
def load_stats():
if STATS_FILE.exists():
        try:
            return json.loads(STATS_FILE.read_text())
        except (OSError, json.JSONDecodeError):
            pass  # unreadable or corrupt stats file: fall through to defaults
return {
"total_requests": 0,
"by_provider": {"ki_fusion":{"ok":0,"fail":0,"total_ms":0,"tokens":0},
"hf_api": {"ok":0,"fail":0,"total_ms":0,"tokens":0},
"local_cpu":{"ok":0,"fail":0,"total_ms":0,"tokens":0}},
"by_task": {"simple":0,"reasoning":0,"planning":0,"code":0,"vision":0,"embedding":0,"unknown":0},
"by_cost": {"cheap":0,"balanced":0,"best":0},
"recent": [], # last 20 routing decisions
}
def save_stats(s):
STATS_FILE.write_text(json.dumps(s, indent=2))
STATS = load_stats()
def record(provider, task, cost_mode, ok, ms, tokens, model, reason):
STATS["total_requests"] += 1
p = STATS["by_provider"][provider]
if ok: p["ok"] += 1
else: p["fail"] += 1
p["total_ms"] += ms
p["tokens"] += tokens
STATS["by_task"][task] = STATS["by_task"].get(task, 0) + 1
STATS["by_cost"][cost_mode] = STATS["by_cost"].get(cost_mode, 0) + 1
entry = {
"id": uuid.uuid4().hex[:8],
"ts": int(time.time()),
"provider": provider, "model": model,
"task": task, "cost_mode": cost_mode,
"ok": ok, "ms": ms, "tokens": tokens,
"reason": reason
}
STATS["recent"] = ([entry] + STATS["recent"])[:20]
save_stats(STATS)
# ── Task classifier ───────────────────────────────────────────────
TASK_PATTERNS = {
"vision": [r"\bimage\b",r"\bscreenshot\b",r"\bphoto\b",r"\bpicture\b",r"\bdescribe.{0,20}image\b",r"\bvision\b"],
"code": [r"\bcode\b",r"\bfunction\b",r"\bclass\b",r"\bdebug\b",r"\bimplements?\b",r"\bpython\b",
r"\bjavascript\b",r"\brefactor\b",r"\bscript\b",r"\bbug\b",r"\bsyntax\b"],
"reasoning": [r"\bwhy\b",r"\bexplain\b",r"\banalyze\b",r"\banalyse\b",r"\breason\b",
r"\bprove\b",r"\bcompare\b",r"\bdifference\b",r"\badvantages?\b",r"\bthink\b"],
"planning": [r"\bplan\b",r"\bstrategy\b",r"\bsteps?\b",r"\broadmap\b",r"\bschedule\b",
r"\bprioritize\b",r"\bworkflow\b",r"\barchitecture\b",r"\bdesign\b"],
"embedding": [r"\bembed\b",r"\bvector\b",r"\bsimilarity\b",r"\bsemantic\b",r"\bencod"],
"simple": [], # fallthrough
}
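# Patterns are checked in dict insertion order; the first category with a match
# wins, so vision and code outrank the broader reasoning patterns.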
def classify_task(messages: list) -> str:
text = " ".join(
m.get("content","") if isinstance(m.get("content"), str)
else " ".join(c.get("text","") for c in m.get("content",[]) if isinstance(c,dict))
for m in messages
).lower()
# Vision: check for image content blocks
for m in messages:
if isinstance(m.get("content"), list):
for c in m["content"]:
if isinstance(c, dict) and c.get("type") == "image_url":
return "vision"
for task, patterns in TASK_PATTERNS.items():
if task == "simple": continue
for p in patterns:
if re.search(p, text): return task
return "simple"
def score_complexity(messages: list) -> int:
text = " ".join(
m.get("content","") if isinstance(m.get("content"), str) else ""
for m in messages
)
score = 1
words = len(text.split())
if words > 50: score += 1
if words > 150: score += 1
if words > 400: score += 2
# Multi-step indicators
if re.search(r"\bstep\s*\d|first.*then.*finally|\d+\.\s+", text.lower()): score += 1
# Technical density
tech_words = ["algorithm","optimization","architecture","implement","integrate",
"distributed","concurrent","neural","transformer","gradient","latency"]
hits = sum(1 for w in tech_words if w in text.lower())
score += min(hits, 3)
# Question count
score += min(text.count("?"), 2)
return min(score, 10)
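# Illustrative: a ~200-word prompt mentioning "architecture" and "implement"
# with one "?" scores 1 (base) + 2 (length) + 2 (tech terms) + 1 (question) = 6.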
# ── Provider health ───────────────────────────────────────────────
provider_health = {"ki_fusion": True, "hf_api": bool(HF_TOKEN), "local_cpu": True}
# Model id reported for ki_fusion. Taken from KF_MODEL; no runtime discovery is performed.
_kf_actual_model: str = KF_DEFAULT_MODEL
async def probe_ki_fusion() -> bool:
"""Test ki_fusion with a minimal chat completion — /v1/models may not be available.
Uses KF_MODEL env var as the model name (set it to your actual loaded model id).
"""
global _kf_actual_model
_kf_actual_model = KF_DEFAULT_MODEL # always use configured name, no discovery
try:
headers = {"Content-Type": "application/json"}
if KF_KEY: headers["Authorization"] = f"Bearer {KF_KEY}"
payload = {
"model": KF_DEFAULT_MODEL,
"messages": [{"role": "user", "content": "ping"}],
"max_tokens": 1,
"temperature": 0.0,
}
async with httpx.AsyncClient(timeout=httpx.Timeout(None, connect=6.0, read=15.0), verify=False) as c:
r = await c.post(f"{KF_BASE}/chat/completions/", headers=headers, json=payload)
if r.status_code < 400:
log.info(f"[NEXUS] ki_fusion online ✓ model={_kf_actual_model} url={KF_BASE}")
provider_health["ki_fusion"] = True
return True
else:
log.warning(f"[NEXUS] ki_fusion probe HTTP {r.status_code}: {r.text[:120]}")
provider_health["ki_fusion"] = False
return False
except Exception as e:
log.warning(f"[NEXUS] ki_fusion probe failed: {e}")
provider_health["ki_fusion"] = False
return False
async def ki_fusion_watchdog():
"""Background task: probe ki_fusion every 30s to auto-recover after outages."""
while True:
await asyncio.sleep(30)
was_ok = provider_health["ki_fusion"]
now_ok = await probe_ki_fusion()
if not was_ok and now_ok:
log.info("[NEXUS] ki_fusion recovered — back online")
elif was_ok and not now_ok:
log.warning("[NEXUS] ki_fusion went offline")
# ── Local CPU model (lazy) ────────────────────────────────────────
_local_pipe = None
_local_loading = False
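# Note: the _local_loading flag is not lock-protected, so two concurrent first
# requests could race. A duplicate pipeline load is wasteful but not incorrect.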
def get_local_pipe():
global _local_pipe, _local_loading
if _local_pipe is not None:
return _local_pipe
if _local_loading:
return None
_local_loading = True
try:
from transformers import pipeline
log.info(f"Loading local model {LOCAL_MODEL_ID} on CPU...")
_local_pipe = pipeline(
"text-generation",
model=LOCAL_MODEL_ID,
device="cpu",
torch_dtype="auto",
max_new_tokens=512,
)
log.info("Local model loaded.")
except Exception as e:
log.warning(f"Local model load failed: {e}")
_local_pipe = None
provider_health["local_cpu"] = False
finally:
_local_loading = False
return _local_pipe
# Pre-warm in background
async def warm_local():
await asyncio.sleep(5)
    loop = asyncio.get_running_loop()
await loop.run_in_executor(None, get_local_pipe)
# ── Router ────────────────────────────────────────────────────────
def select_provider_and_model(task: str, complexity: int, cost_mode: str,
force_provider: str = "") -> tuple[str,str,str]:
"""Returns (provider, model, reason)"""
# Explicit override
if force_provider and provider_health.get(force_provider, False):
model = _kf_actual_model if force_provider=="ki_fusion" else HF_MODELS.get(task, HF_MODELS["simple"])
if force_provider == "local_cpu": model = LOCAL_MODEL_ID
return force_provider, model, f"explicit override to {force_provider}"
# Vision always -> HF (vision models)
if task == "vision":
if provider_health["hf_api"]:
return "hf_api", HF_MODELS["vision"], "vision task -> HF Qwen2.5-VL"
if provider_health["ki_fusion"]:
return "ki_fusion", _kf_actual_model, "vision fallback -> ki_fusion"
# Embedding -> HF
if task == "embedding":
if provider_health["hf_api"]:
return "hf_api", "sentence-transformers/all-MiniLM-L6-v2", "embedding -> HF sentence-transformers"
# Cost mode: cheap -> prefer HF or local
if cost_mode == "cheap":
if task == "simple" and complexity <= 4:
if provider_health["hf_api"]:
return "hf_api", HF_MODELS["fast"], f"cheap+simple(c={complexity}) -> HF fast"
if provider_health["local_cpu"]:
return "local_cpu", LOCAL_MODEL_ID, f"cheap+simple -> local CPU"
# Cost mode: best -> ki_fusion first (your own GPU)
if cost_mode == "best":
if provider_health["ki_fusion"]:
return "ki_fusion", _kf_actual_model, f"best mode -> ki_fusion (LM Studio)"
# Balanced routing by task + complexity
if task in ("planning",) and complexity >= 6:
if provider_health["ki_fusion"]:
return "ki_fusion", _kf_actual_model, f"planning+complex(c={complexity}) -> ki_fusion"
if task == "code":
if provider_health["ki_fusion"]:
return "ki_fusion", _kf_actual_model, f"code task -> ki_fusion (LM Studio)"
if provider_health["hf_api"]:
return "hf_api", HF_MODELS["code"], "code -> HF Qwen2.5-Coder"
if task == "reasoning" and complexity >= 7:
if provider_health["ki_fusion"]:
return "ki_fusion", _kf_actual_model, f"hard reasoning(c={complexity}) -> ki_fusion"
# Default balanced: HF for most tasks (free tier, good quality)
if provider_health["hf_api"]:
hf_model = HF_MODELS.get(task, HF_MODELS["simple"])
return "hf_api", hf_model, f"{task}(c={complexity}) -> HF {hf_model.split('/')[-1]}"
# Fallback: ki_fusion
if provider_health["ki_fusion"]:
return "ki_fusion", _kf_actual_model, f"fallback -> ki_fusion"
# Last resort: local CPU
return "local_cpu", LOCAL_MODEL_ID, "last resort -> local CPU"
# ── Inference calls ───────────────────────────────────────────────
async def call_ki_fusion(messages, model, max_tokens=1024, temperature=0.7, stream=False):
headers = {"Content-Type":"application/json"}
if KF_KEY: headers["Authorization"] = f"Bearer {KF_KEY}"
payload = {"model": model, "messages": messages,
"max_tokens": max_tokens, "temperature": temperature, "stream": stream}
# verify=False: ki-fusion-labs.de SSL cert may be expired (self-hosted).
# Fast-fail connect: 6s tells us immediately if your server is off.
timeout = httpx.Timeout(None, connect=6.0, read=90.0, write=10.0, pool=5.0)
async with httpx.AsyncClient(timeout=timeout, verify=False) as client:
if stream:
async with client.stream("POST", f"{KF_BASE}/chat/completions/",
headers=headers, json=payload) as resp:
resp.raise_for_status()
async for chunk in resp.aiter_bytes():
yield chunk
else:
r = await client.post(f"{KF_BASE}/chat/completions/",
headers=headers, json=payload)
r.raise_for_status()
yield r.json()
async def call_hf_api(messages, model, max_tokens=1024, temperature=0.7, stream=False):
if not HF_TOKEN:
raise Exception("HF_TOKEN not set")
headers = {"Authorization": f"Bearer {HF_TOKEN}", "Content-Type":"application/json"}
# HF OpenAI-compatible endpoint
payload = {"model": model, "messages": messages,
"max_tokens": max_tokens, "temperature": temperature, "stream": stream}
url = f"{HF_OAI}/chat/completions"
async with httpx.AsyncClient(timeout=90) as client:
if stream:
async with client.stream("POST", url, headers=headers, json=payload) as resp:
resp.raise_for_status()
async for chunk in resp.aiter_bytes():
yield chunk
else:
r = await client.post(url, headers=headers, json=payload)
r.raise_for_status()
yield r.json()
async def call_local_cpu(messages, model, max_tokens=512, temperature=0.7, stream=False):
    loop = asyncio.get_running_loop()
    # If the model is still loading (_local_loading=True), wait up to 90s
    # instead of failing immediately; this is the guaranteed last-resort provider.
waited = 0
while _local_loading and waited < 90:
log.info(f"[local_cpu] Model still loading, waiting… ({waited}s)")
await asyncio.sleep(3)
waited += 3
# If not loaded yet, trigger a load attempt now (synchronously in thread)
if not _local_pipe and not _local_loading:
log.info("[local_cpu] Triggering model load now (first request)")
await loop.run_in_executor(None, get_local_pipe)
def _run():
pipe = get_local_pipe()
if not pipe:
raise Exception("Local model not available — transformers load failed. Check logs for OOM or missing dependencies.")
# Build prompt from messages
chat_messages = [{"role": m.get("role","user"),
"content": m.get("content","") if isinstance(m.get("content"), str) else ""}
for m in messages]
result = pipe(chat_messages, max_new_tokens=max_tokens, do_sample=temperature > 0,
temperature=max(temperature, 0.01), pad_token_id=pipe.tokenizer.eos_token_id)
if result and result[0]:
generated = result[0].get("generated_text", "")
if isinstance(generated, list):
# Chat format: last message is the new assistant response
last = generated[-1] if generated else {}
content = last.get("content","") if isinstance(last, dict) else str(last)
else:
content = str(generated)
# Strip prompt echo
prompt_text = " ".join(m.get("content","") for m in messages if isinstance(m.get("content"),str))
if content.startswith(prompt_text):
content = content[len(prompt_text):].strip()
return content
return ""
content = await loop.run_in_executor(None, _run)
response = {
"id": f"local-{uuid.uuid4().hex[:8]}",
"object": "chat.completion",
"created": int(time.time()),
"model": LOCAL_MODEL_ID,
"choices": [{"index":0,"message":{"role":"assistant","content":content},
"finish_reason":"stop"}],
"usage": {"prompt_tokens": 0, "completion_tokens": len(content.split()), "total_tokens": len(content.split())}
}
yield response
# ── Core route function ───────────────────────────────────────────
async def route_inference(messages: list, max_tokens: int = 1024, temperature: float = 0.7,
cost_mode: str = "balanced", force_provider: str = "",
force_model: str = "", stream: bool = False):
task = classify_task(messages)
complexity = score_complexity(messages)
provider, model, reason = select_provider_and_model(task, complexity, cost_mode, force_provider)
if force_model: model = force_model
t0 = time.time()
tokens = 0
ok = True
tried = []
providers_to_try = [provider]
# Build fallback chain: ki_fusion -> hf_api can be skipped if health=False,
# but local_cpu is ALWAYS added last — it's the guaranteed offline fallback.
for fb in ["ki_fusion", "hf_api"]:
if fb not in providers_to_try and provider_health.get(fb, True):
providers_to_try.append(fb)
# local_cpu: always last, always tried — never skip it
if "local_cpu" not in providers_to_try:
providers_to_try.append("local_cpu")
last_err = None
for p in providers_to_try:
tried.append(p)
try:
fb_model = model
if p == "ki_fusion": caller = call_ki_fusion
elif p == "hf_api": caller = call_hf_api; fb_model = HF_MODELS.get(task, HF_MODELS["simple"])
else: caller = call_local_cpu; fb_model = LOCAL_MODEL_ID
if p != provider:
reason += f" | fallback to {p}"
if stream:
async def _stream_gen():
async for chunk in caller(messages, fb_model, max_tokens, temperature, stream=True):
yield chunk
ms = int((time.time()-t0)*1000)
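                # Streamed requests are recorded as OK at stream start;
                # mid-stream failures are not reflected in stats.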
record(p, task, cost_mode, True, ms, 0, fb_model, reason)
return {
"_stream": True,
"_gen": _stream_gen(),
"_meta": {"provider":p,"model":fb_model,"task":task,
"complexity":complexity,"reason":reason}
}
result = None
async for r in caller(messages, fb_model, max_tokens, temperature, stream=False):
result = r
break
ms = int((time.time()-t0)*1000)
if isinstance(result, dict):
tokens = result.get("usage",{}).get("total_tokens", 0)
                result["_nexus"] = {"provider": p, "model": fb_model, "task": task,
                                    "complexity": complexity, "reason": reason,
                                    "latency_ms": ms, "fallback_chain": tried}
record(p, task, cost_mode, True, ms, tokens, fb_model, reason)
return result
except Exception as e:
last_err = str(e)
log.error(f"[NEXUS] Provider '{p}' FAILED: {last_err}")
# Mark unhealthy — watchdog will re-probe every 30s and restore when live again
if p != "local_cpu":
provider_health[p] = False
ok = False
ms = int((time.time()-t0)*1000)
record(tried[-1] if tried else "none", task, cost_mode, False, ms, 0, model, reason)
raise HTTPException(503, f"All providers failed. Last error: {last_err}")
# ── FastAPI ───────────────────────────────────────────────────────
app = FastAPI(title="NEXUS Model Router")
@app.on_event("startup")
async def startup():
asyncio.create_task(warm_local())
    # Probe ki_fusion immediately to verify reachability and set health state
asyncio.create_task(probe_ki_fusion())
# Keep probing every 30s so recovery after outage is automatic
asyncio.create_task(ki_fusion_watchdog())
def jresp(data, status=200): return JSONResponse(content=data, status_code=status)
# ── OpenAI-compatible endpoints ───────────────────────────────────
@app.post("/v1/chat/completions")
async def oai_chat(request: Request):
body = await request.json()
messages = body.get("messages", [])
max_tokens = body.get("max_tokens", 1024)
temperature = body.get("temperature", 0.7)
stream = body.get("stream", False)
cost_mode = body.get("cost_mode", "balanced") # nexus extension
force_prov = body.get("provider", "") # nexus extension
force_model = body.get("model", "")
# Detect if model is actually a provider name
if force_model in ("ki_fusion","hf_api","local_cpu"):
force_prov = force_model; force_model = ""
result = await route_inference(messages, max_tokens, temperature,
cost_mode, force_prov, force_model, stream)
if isinstance(result, dict) and result.get("_stream"):
return StreamingResponse(result["_gen"], media_type="text/event-stream",
headers={"Cache-Control":"no-cache","X-Accel-Buffering":"no",
"X-Nexus-Provider": result["_meta"]["provider"],
"X-Nexus-Task": result["_meta"]["task"]})
    # Add routing-transparency headers.
    # HTTP headers must be latin-1; defensively replace any non-latin-1 characters
    # (e.g. arrow glyphs) that may appear in reason strings.
    def h(v): return str(v).encode('latin-1', errors='replace').decode('latin-1')
return JSONResponse(content=result, headers={
"X-Nexus-Provider": h(result.get("_nexus",{}).get("provider","")),
"X-Nexus-Task": h(result.get("_nexus",{}).get("task","")),
"X-Nexus-Reason": h(result.get("_nexus",{}).get("reason","")[:120]),
})
@app.get("/v1/models")
async def oai_models():
models = [
{"id":"nexus-auto","object":"model","owned_by":"nexus","description":"Auto-routed"},
{"id":"nexus-cheap","object":"model","owned_by":"nexus","description":"Cost-optimized routing"},
{"id":"nexus-best","object":"model","owned_by":"nexus","description":"Best-quality routing"},
{"id":"ki_fusion","object":"model","owned_by":"ki-fusion-labs","description":f"Primary LM Studio ({_kf_actual_model})"},
{"id":_kf_actual_model,"object":"model","owned_by":"ki-fusion-labs","description":"Ki-Fusion loaded model"},
{"id":"hf_api","object":"model","owned_by":"huggingface","description":"HF Serverless Inference"},
{"id":"local_cpu","object":"model","owned_by":"local","description":f"Local CPU: {LOCAL_MODEL_ID}"},
]
for k,v in HF_MODELS.items():
models.append({"id":v,"object":"model","owned_by":"huggingface","description":f"HF {k} model"})
return jresp({"object":"list","data":models})
# ── Nexus-specific API ────────────────────────────────────────────
@app.post("/api/route")
async def api_route(request: Request):
"""Route with full metadata returned"""
body = await request.json()
messages = body.get("messages",[{"role":"user","content":body.get("prompt","")}])
result = await route_inference(
messages,
body.get("max_tokens",512),
body.get("temperature",0.7),
body.get("cost_mode","balanced"),
body.get("provider",""),
)
return jresp(result)
@app.post("/api/classify")
async def api_classify(request: Request):
body = await request.json()
messages = body.get("messages",[{"role":"user","content":body.get("prompt","")}])
task = classify_task(messages)
complexity = score_complexity(messages)
provider, model, reason = select_provider_and_model(task, complexity,
body.get("cost_mode","balanced"))
return jresp({"task":task,"complexity":complexity,
"selected_provider":provider,"selected_model":model,"reason":reason})
@app.get("/api/stats")
async def api_stats():
    s = json.loads(json.dumps(STATS))  # deep copy: the derived fields below must not leak into persisted stats
# Compute avg latencies
for p, d in s["by_provider"].items():
total = d["ok"] + d["fail"]
d["total"] = total
d["success_rate"] = round(d["ok"]/total*100,1) if total else 0
d["avg_ms"] = round(d["total_ms"]/d["ok"],0) if d["ok"] else 0
return jresp(s)
@app.get("/api/health")
async def api_health():
checks = {}
# ki_fusion: ping via chat/completions (v1/models may not be available on PHP proxy)
try:
headers = {"Content-Type": "application/json"}
if KF_KEY: headers["Authorization"] = f"Bearer {KF_KEY}"
payload = {"model": _kf_actual_model, "messages": [{"role":"user","content":"ping"}],
"max_tokens": 1, "temperature": 0.0}
async with httpx.AsyncClient(timeout=httpx.Timeout(None, connect=6.0, read=8.0), verify=False) as c:
            r = await c.post(f"{KF_BASE}/chat/completions/", headers=headers, json=payload)
checks["ki_fusion"] = {
"ok": r.status_code < 400,
"http_status": r.status_code,
"model": _kf_actual_model,
"url": KF_BASE,
"error": r.text[:120] if r.status_code >= 400 else None,
}
provider_health["ki_fusion"] = r.status_code < 400
except Exception as e:
checks["ki_fusion"] = {"ok": False, "error": str(e)[:120], "model": _kf_actual_model}
provider_health["ki_fusion"] = False
# HF
checks["hf_api"] = {"ok": bool(HF_TOKEN), "status": "token configured" if HF_TOKEN else "HF_TOKEN not set"}
# Local
checks["local_cpu"] = {"ok": _local_pipe is not None,
"status": "loaded" if _local_pipe else ("loading" if _local_loading else "not loaded")}
return jresp(checks)
@app.post("/api/providers/{provider}/toggle")
async def toggle_provider(provider: str, request: Request):
if provider not in provider_health:
raise HTTPException(404)
body = await request.json()
provider_health[provider] = body.get("enabled", not provider_health[provider])
return jresp({"provider":provider,"enabled":provider_health[provider]})
# ── MCP ───────────────────────────────────────────────────────────
MCP_TOOLS = [
{"name":"nexus_chat","description":"Send a chat completion through the NEXUS router. Auto-selects best provider.",
"inputSchema":{"type":"object","required":["messages"],"properties":{
"messages": {"type":"array","items":{"type":"object"}},
"max_tokens": {"type":"integer","default":1024},
"temperature":{"type":"number","default":0.7},
"cost_mode": {"type":"string","enum":["cheap","balanced","best"],"default":"balanced"},
"provider": {"type":"string","enum":["","ki_fusion","hf_api","local_cpu"]},
}}},
{"name":"nexus_route_info","description":"Predict routing for a prompt without running inference.",
"inputSchema":{"type":"object","required":["prompt"],"properties":{
"prompt": {"type":"string"},
"cost_mode": {"type":"string","default":"balanced"},
}}},
{"name":"nexus_stats","description":"Get routing statistics and provider performance.",
"inputSchema":{"type":"object","properties":{}}},
{"name":"nexus_models","description":"List all available models and providers.",
"inputSchema":{"type":"object","properties":{}}},
{"name":"nexus_health","description":"Check provider health and availability.",
"inputSchema":{"type":"object","properties":{}}},
]
async def mcp_call(name, args):
if name == "nexus_chat":
result = await route_inference(
args["messages"], args.get("max_tokens",1024),
args.get("temperature",0.7), args.get("cost_mode","balanced"),
args.get("provider",""))
return json.dumps(result)
if name == "nexus_route_info":
msgs = [{"role":"user","content":args["prompt"]}]
task = classify_task(msgs); comp = score_complexity(msgs)
p, m, r = select_provider_and_model(task, comp, args.get("cost_mode","balanced"))
return json.dumps({"task":task,"complexity":comp,"provider":p,"model":m,"reason":r})
if name == "nexus_stats":
return json.dumps(STATS)
if name == "nexus_models":
return json.dumps({"hf_models":HF_MODELS,"local_model":LOCAL_MODEL_ID,
"ki_fusion_model":_kf_actual_model})
if name == "nexus_health":
return json.dumps(provider_health)
return json.dumps({"error":f"unknown: {name}"})
@app.get("/mcp/sse")
async def mcp_sse():
async def stream():
init = {"jsonrpc":"2.0","method":"notifications/initialized",
"params":{"serverInfo":{"name":"nexus-router","version":"1.0"},
"capabilities":{"tools":{}}}}
yield f"data: {json.dumps(init)}\n\n"
await asyncio.sleep(0.1)
yield f"data: {json.dumps({'jsonrpc':'2.0','method':'notifications/tools/list_changed','params':{}})}\n\n"
while True:
await asyncio.sleep(25)
yield f"data: {json.dumps({'jsonrpc':'2.0','method':'ping'})}\n\n"
return StreamingResponse(stream(), media_type="text/event-stream",
headers={"Cache-Control":"no-cache","X-Accel-Buffering":"no"})
@app.post("/mcp")
async def mcp_rpc(request: Request):
body = await request.json()
method = body.get("method",""); rid = body.get("id",1)
if method == "initialize":
return jresp({"jsonrpc":"2.0","id":rid,"result":{
"serverInfo":{"name":"nexus-router","version":"1.0"},"capabilities":{"tools":{}}}})
if method == "tools/list":
return jresp({"jsonrpc":"2.0","id":rid,"result":{"tools":MCP_TOOLS}})
if method == "tools/call":
p = body.get("params",{}); res = await mcp_call(p.get("name",""), p.get("arguments",{}))
return jresp({"jsonrpc":"2.0","id":rid,"result":{"content":[{"type":"text","text":res}]}})
return jresp({"jsonrpc":"2.0","id":rid,"error":{"code":-32601,"message":"not found"}})
# ── SPA ───────────────────────────────────────────────────────────
@app.get("/", response_class=HTMLResponse)
async def ui():
return HTMLResponse(content=SPA, media_type="text/html; charset=utf-8")
SPA = """<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<title>NEXUS &mdash; Model Router</title>
<link rel="preconnect" href="https://fonts.googleapis.com">
<link href="https://fonts.googleapis.com/css2?family=Space+Mono:wght@400;700&display=swap" rel="stylesheet">
<style>
:root{
--bg:#08080f;--s1:#0f0f1a;--s2:#141422;--bd:#1a1a2e;--bd2:#20203a;
--acc:#ff6b00;--acc2:#ff9500;--txt:#d8d8f0;--sub:#4a4a70;--dim:#1e1e38;
--p1:#0ea5e9;--p2:#7c3aed;--p3:#2ed573;
--cr:#ff2244;--lo:#2ed573;--font:'Space Mono',monospace;
--cheap:#2ed573;--balanced:#0ea5e9;--best:#ff9500;
--simple:#5a5a80;--reasoning:#7c3aed;--planning:#ff6b00;
--code:#0ea5e9;--vision:#ff6b9d;--embedding:#2ed573;
}
*{box-sizing:border-box;margin:0;padding:0;}
html,body{height:100%;overflow:hidden;}
body{font-family:var(--font);background:var(--bg);color:var(--txt);
display:flex;flex-direction:column;height:100vh;}
body::after{content:'';position:fixed;inset:0;pointer-events:none;
background:repeating-linear-gradient(0deg,transparent,transparent 3px,rgba(255,107,0,.005) 3px,rgba(255,107,0,.005) 4px);}
/* HEADER */
#hdr{flex-shrink:0;display:flex;align-items:center;padding:.75rem 1.6rem;gap:1rem;
border-bottom:1px solid var(--bd);background:linear-gradient(180deg,#0d0d1a,var(--bg));z-index:10;}
#logo{font-size:1.25rem;font-weight:700;letter-spacing:2px;
background:linear-gradient(90deg,var(--acc),var(--p1));
-webkit-background-clip:text;-webkit-text-fill-color:transparent;background-clip:text;}
#logo-sub{font-size:.5rem;color:var(--sub);letter-spacing:.26em;text-transform:uppercase;margin-top:2px;}
#hdr-stats{display:flex;gap:.45rem;flex:1;flex-wrap:wrap;}
.hs{display:flex;align-items:center;gap:.35rem;background:var(--s1);border:1px solid var(--bd);
border-radius:4px;padding:.22rem .5rem;font-size:.54rem;color:var(--sub);}
.hs-n{font-size:.85rem;font-weight:700;line-height:1;}
.provider-dot{width:8px;height:8px;border-radius:50%;flex-shrink:0;}
.pulse{animation:pulse 2s infinite;}
@keyframes pulse{0%,100%{opacity:1}50%{opacity:.25}}
/* MAIN LAYOUT */
#main{flex:1;display:flex;min-height:0;overflow:hidden;}
/* LEFT PANEL: playground */
#left{width:480px;flex-shrink:0;display:flex;flex-direction:column;
border-right:1px solid var(--bd);overflow:hidden;}
#left-hdr{flex-shrink:0;padding:.6rem 1rem;border-bottom:1px solid var(--bd);
background:var(--s1);display:flex;align-items:center;justify-content:space-between;}
#left-hdr-title{font-size:.62rem;font-weight:700;letter-spacing:.12em;color:var(--acc);}
#left-body{flex:1;overflow-y:auto;padding:.9rem 1rem;}
#left-body::-webkit-scrollbar{width:3px;}
#left-body::-webkit-scrollbar-thumb{background:var(--bd2);}
/* PLAYGROUND */
.pg-section{margin-bottom:.8rem;}
.pg-label{font-size:.5rem;color:var(--sub);text-transform:uppercase;letter-spacing:.12em;margin-bottom:.22rem;}
#pg-prompt{width:100%;background:var(--s2);border:1px solid var(--bd2);border-radius:6px;
padding:.55rem .7rem;font-family:var(--font);font-size:.7rem;color:var(--txt);outline:none;
min-height:80px;resize:vertical;line-height:1.55;transition:border-color .12s;}
#pg-prompt:focus{border-color:var(--acc);}
.ctrl-row{display:grid;grid-template-columns:1fr 1fr 1fr;gap:.45rem;}
.ctl{display:flex;flex-direction:column;gap:.18rem;}
.ctl label{font-size:.48rem;color:var(--sub);text-transform:uppercase;letter-spacing:.1em;}
.ctl select,.ctl input{background:var(--s2);border:1px solid var(--bd2);border-radius:4px;
padding:.32rem .5rem;font-family:var(--font);font-size:.65rem;color:var(--txt);outline:none;
transition:border-color .12s;}
.ctl select:focus,.ctl input:focus{border-color:var(--acc);}
.ctl select option{background:var(--s2);}
#btn-send{width:100%;background:var(--acc);color:#000;border:none;padding:.52rem;
font-family:var(--font);font-size:.7rem;font-weight:700;letter-spacing:.1em;
text-transform:uppercase;border-radius:5px;cursor:pointer;transition:background .1s,transform .08s;
margin-top:.35rem;}
#btn-send:hover{background:var(--acc2);transform:translateY(-1px);}
#btn-send:disabled{background:var(--dim);color:var(--sub);cursor:not-allowed;transform:none;}
#btn-classify{width:100%;background:var(--s2);color:var(--sub);border:1px dashed var(--bd2);
padding:.38rem;font-family:var(--font);font-size:.62rem;letter-spacing:.1em;text-transform:uppercase;
border-radius:5px;cursor:pointer;transition:all .1s;margin-top:.25rem;}
#btn-classify:hover{border-color:var(--p2);color:var(--p2);}
/* ROUTING PREVIEW */
#route-preview{background:var(--s1);border:1px solid var(--bd);border-radius:7px;
padding:.65rem .8rem;margin-top:.6rem;display:none;}
#route-preview.show{display:block;}
.rp-row{display:flex;align-items:center;gap:.4rem;margin-bottom:.3rem;flex-wrap:wrap;}
.rp-badge{font-size:.5rem;padding:2px 7px;border-radius:3px;font-weight:700;text-transform:uppercase;letter-spacing:.08em;}
.rp-arrow{font-size:.65rem;color:var(--sub);}
.rp-reason{font-size:.56rem;color:var(--sub);margin-top:.35rem;line-height:1.5;}
/* RESPONSE */
#response-box{background:var(--s1);border:1px solid var(--bd);border-radius:7px;
padding:.75rem .9rem;margin-top:.65rem;display:none;}
#response-box.show{display:block;}
#response-meta{display:flex;gap:.35rem;align-items:center;flex-wrap:wrap;margin-bottom:.55rem;
padding-bottom:.45rem;border-bottom:1px solid var(--bd);}
.r-badge{font-size:.48rem;padding:2px 7px;border-radius:3px;font-weight:700;}
#response-text{font-size:.7rem;color:var(--txt);line-height:1.68;white-space:pre-wrap;
max-height:260px;overflow-y:auto;word-break:break-word;}
#response-text::-webkit-scrollbar{width:3px;}
#response-text::-webkit-scrollbar-thumb{background:var(--bd2);}
/* RIGHT PANEL */
#right{flex:1;display:flex;flex-direction:column;overflow:hidden;}
#tabs{flex-shrink:0;display:flex;border-bottom:1px solid var(--bd);background:var(--s1);}
.tab{padding:.52rem 1.1rem;font-size:.6rem;font-weight:700;letter-spacing:.1em;
cursor:pointer;color:var(--sub);border-bottom:2px solid transparent;transition:all .12s;}
.tab.on{color:var(--acc);border-bottom-color:var(--acc);}
/* PANELS */
.panel{flex:1;overflow-y:auto;padding:1.1rem 1.4rem;display:none;}
.panel::-webkit-scrollbar{width:4px;}
.panel::-webkit-scrollbar-thumb{background:var(--bd2);}
.panel.on{display:block;}
/* ROUTING LOG */
.log-entry{background:var(--s1);border:1px solid var(--bd);border-radius:7px;
padding:.6rem .75rem;margin-bottom:.4rem;animation:cin .15s ease;}
@keyframes cin{from{opacity:0;transform:translateY(3px)}to{opacity:1;transform:none}}
.le-top{display:flex;align-items:center;gap:.38rem;margin-bottom:.3rem;flex-wrap:wrap;}
.le-id{font-size:.48rem;color:var(--sub);font-family:monospace;}
.le-provider{font-size:.5rem;padding:1px 6px;border-radius:3px;font-weight:700;text-transform:uppercase;}
.le-task{font-size:.5rem;padding:1px 6px;border-radius:3px;font-weight:700;}
.le-status{font-size:.5rem;padding:1px 5px;border-radius:3px;}
.le-ok{background:#02130a;color:var(--lo);border:1px solid rgba(46,213,115,.2);}
.le-fail{background:#1a0308;color:var(--cr);border:1px solid rgba(255,34,68,.2);}
.le-ms{font-size:.52rem;color:var(--sub);margin-left:auto;}
.le-reason{font-size:.57rem;color:var(--sub);line-height:1.45;}
.le-model{font-size:.5rem;color:var(--dim);margin-top:.2rem;font-family:monospace;}
/* STATS GRID */
.stats-grid{display:grid;grid-template-columns:repeat(3,1fr);gap:.7rem;margin-bottom:1rem;}
.stat-card{background:var(--s1);border:1px solid var(--bd);border-radius:8px;padding:.75rem .9rem;}
.sc-title{font-size:.5rem;color:var(--sub);text-transform:uppercase;letter-spacing:.12em;margin-bottom:.45rem;
display:flex;align-items:center;gap:.4rem;}
.sc-n{font-size:1.4rem;font-weight:700;line-height:1;color:var(--txt);}
.sc-sub{font-size:.52rem;color:var(--sub);margin-top:.2rem;}
.bar-container{margin-top:.5rem;}
.bar-row{display:flex;align-items:center;gap:.4rem;margin-bottom:.28rem;}
.bar-label{font-size:.5rem;color:var(--sub);width:70px;flex-shrink:0;text-transform:capitalize;}
.bar-track{flex:1;height:5px;background:var(--bd2);border-radius:3px;overflow:hidden;}
.bar-fill{height:100%;border-radius:3px;transition:width .4s;}
.bar-val{font-size:.5rem;color:var(--sub);min-width:28px;text-align:right;}
/* PROVIDER CARDS */
.provider-grid{display:grid;grid-template-columns:1fr 1fr 1fr;gap:.7rem;margin-bottom:1rem;}
.pc{background:var(--s1);border:1px solid var(--bd);border-radius:8px;padding:.9rem 1rem;
position:relative;overflow:hidden;}
.pc::before{content:'';position:absolute;top:0;left:0;right:0;height:2px;}
.pc.ki::before{background:var(--p1);}.pc.hf::before{background:var(--p2);}.pc.lc::before{background:var(--p3);}
.pc-name{font-size:.75rem;font-weight:700;color:var(--txt);margin-bottom:.18rem;}
.pc-sub{font-size:.52rem;color:var(--sub);margin-bottom:.5rem;}
.pc-status{display:flex;align-items:center;gap:.3rem;margin-bottom:.4rem;}
.pc-dot{width:7px;height:7px;border-radius:50%;}
.pc-active{color:var(--lo);font-size:.55rem;}.pc-inactive{color:var(--cr);font-size:.55rem;}
.pc-stats{font-size:.54rem;color:var(--sub);line-height:1.7;}
.pc-stat-n{color:var(--txt);}
.pc-toggle{width:100%;margin-top:.6rem;background:var(--s2);border:1px solid var(--bd2);
color:var(--sub);padding:.3rem;font-family:var(--font);font-size:.58rem;
border-radius:4px;cursor:pointer;transition:all .1s;}
.pc-toggle:hover{border-color:var(--acc);color:var(--acc);}
/* MODELS TABLE */
.models-table{width:100%;border-collapse:collapse;font-size:.6rem;}
.models-table th{text-align:left;padding:.4rem .6rem;font-size:.5rem;color:var(--sub);
text-transform:uppercase;letter-spacing:.1em;border-bottom:1px solid var(--bd);}
.models-table td{padding:.38rem .6rem;border-bottom:1px solid var(--bd);vertical-align:top;}
.models-table tr:hover td{background:var(--s1);}
.m-provider{font-size:.48rem;padding:1px 5px;border-radius:3px;font-weight:700;}
/* TOASTS */
#toasts{position:fixed;bottom:1rem;right:1rem;z-index:200;display:flex;flex-direction:column;gap:.35rem;}
.tst{background:var(--s1);border:1px solid var(--bd2);border-left:3px solid var(--acc);
padding:.4rem .75rem;font-size:.6rem;border-radius:5px;animation:tin .15s ease;color:var(--txt);}
.tst.err{border-left-color:var(--cr);}.tst.ok{border-left-color:var(--lo);}
@keyframes tin{from{opacity:0;transform:translateX(12px)}to{opacity:1;transform:none}}
#mcp-hint{position:fixed;bottom:1rem;left:1rem;z-index:10;background:var(--s1);
border:1px solid var(--bd2);border-left:3px solid var(--p1);border-radius:5px;
padding:.38rem .75rem;font-size:.52rem;color:var(--sub);}
#mcp-hint code{color:var(--p1);}
</style>
</head>
<body>
<div id="hdr">
<div>
<div id="logo">NEXUS</div>
<div id="logo-sub">Model Router &amp; Inference Gateway &middot; ki-fusion-labs.de</div>
</div>
<div id="hdr-stats">
<div class="hs"><span class="hs-n" id="hs-total" style="color:var(--txt)">0</span>REQUESTS</div>
<div class="hs"><span class="provider-dot pulse" style="background:var(--p1)"></span><span class="hs-n" id="hs-kf" style="color:var(--p1)">?</span>KI-FUSION</div>
<div class="hs"><span class="provider-dot" id="dot-hf" style="background:var(--sub)"></span><span class="hs-n" id="hs-hf" style="color:var(--p2)">?</span>HF API</div>
<div class="hs"><span class="provider-dot" id="dot-lc" style="background:var(--sub)"></span><span class="hs-n" id="hs-lc" style="color:var(--p3)">?</span>LOCAL</div>
</div>
</div>
<div id="main">
<!-- LEFT: Playground -->
<div id="left">
<div id="left-hdr">
<span id="left-hdr-title">INFERENCE PLAYGROUND</span>
<span style="font-size:.52rem;color:var(--sub)">OpenAI-compatible</span>
</div>
<div id="left-body">
<div class="pg-section">
<div class="pg-label">Prompt</div>
<textarea id="pg-prompt" placeholder="Enter your prompt... The router will automatically classify it and select the best model."></textarea>
</div>
<div class="ctrl-row">
<div class="ctl">
<label>Cost Mode</label>
<select id="pg-cost">
<option value="cheap">Cheap</option>
<option value="balanced" selected>Balanced</option>
<option value="best">Best</option>
</select>
</div>
<div class="ctl">
<label>Force Provider</label>
<select id="pg-prov">
<option value="">Auto-route</option>
<option value="ki_fusion">ki_fusion</option>
<option value="hf_api">hf_api</option>
<option value="local_cpu">local_cpu</option>
</select>
</div>
<div class="ctl">
<label>Max Tokens</label>
<input type="number" id="pg-tokens" value="512" min="64" max="4096" step="64">
</div>
</div>
<button id="btn-classify">&#128202; Classify Only (no inference)</button>
<button id="btn-send">&#9889; Route &amp; Infer</button>
<div id="route-preview">
<div class="rp-row" id="rp-badges"></div>
<div class="rp-reason" id="rp-reason"></div>
</div>
<div id="response-box">
<div id="response-meta"></div>
<div id="response-text"></div>
</div>
</div>
</div>
<!-- RIGHT: Panels -->
<div id="right">
<div id="tabs">
<div class="tab on" id="tab-log">ROUTING LOG</div>
<div class="tab" id="tab-providers">PROVIDERS</div>
<div class="tab" id="tab-stats">STATS</div>
<div class="tab" id="tab-models">MODELS</div>
</div>
<div class="panel on" id="panel-log"></div>
<div class="panel" id="panel-providers"></div>
<div class="panel" id="panel-stats"></div>
<div class="panel" id="panel-models"></div>
</div>
</div>
<div id="toasts"></div>
<div id="mcp-hint">MCP: <code>nexus_chat</code> &nbsp;|&nbsp; <code>POST /v1/chat/completions</code> &nbsp;|&nbsp; <code>GET /mcp/sse</code></div>
<script>
var STATS_DATA={};
var PROVIDER_COLORS={ki_fusion:'#0ea5e9',hf_api:'#7c3aed',local_cpu:'#2ed573'};
var TASK_COLORS={simple:'#5a5a80',reasoning:'#7c3aed',planning:'#ff6b00',
code:'#0ea5e9',vision:'#ff6b9d',embedding:'#2ed573',unknown:'#3a3a60'};
var COST_COLORS={cheap:'#2ed573',balanced:'#0ea5e9',best:'#ff9500'};
function esc(s){return String(s||'').replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;');}
function toast(msg,type){
var el=document.createElement('div');el.className='tst'+(type?' '+type:'');
el.textContent=msg;document.getElementById('toasts').appendChild(el);
setTimeout(function(){el.remove();},2800);
}
// Tabs
function showTab(t){
['log','providers','stats','models'].forEach(function(x){
document.getElementById('tab-'+x).className='tab'+(x==t?' on':'');
document.getElementById('panel-'+x).className='panel'+(x==t?' on':'');
});
if(t=='providers') renderProviders();
if(t=='stats') renderStats();
if(t=='models') renderModels();
}
document.getElementById('tab-log').addEventListener('click',function(){showTab('log');});
document.getElementById('tab-providers').addEventListener('click',function(){showTab('providers');});
document.getElementById('tab-stats').addEventListener('click',function(){showTab('stats');});
document.getElementById('tab-models').addEventListener('click',function(){showTab('models');});
// Classify
document.getElementById('btn-classify').addEventListener('click',function(){
var prompt=document.getElementById('pg-prompt').value.trim();
if(!prompt){toast('Enter a prompt','err');return;}
var cost=document.getElementById('pg-cost').value;
fetch('/api/classify',{method:'POST',headers:{'Content-Type':'application/json'},
body:JSON.stringify({prompt:prompt,cost_mode:cost})})
.then(function(r){return r.json();}).then(function(d){
showRoutePreview(d);
toast('Classified: '+d.task+' (complexity '+d.complexity+')');
}).catch(function(){toast('Error','err');});
});
function showRoutePreview(d){
var prev=document.getElementById('route-preview');
prev.classList.add('show');
var tc=TASK_COLORS[d.task]||'#aaa';
var pc=PROVIDER_COLORS[d.selected_provider||d.provider]||'#aaa';
var cc=COST_COLORS[d.cost_mode]||'#aaa';
var badges=document.getElementById('rp-badges');
badges.innerHTML=
'<span class="rp-badge" style="background:'+tc+'18;color:'+tc+';border:1px solid '+tc+'33">'+esc(d.task)+'</span>'
+'<span class="rp-arrow">&#8594;</span>'
+'<span class="rp-badge" style="font-size:.5rem;background:var(--s2);color:var(--sub)">complexity '+esc(String(d.complexity||d.complexity_score||'?'))+'/10</span>'
+'<span class="rp-arrow">&#8594;</span>'
+'<span class="rp-badge" style="background:'+pc+'18;color:'+pc+';border:1px solid '+pc+'33">'
+esc(d.selected_provider||d.provider)+'</span>';
document.getElementById('rp-reason').textContent=d.reason||'';
}
// Infer
document.getElementById('btn-send').addEventListener('click',function(){
var prompt=document.getElementById('pg-prompt').value.trim();
if(!prompt){toast('Enter a prompt','err');return;}
var cost=document.getElementById('pg-cost').value;
var prov=document.getElementById('pg-prov').value;
var tokens=parseInt(document.getElementById('pg-tokens').value)||512;
var btn=document.getElementById('btn-send');
btn.disabled=true;btn.textContent='Routing...';
var t0=Date.now();
fetch('/v1/chat/completions',{method:'POST',headers:{'Content-Type':'application/json'},
body:JSON.stringify({
messages:[{role:'user',content:prompt}],
max_tokens:tokens,cost_mode:cost,
provider:prov||undefined
})})
.then(function(r){
var provider=r.headers.get('X-Nexus-Provider')||'';
var task=r.headers.get('X-Nexus-Task')||'';
var reason=r.headers.get('X-Nexus-Reason')||'';
return r.json().then(function(d){return {data:d,provider:provider,task:task,reason:reason};});
})
.then(function(obj){
var d=obj.data; var ms=Date.now()-t0;
var nx=d._nexus||{provider:obj.provider,task:obj.task,reason:obj.reason,latency_ms:ms,model:'?'};
var content='';
if(d.choices&&d.choices[0]){
var msg=d.choices[0].message;
content=msg.content||msg.text||'';
}
showResponse(content,nx);
showRoutePreview({task:nx.task,complexity:nx.complexity||'?',
selected_provider:nx.provider,reason:nx.reason});
loadLog();loadHeaderStats();
})
.catch(function(e){toast('Error: '+e.message,'err');})
.finally(function(){btn.disabled=false;btn.innerHTML='&#9889; Route &amp; Infer';});
});
function showResponse(content,nx){
var box=document.getElementById('response-box');
box.classList.add('show');
var pc=PROVIDER_COLORS[nx.provider]||'#aaa';
var tc=TASK_COLORS[nx.task]||'#aaa';
var meta=document.getElementById('response-meta');
meta.innerHTML=
'<span class="r-badge" style="background:'+pc+'18;color:'+pc+';border:1px solid '+pc+'33">'+esc(nx.provider)+'</span>'
+'<span class="r-badge" style="background:'+tc+'18;color:'+tc+';border:1px solid '+tc+'33">'+esc(nx.task)+'</span>'
+'<span class="r-badge" style="background:var(--s2);color:var(--sub);font-size:.46rem">'+esc(String(nx.latency_ms||'?'))+'ms</span>'
+(nx.model?'<span class="r-badge" style="background:var(--dim);color:var(--sub);font-size:.44rem;font-family:monospace">'+esc((nx.model||'').split('/').pop())+'</span>':'');
document.getElementById('response-text').textContent=content;
}
// Routing Log
function loadLog(){
fetch('/api/stats').then(function(r){return r.json();}).then(function(s){
STATS_DATA=s;
renderLog(s.recent||[]);
loadHeaderStats(s);
}).catch(function(){});
}
function loadHeaderStats(s){
if(!s){fetch('/api/stats').then(function(r){return r.json();}).then(loadHeaderStats);return;}
document.getElementById('hs-total').textContent=s.total_requests||0;
var bp=s.by_provider||{};
document.getElementById('hs-kf').textContent=(bp.ki_fusion||{}).ok||0;
document.getElementById('hs-hf').textContent=(bp.hf_api||{}).ok||0;
document.getElementById('hs-lc').textContent=(bp.local_cpu||{}).ok||0;
// Health dots
fetch('/api/health').then(function(r){return r.json();}).then(function(h){
var hf=h.hf_api||{};
var lc=h.local_cpu||{};
document.getElementById('dot-hf').style.background=hf.ok?'var(--p2)':'var(--sub)';
document.getElementById('dot-lc').style.background=lc.ok?'var(--p3)':'var(--sub)';
if(hf.ok)document.getElementById('dot-hf').classList.add('pulse');
if(lc.ok)document.getElementById('dot-lc').classList.add('pulse');
}).catch(function(){});
}
function renderLog(entries){
var panel=document.getElementById('panel-log');
if(!entries.length){
panel.innerHTML='<div style="text-align:center;padding:2rem;font-size:.6rem;color:var(--sub)">No routing decisions yet<br><span style="font-size:.52rem;opacity:.5">Use the playground to send requests</span></div>';
return;
}
panel.innerHTML=entries.map(function(e){
var pc=PROVIDER_COLORS[e.provider]||'#aaa';
var tc=TASK_COLORS[e.task]||'#aaa';
var cc=COST_COLORS[e.cost_mode]||'#aaa';
return '<div class="log-entry">'
+'<div class="le-top">'
+'<span class="le-id">'+esc(e.id)+'</span>'
+'<span class="le-provider" style="background:'+pc+'18;color:'+pc+';border:1px solid '+pc+'33">'+esc(e.provider)+'</span>'
+'<span class="le-task" style="background:'+tc+'18;color:'+tc+';border:1px solid '+tc+'33">'+esc(e.task)+'</span>'
+'<span class="le-status '+(e.ok?'le-ok':'le-fail')+'">'+(e.ok?'OK':'FAIL')+'</span>'
+'<span class="le-ms">'+e.ms+'ms'+(e.tokens?' &middot; '+e.tokens+'tok':'')+'</span>'
+'</div>'
+'<div class="le-reason">'+esc(e.reason||'')+'</div>'
+'<div class="le-model">'+esc(e.model||'')+'</div>'
+'</div>';
}).join('');
}
function renderProviders(){
fetch('/api/health').then(function(r){return r.json();}).then(function(h){
var s=STATS_DATA.by_provider||{};
var panel=document.getElementById('panel-providers');
var providers=[
{key:'ki_fusion',cls:'ki',name:'KI-FUSION LABS',sub:'LM Studio via PHP proxy',
url:'ki-fusion-labs.de/v1',desc:'Your local RTX 5090 — highest quality, zero cost'},
{key:'hf_api',cls:'hf',name:'HUGGINGFACE API',sub:'Serverless Inference (HF_TOKEN)',
url:'api-inference.huggingface.co',desc:'Free tier with rate limits; Llama, Qwen, Mistral, Phi'},
{key:'local_cpu',cls:'lc',name:'LOCAL CPU',sub:'Qwen2.5-0.5B via transformers',
url:'in-process',desc:'Always-on fallback; slow but zero-latency network; no API key needed'},
];
panel.innerHTML='<div class="provider-grid">'+providers.map(function(p){
var health=h[p.key]||{};
var stat=s[p.key]||{ok:0,fail:0,avg_ms:0};
var ok=health.ok;
return '<div class="pc '+p.cls+'">'
+'<div class="pc-name">'+esc(p.name)+'</div>'
+'<div class="pc-sub">'+esc(p.sub)+'</div>'
+'<div class="pc-status"><div class="pc-dot '+(ok?'pulse':'')+'" style="background:'+(ok?PROVIDER_COLORS[p.key]:'var(--cr)')+'"></div>'
+'<span class="'+(ok?'pc-active':'pc-inactive')+'">'+(ok?'ACTIVE':'OFFLINE')+'</span></div>'
+'<div class="pc-stats">'
+'Requests: <span class="pc-stat-n">'+(stat.ok||0)+'</span> ok / <span style="color:var(--cr)">'+(stat.fail||0)+'</span> fail<br>'
+'Avg latency: <span class="pc-stat-n">'+(stat.avg_ms||'—')+'</span> ms<br>'
+'URL: <span style="font-size:.48rem;opacity:.5">'+esc(p.url)+'</span><br>'
+'<span style="font-size:.52rem;opacity:.6;line-height:1.5">'+esc(p.desc)+'</span>'
+'</div>'
+'<button class="pc-toggle" data-key="'+p.key+'">'+(ok?'Disable':'Enable')+'</button>'
+'</div>';
}).join('')+'</div>';
panel.querySelectorAll('.pc-toggle').forEach(function(btn){
btn.addEventListener('click',function(){
var key=this.getAttribute('data-key');
var h_entry=h[key]||{};
fetch('/api/providers/'+key+'/toggle',{method:'POST',
headers:{'Content-Type':'application/json'},
body:JSON.stringify({enabled:!h_entry.ok})})
.then(function(){renderProviders();toast('Provider updated','ok');})
.catch(function(){toast('Error','err');});
});
});
}).catch(function(){});
}
function renderStats(){
var s=STATS_DATA;
if(!s||!s.by_provider){loadLog();setTimeout(renderStats,500);return;}
var panel=document.getElementById('panel-stats');
var bp=s.by_provider||{};
var bt=s.by_task||{};
var bc=s.by_cost||{};
var total=s.total_requests||1;
function barRow(label,val,max,col){
var pct=max>0?Math.round(val/max*100):0;
return '<div class="bar-row"><span class="bar-label">'+esc(label)+'</span>'
+'<div class="bar-track"><div class="bar-fill" style="width:'+pct+'%;background:'+col+'"></div></div>'
+'<span class="bar-val">'+val+'</span></div>';
}
var maxTask=Math.max(1,...Object.values(bt));
var maxProv=Math.max(1,...Object.values(bp).map(function(d){return d.ok||0;}));
panel.innerHTML=
'<div class="stats-grid">'
+'<div class="stat-card"><div class="sc-title">Total Requests</div>'
+'<div class="sc-n">'+total+'</div></div>'
+'<div class="stat-card"><div class="sc-title">By Task</div>'
+'<div class="bar-container">'
+Object.entries(bt).filter(function(e){return e[1]>0;}).map(function(e){
return barRow(e[0],e[1],maxTask,TASK_COLORS[e[0]]||'#aaa');}).join('')
+'</div></div>'
+'<div class="stat-card"><div class="sc-title">By Cost Mode</div>'
+'<div class="bar-container">'
+Object.entries(bc).filter(function(e){return e[1]>0;}).map(function(e){
return barRow(e[0],e[1],total,COST_COLORS[e[0]]||'#aaa');}).join('')
+'</div></div>'
+'</div>'
+'<div class="stats-grid">'
+['ki_fusion','hf_api','local_cpu'].map(function(k){
var d=bp[k]||{ok:0,fail:0,avg_ms:0,tokens:0};
var pc=PROVIDER_COLORS[k]||'#aaa';
var total_p=((d.ok||0)+(d.fail||0))||1;
var sr=Math.round((d.ok||0)/total_p*100);
return '<div class="stat-card"><div class="sc-title" style="color:'+pc+'">'+k+'</div>'
+'<div class="sc-n" style="color:'+pc+'">'+(d.ok||0)+'</div>'
+'<div class="sc-sub">success rate: '+sr+'%</div>'
+'<div class="sc-sub">avg latency: '+(d.avg_ms||'—')+'ms</div>'
+'<div class="sc-sub">tokens: '+(d.tokens||0)+'</div>'
+'</div>';
}).join('')
+'</div>';
}
function renderModels(){
fetch('/v1/models').then(function(r){return r.json();}).then(function(d){
var models=d.data||[];
var panel=document.getElementById('panel-models');
panel.innerHTML='<table class="models-table"><thead><tr>'
+'<th>Model ID</th><th>Provider</th><th>Description</th></tr></thead><tbody>'
+models.map(function(m){
var owner=m.owned_by||'';
var col=owner=='ki-fusion-labs'?'var(--p1)':owner=='huggingface'?'var(--p2)':owner=='local'?'var(--p3)':'var(--sub)';
return '<tr><td style="font-family:monospace;font-size:.58rem">'+esc(m.id)+'</td>'
+'<td><span class="m-provider" style="background:'+col+'18;color:'+col+';border:1px solid '+col+'33">'+esc(m.owned_by||'')+'</span></td>'
+'<td style="font-size:.56rem;color:var(--sub)">'+esc(m.description||'')+'</td></tr>';
}).join('')
+'</tbody></table>';
}).catch(function(){toast('Error loading models','err');});
}
document.addEventListener('keydown',function(e){
if((e.ctrlKey||e.metaKey)&&e.key=='Enter'){
var active=document.activeElement;
if(active&&active.id=='pg-prompt') document.getElementById('btn-send').click();
}
});
loadLog();
setInterval(function(){loadLog();},8000);
</script>
</body>
</html>"""