# DevodG's picture
# deploy: Janus full system stabilization
# 24f95f0
"""
Unified model client for Janus.
Smart router: Gemini → Groq → OpenRouter → Cloudflare → Ollama.
All tiers use the OpenAI-compatible messages format.
Includes retry-with-backoff for 429 rate limits.
"""
import os
import json
import re
import logging
import time
import httpx
from typing import Any
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Base URL for OpenRouter; "/chat/completions" is appended per request.
OPENROUTER_BASE = "https://openrouter.ai/api/v1"
# FIXED: replaced dead/renamed model IDs (all were returning HTTP 400)
# Models are tried in list order by the direct OpenRouter fallback in call_model.
FREE_MODEL_LADDER = [
"deepseek/deepseek-r1:free",
"google/gemini-2.0-flash-thinking-exp:free",
"meta-llama/llama-3.3-70b-instruct:free",
"google/gemma-3-27b-it:free",
"nousresearch/hermes-3-llama-3.1-405b:free",
]
# Ollama base URL captured at import time.
# NOTE(review): the Ollama helpers visible in this file re-read OLLAMA_BASE_URL
# from the environment instead of using this constant — confirm whether
# anything else depends on it.
OLLAMA_BASE = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
# Timeout passed to httpx for OpenRouter requests.
TIMEOUT = 90
# Timeout passed to httpx for the Ollama chat-completion request.
OLLAMA_TIMEOUT = 30
# Number of extra attempts _call_with_retry makes per model after a 429.
MAX_RETRIES_PER_MODEL = 2
# Base for the exponential backoff wait: BASE_BACKOFF * 2**attempt, fed to time.sleep.
BASE_BACKOFF = 3
# Timeout for the short probe request that checks whether Ollama is up.
OLLAMA_REACHABILITY_TIMEOUT = 1.5
def _ollama_is_reachable() -> bool:
    """
    Probe the Ollama server's tags endpoint to see whether it is up.

    The base URL is read from OLLAMA_BASE_URL (default
    "http://localhost:11434") and may be given with or without a trailing
    "/api" segment and with or without trailing slashes.

    Returns:
        True when the probe gets any response with a status code below 500;
        False when the request fails for any reason or the server answers
        with a 5xx status.
    """
    # Strip trailing slashes first so "http://host:11434/" and
    # "http://host:11434/api/" do not produce "//api/tags" or defeat the
    # "/api" suffix check below.
    base = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434").rstrip("/")
    if base.endswith("/api"):
        probe_url = f"{base}/tags"
    else:
        probe_url = f"{base}/api/tags"
    try:
        with httpx.Client(timeout=OLLAMA_REACHABILITY_TIMEOUT) as client:
            response = client.get(probe_url)
    except Exception:
        # Deliberately broad: this is a best-effort probe, and any failure
        # (connection refused, timeout, malformed URL) means "not reachable".
        return False
    return response.status_code < 500
def _huggingface_call(messages: list[dict], **kwargs) -> str:
    """
    Forward a chat request to the project's HuggingFace client.

    The client is imported inside the function, so the import runs when the
    function is called rather than when this module is loaded.

    Args:
        messages: Chat messages passed through to ``hf_client.chat``.
        **kwargs: Extra keyword arguments passed through unchanged.

    Returns:
        Whatever ``hf_client.chat`` returns.
    """
    from app.agents.huggingface import hf_client

    reply = hf_client.chat(messages, **kwargs)
    return reply
def _openrouter_call(messages: list[dict], model: str, **kwargs) -> str:
    """
    Send a single chat-completion request to OpenRouter and return the text.

    Args:
        messages: Chat messages placed in the request body as "messages".
        model: OpenRouter model ID placed in the request body as "model".
        **kwargs: Extra request-body fields. They are merged last, so they
            can override the default "max_tokens" of 4096.

    Returns:
        The message content. When the message also carries a "reasoning"
        field, it is prepended inside a ``<think>...</think>`` block.

    Raises:
        ValueError: If OPENROUTER_API_KEY is not set, the response body has
            no choices, or the resulting content is empty.
        httpx.HTTPStatusError: If the response status is not a success
            (raised by ``raise_for_status``).
    """
    api_key = os.getenv("OPENROUTER_API_KEY", "")
    if not api_key:
        raise ValueError("OPENROUTER_API_KEY is not set")
    headers = {
        "Authorization": f"Bearer {api_key}",
        "HTTP-Referer": "https://huggingface.co/spaces/DevodG/Janus-backend",
        "X-Title": "Janus",
        "Content-Type": "application/json",
    }
    body = {"model": model, "messages": messages, "max_tokens": 4096, **kwargs}
    r = httpx.post(
        f"{OPENROUTER_BASE}/chat/completions",
        headers=headers,
        json=body,
        timeout=TIMEOUT,
    )
    r.raise_for_status()
    data = r.json()
    # A success status does not guarantee a completion: the body may carry an
    # error payload instead of choices. Raise a descriptive error rather than
    # letting an opaque KeyError/IndexError escape.
    choices = data.get("choices") if isinstance(data, dict) else None
    if not choices:
        detail = data.get("error") if isinstance(data, dict) else data
        raise ValueError(f"No choices in response from {model}: {detail}")
    msg_data = choices[0]["message"]
    content = msg_data.get("content") or ""
    reasoning = msg_data.get("reasoning")
    if reasoning:
        content = f"<think>\n{reasoning}\n</think>\n\n{content}"
    if not content:
        raise ValueError(f"Empty response from {model}")
    return content
def _ollama_call(messages: list[dict], **kwargs) -> str:
    """
    Fallback: call a local Ollama server via its OpenAI-compatible endpoint.

    The base URL comes from OLLAMA_BASE_URL (default
    "http://localhost:11434"); a trailing "/api" segment and trailing slashes
    are removed before "/v1/chat/completions" is appended. The model comes
    from OLLAMA_CHAT_MODEL, then OLLAMA_MODEL, then "qwen2.5-coder:3b".

    Args:
        messages: Chat messages placed in the request body as "messages".
        **kwargs: Accepted for signature parity with the other tiers; they
            are not added to the request body.

    Returns:
        The content of the first choice's message.

    Raises:
        RuntimeError: If the reachability probe fails.
        httpx.HTTPStatusError: If the response status is not a success.
        ValueError: If the server returns empty or null content.
    """
    if not _ollama_is_reachable():
        raise RuntimeError("Ollama server is not reachable")
    # Strip trailing slashes first; otherwise "http://host:11434/" yields
    # "//v1/chat/completions" and "http://host:11434/api/" keeps its "/api".
    base = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434").rstrip("/")
    if base.endswith("/api"):
        base = base[: -len("/api")]
    ollama_model = os.getenv(
        "OLLAMA_CHAT_MODEL", os.getenv("OLLAMA_MODEL", "qwen2.5-coder:3b")
    )
    body = {"model": ollama_model, "messages": messages, "stream": False}
    r = httpx.post(f"{base}/v1/chat/completions", json=body, timeout=OLLAMA_TIMEOUT)
    r.raise_for_status()
    content = r.json()["choices"][0]["message"]["content"]
    if not content:
        # Fail loudly, as the OpenRouter path does, so callers never get
        # None or an empty string back as if it were a valid answer.
        raise ValueError(f"Empty response from Ollama model {ollama_model}")
    return content
def _call_with_retry(messages: list[dict], model: str, **kwargs) -> str:
    """
    Call OpenRouter, retrying with backoff when the response is HTTP 429.

    Makes up to MAX_RETRIES_PER_MODEL + 1 attempts. Between attempts it
    sleeps for the server's Retry-After hint (capped at 30) when that header
    holds a usable non-negative number, otherwise for
    BASE_BACKOFF * 2**attempt.

    Args:
        messages: Chat messages forwarded to ``_openrouter_call``.
        model: OpenRouter model ID forwarded to ``_openrouter_call``.
        **kwargs: Extra arguments forwarded to ``_openrouter_call``.

    Returns:
        The text returned by ``_openrouter_call``.

    Raises:
        httpx.HTTPStatusError: Immediately for any status other than 429, or
            for 429 once the retry budget is exhausted.
        Exception: Anything else ``_openrouter_call`` raises propagates
            unchanged.
    """
    # Guard against a negative setting so at least one attempt is made.
    max_retries = max(MAX_RETRIES_PER_MODEL, 0)
    attempt = 0
    while True:
        try:
            return _openrouter_call(messages, model, **kwargs)
        except httpx.HTTPStatusError as e:
            if e.response.status_code != 429 or attempt >= max_retries:
                raise
            wait = BASE_BACKOFF * (2 ** attempt)
            retry_after = e.response.headers.get("retry-after")
            if retry_after:
                try:
                    hinted = float(retry_after)
                except ValueError:
                    # Not a plain number (e.g. an HTTP-date): keep the
                    # exponential backoff.
                    hinted = None
                # "hinted >= 0" is False for negatives and NaN, both of which
                # would make time.sleep raise ValueError.
                if hinted is not None and hinted >= 0:
                    wait = min(hinted, 30)
        logger.warning(
            f"Rate limited on {model} (attempt {attempt + 1}/{max_retries + 1}), "
            f"waiting {wait:.1f}s..."
        )
        time.sleep(wait)
        attempt += 1
def call_model(messages: list[dict], **kwargs) -> str:
    """
    Route a chat request through the available model tiers.

    Order of attempts:
      1. ``app.agents.smart_router.call_model`` (imported lazily); any
         exception is logged and the fallbacks below are used.
      2. Direct OpenRouter, walking FREE_MODEL_LADDER in order, when
         OPENROUTER_API_KEY is set.
      3. Local Ollama, unless OLLAMA_ENABLED is set to something other than
         "true" (case-insensitive).

    Returns:
        The text produced by the first tier that succeeds.

    Raises:
        RuntimeError: When every tier fails; the message lists one line per
            fallback failure.
    """
    try:
        from app.agents.smart_router import call_model as smart_call

        return smart_call(messages, **kwargs)
    except Exception as router_exc:
        logger.error(f"Smart router failed: {router_exc}")

    failures = []

    # Direct OpenRouter fallback with fixed model list
    if not os.getenv("OPENROUTER_API_KEY", ""):
        failures.append("OpenRouter: OPENROUTER_API_KEY is not set")
    else:
        for candidate in FREE_MODEL_LADDER:
            try:
                reply = _call_with_retry(messages, candidate, **kwargs)
            except Exception as ladder_exc:
                failures.append(f"OpenRouter [{candidate}]: {ladder_exc}")
                continue
            logger.info(f"OpenRouter direct succeeded: {candidate}")
            return reply

    # Ollama last resort
    ollama_enabled = os.getenv("OLLAMA_ENABLED", "true").lower() == "true"
    if not ollama_enabled:
        failures.append("Ollama: disabled")
    else:
        try:
            return _ollama_call(messages, **kwargs)
        except Exception as ollama_exc:
            failures.append(f"Ollama: {ollama_exc}")

    raise RuntimeError("All model tiers failed:\n" + "\n".join(failures))
def safe_parse(text: str) -> dict:
    """
    Strip markdown fences and parse the remainder as JSON.

    Two candidates are tried in order: the whole fence-stripped text, then
    the span from the first "{" to the last "}" in it. A candidate is
    accepted only if it decodes to a JSON object or array; bare scalars and
    ``null`` are rejected so the function never hands back None (or a
    number/string/bool) where callers expect a structured result.

    Args:
        text: Raw model output, possibly wrapped in ``` or ```json fences.

    Returns:
        The decoded JSON object (a top-level array is passed through
        unchanged), or ``{"error": "parse_failed", "raw": <first 800 chars
        of text>}`` when nothing usable could be decoded. Never None.
    """
    # One pattern covers both "```json" and a bare "```".
    cleaned = re.sub(r"```(?:json)?", "", text).strip()

    def _decode(candidate: str):
        """Return the decoded object/array, or None if unusable."""
        try:
            value = json.loads(candidate)
        except json.JSONDecodeError:
            return None
        return value if isinstance(value, (dict, list)) else None

    parsed = _decode(cleaned)
    if parsed is None:
        # Fall back to the outermost {...} span, e.g. when the model wrapped
        # the JSON in explanatory prose.
        match = re.search(r"\{.*\}", cleaned, re.DOTALL)
        if match:
            parsed = _decode(match.group())
    if parsed is None:
        return {"error": "parse_failed", "raw": text[:800]}
    return parsed