Spaces:
Running
Running
File size: 6,283 Bytes
24f95f0 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 | """
Unified model client for Janus.
Smart router: Gemini → Groq → OpenRouter → Cloudflare → Ollama.
All tiers use the OpenAI-compatible messages format.
Includes retry-with-backoff for 429 rate limits.
"""
import os
import json
import re
import logging
import time
import httpx
from typing import Any
# Logger named after this module.
logger = logging.getLogger(__name__)

# Root URL of the OpenRouter API; "/chat/completions" is appended per request.
OPENROUTER_BASE = "https://openrouter.ai/api/v1"

# FIXED: replaced dead/renamed model IDs (all were returning HTTP 400)
# Free OpenRouter models, listed in the order the direct fallback tries them.
FREE_MODEL_LADDER = [
    "deepseek/deepseek-r1:free",
    "google/gemini-2.0-flash-thinking-exp:free",
    "meta-llama/llama-3.3-70b-instruct:free",
    "google/gemma-3-27b-it:free",
    "nousresearch/hermes-3-llama-3.1-405b:free",
]

# Ollama base URL as read once at import time. The Ollama helpers in this
# file read OLLAMA_BASE_URL again on every call rather than using this value.
OLLAMA_BASE = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")

# Request timeouts passed to httpx (seconds).
TIMEOUT = 90  # OpenRouter chat completion
OLLAMA_TIMEOUT = 30  # Ollama chat completion
OLLAMA_REACHABILITY_TIMEOUT = 1.5  # quick "is Ollama up?" probe

# Rate-limit (HTTP 429) handling for OpenRouter calls.
MAX_RETRIES_PER_MODEL = 2  # retries after the first attempt
BASE_BACKOFF = 3  # seconds; doubled on each successive attempt
def _ollama_is_reachable() -> bool:
    """Return True when the configured Ollama server answers a quick probe.

    Sends a GET for the ``/api/tags`` path under ``OLLAMA_BASE_URL``
    (default ``http://localhost:11434``) using the short
    ``OLLAMA_REACHABILITY_TIMEOUT``.

    Returns:
        True if a response with a status code below 500 arrives; False if
        the status is 500 or above, or if the request raises for any reason.
    """
    # Drop trailing slashes first so values such as "http://host:11434/" or
    # "http://host:11434/api/" neither produce "//api/tags" nor slip past
    # the "/api" suffix check below.
    base = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434").rstrip("/")
    probe_url = f"{base}/tags" if base.endswith("/api") else f"{base}/api/tags"
    try:
        with httpx.Client(timeout=OLLAMA_REACHABILITY_TIMEOUT) as client:
            response = client.get(probe_url)
        return response.status_code < 500
    except Exception:
        # Best-effort probe: any failure simply means "not reachable".
        return False
def _huggingface_call(messages: list[dict], **kwargs) -> str:
    """Send *messages* to the project's HuggingFace client and return its reply.

    The client is imported inside the function, so the import only happens
    when this helper is actually called. All keyword arguments are passed
    through to ``hf_client.chat`` unchanged.
    """
    from app.agents.huggingface import hf_client as client

    reply = client.chat(messages, **kwargs)
    return reply
def _openrouter_call(messages: list[dict], model: str, **kwargs) -> str:
    """Make a single chat-completion request to OpenRouter.

    Args:
        messages: Chat messages in the OpenAI-compatible format.
        model: OpenRouter model identifier.
        **kwargs: Extra request-body fields. They are merged last, so they
            override the defaults set here (including ``max_tokens``).

    Returns:
        The assistant message text. When the response message carries a
        ``reasoning`` field, it is prepended wrapped in ``<think>`` tags.

    Raises:
        ValueError: If ``OPENROUTER_API_KEY`` is unset, the response body
            has no usable ``choices`` entry, or the reply is empty.
        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
    """
    api_key = os.getenv("OPENROUTER_API_KEY", "")
    if not api_key:
        raise ValueError("OPENROUTER_API_KEY is not set")

    headers = {
        "Authorization": f"Bearer {api_key}",
        "HTTP-Referer": "https://huggingface.co/spaces/DevodG/Janus-backend",
        "X-Title": "Janus",
        "Content-Type": "application/json",
    }
    body = {"model": model, "messages": messages, "max_tokens": 4096, **kwargs}
    r = httpx.post(
        f"{OPENROUTER_BASE}/chat/completions",
        headers=headers,
        json=body,
        timeout=TIMEOUT,
    )
    r.raise_for_status()
    data = r.json()

    # A successful HTTP status does not guarantee a completion: the body may
    # hold an error payload or an empty "choices" list. Report that clearly
    # instead of failing with a bare KeyError/IndexError/TypeError.
    choices = data.get("choices") if isinstance(data, dict) else None
    first = choices[0] if isinstance(choices, list) and choices else None
    msg_data = first.get("message") if isinstance(first, dict) else None
    if not isinstance(msg_data, dict):
        detail = data.get("error") if isinstance(data, dict) else None
        raise ValueError(
            f"No completion in response from {model}: "
            f"{detail or 'missing choices/message'}"
        )

    content = msg_data.get("content") or ""
    reasoning = msg_data.get("reasoning")
    if reasoning:
        # Keep the model's reasoning visible to callers, ahead of the answer.
        content = f"<think>\n{reasoning}\n</think>\n\n{content}"
    if not content:
        raise ValueError(f"Empty response from {model}")
    return content
def _ollama_call(messages: list[dict], **kwargs) -> str:
    """Fallback: chat with an Ollama server via its OpenAI-compatible endpoint.

    The server location comes from ``OLLAMA_BASE_URL`` (default
    ``http://localhost:11434``); the model comes from ``OLLAMA_CHAT_MODEL``,
    then ``OLLAMA_MODEL``, then ``qwen2.5-coder:3b``.

    Args:
        messages: Chat messages in the OpenAI-compatible format.
        **kwargs: Accepted but not forwarded to Ollama.

    Returns:
        The assistant message text.

    Raises:
        RuntimeError: If the reachability probe fails.
        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
        ValueError: If the reply contains no text.
    """
    if not _ollama_is_reachable():
        raise RuntimeError("Ollama server is not reachable")

    # Normalise the base so "http://host:11434/", ".../api" and ".../api/"
    # all resolve to the server root before "/v1/..." is appended.
    base = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434").rstrip("/")
    if base.endswith("/api"):
        base = base[:-4]

    ollama_model = os.getenv(
        "OLLAMA_CHAT_MODEL", os.getenv("OLLAMA_MODEL", "qwen2.5-coder:3b")
    )
    body = {"model": ollama_model, "messages": messages, "stream": False}
    r = httpx.post(f"{base}/v1/chat/completions", json=body, timeout=OLLAMA_TIMEOUT)
    r.raise_for_status()

    content = r.json()["choices"][0]["message"]["content"]
    if not content:
        # Mirror the OpenRouter helper: an empty or null reply is a failure,
        # so callers never receive None in place of text.
        raise ValueError(f"Empty response from Ollama model {ollama_model}")
    return content
def _call_with_retry(messages: list[dict], model: str, **kwargs) -> str:
    """
    Call OpenRouter with retry-on-429 backoff.

    Makes one attempt plus up to MAX_RETRIES_PER_MODEL retries. Only HTTP 429
    responses are retried; any other HTTP status error, and any other
    exception, propagates immediately.

    Args:
        messages: Chat messages in the OpenAI-compatible format.
        model: OpenRouter model identifier.
        **kwargs: Forwarded unchanged to ``_openrouter_call``.

    Returns:
        The text returned by ``_openrouter_call``.

    Raises:
        httpx.HTTPStatusError: On a non-429 status, or on a 429 once the
            retries are exhausted.
    """
    # Always make at least one attempt, even if the retry count is
    # configured as zero or a negative number.
    total_attempts = max(MAX_RETRIES_PER_MODEL, 0) + 1

    for attempt in range(total_attempts):
        try:
            return _openrouter_call(messages, model, **kwargs)
        except httpx.HTTPStatusError as e:
            is_last_attempt = attempt >= total_attempts - 1
            if e.response.status_code != 429 or is_last_attempt:
                raise

            # Default: exponential backoff (BASE_BACKOFF, 2x, 4x, ...).
            wait = BASE_BACKOFF * (2 ** attempt)
            retry_after = e.response.headers.get("retry-after")
            if retry_after:
                try:
                    requested = float(retry_after)
                except ValueError:
                    # Non-numeric value (e.g. the HTTP-date form): keep backoff.
                    requested = None
                # Honour the server's hint, capped at 30s. Negative or NaN
                # values fail this comparison and keep the backoff, because
                # time.sleep() rejects them with ValueError.
                if requested is not None and requested >= 0:
                    wait = min(requested, 30)

            logger.warning(
                f"Rate limited on {model} (attempt {attempt + 1}/{total_attempts}), "
                f"waiting {wait:.1f}s..."
            )
            time.sleep(wait)

    # Every iteration above either returns or raises; this is a safety net.
    raise RuntimeError(f"Retry loop for {model} ended without a result")
def call_model(messages: list[dict], **kwargs) -> str:
    """
    Smart router: Gemini → Groq → OpenRouter → Cloudflare → Ollama.

    Delegates to ``app.agents.smart_router.call_model``. If importing or
    calling it raises, falls back to direct OpenRouter calls over
    FREE_MODEL_LADDER (only when OPENROUTER_API_KEY is set), then to Ollama
    (unless OLLAMA_ENABLED is set to something other than "true").

    Args:
        messages: Chat messages in the OpenAI-compatible format.
        **kwargs: Forwarded to whichever tier handles the request.

    Returns:
        Raw text from the first tier that succeeds. The smart router's
        return value is passed through unchecked.

    Raises:
        RuntimeError: If every tier fails; the message lists each failure.
    """
    errors = []

    try:
        from app.agents.smart_router import call_model as smart_call
        return smart_call(messages, **kwargs)
    except Exception as e:
        logger.error(f"Smart router failed: {e}")
        # Keep the root cause in the final report, not only in the log.
        errors.append(f"Smart router: {e}")

    # Direct OpenRouter fallback with fixed model list
    if os.getenv("OPENROUTER_API_KEY", ""):
        for model in FREE_MODEL_LADDER:
            try:
                result = _call_with_retry(messages, model, **kwargs)
                logger.info(f"OpenRouter direct succeeded: {model}")
                return result
            except Exception as e2:
                # Any failure on this model moves on to the next one.
                errors.append(f"OpenRouter [{model}]: {e2}")
    else:
        errors.append("OpenRouter: OPENROUTER_API_KEY is not set")

    # Ollama last resort
    if os.getenv("OLLAMA_ENABLED", "true").lower() == "true":
        try:
            return _ollama_call(messages, **kwargs)
        except Exception as e3:
            errors.append(f"Ollama: {e3}")
    else:
        errors.append("Ollama: disabled")

    raise RuntimeError("All model tiers failed:\n" + "\n".join(errors))
def safe_parse(text: str) -> dict:
    """
    Strip markdown fences, attempt JSON parse.
    On failure returns a structured error dict — NEVER returns None.

    The whole cleaned text is tried first; if that does not yield a JSON
    object, the span from the first "{" to the last "}" is tried.

    Args:
        text: Raw model output, possibly wrapped in ``` or ```json fences.

    Returns:
        The parsed JSON object, or
        ``{"error": "parse_failed", "raw": <first 800 chars of input>}``
        when no JSON object could be extracted.
    """
    if not isinstance(text, str):
        # Defensive: callers are expected to pass str, but a None or other
        # value should produce the error dict rather than a TypeError.
        return {"error": "parse_failed", "raw": str(text)[:800]}

    cleaned = re.sub(r"```(?:json)?", "", text).strip()

    candidates = [cleaned]
    match = re.search(r"\{.*\}", cleaned, re.DOTALL)
    if match:
        candidates.append(match.group())

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Valid JSON that is not an object (null, list, number, string)
        # would break the dict contract, so only objects are accepted.
        if isinstance(parsed, dict):
            return parsed

    return {"error": "parse_failed", "raw": text[:800]}
|