Spaces:

DevodG
/

Janus-backend

Running

App Files Files Community

Janus-backend / backend /app /agents /_model.py

DevodG

deploy: Janus full system stabilization

24f95f0 24 days ago

raw

history blame contribute delete

6.28 kB

	"""
	Unified model client for Janus.
	Smart router: Gemini → Groq → OpenRouter → Cloudflare → Ollama.
	All tiers use the OpenAI-compatible messages format.
	Includes retry-with-backoff for 429 rate limits.
	"""

	import os
	import json
	import re
	import logging
	import time
	import httpx
	from typing import Any

	# Module-level logger, named after this module.
	logger = logging.getLogger(__name__)

	# Base URL of OpenRouter's API; "/chat/completions" is appended per request.
	OPENROUTER_BASE = "https://openrouter.ai/api/v1"

	# OpenRouter model IDs tried in list order by call_model() when it falls back
	# to calling OpenRouter directly; the first model that answers wins.
	# NOTE: an earlier revision listed dead/renamed model IDs that all returned HTTP 400.
	FREE_MODEL_LADDER = [
	"deepseek/deepseek-r1:free",
	"google/gemini-2.0-flash-thinking-exp:free",
	"meta-llama/llama-3.3-70b-instruct:free",
	"google/gemma-3-27b-it:free",
	"nousresearch/hermes-3-llama-3.1-405b:free",
	]

	# Ollama base URL captured once at import time.
	# NOTE(review): the functions visible in this file re-read OLLAMA_BASE_URL from
	# the environment on every call and never reference this constant - confirm
	# whether another module imports it before removing.
	OLLAMA_BASE = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
	TIMEOUT = 90  # seconds; request timeout for OpenRouter calls
	OLLAMA_TIMEOUT = 30  # seconds; request timeout for Ollama chat calls
	MAX_RETRIES_PER_MODEL = 2  # extra attempts per model after an HTTP 429
	BASE_BACKOFF = 3  # seconds; doubled on each successive 429 retry
	OLLAMA_REACHABILITY_TIMEOUT = 1.5  # seconds; timeout for the Ollama reachability probe


	def _ollama_is_reachable() -> bool:
	    """Probe the configured Ollama server and report whether it answers.

	    The base URL comes from ``OLLAMA_BASE_URL`` (default
	    ``http://localhost:11434``). The probe is a GET on the ``tags`` endpoint
	    under ``/api``; a base URL that already ends in ``/api`` does not get the
	    segment added twice.

	    Returns:
	        True when the server replies with any status below 500, False when
	        it replies with 5xx or when the request raises for any reason.
	    """
	    root = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
	    tags_path = "/tags" if root.endswith("/api") else "/api/tags"
	    probe_url = f"{root}{tags_path}"
	    try:
	        with httpx.Client(timeout=OLLAMA_REACHABILITY_TIMEOUT) as probe:
	            return probe.get(probe_url).status_code < 500
	    except Exception:
	        # Connection errors, timeouts and bad URLs all mean "not reachable".
	        return False


	def _huggingface_call(messages: list[dict], **kwargs) -> str:
	    """Forward ``messages`` and ``kwargs`` to the project's ``hf_client.chat``.

	    The client is imported at call time rather than at module import.
	    """
	    from app.agents.huggingface import hf_client as client

	    reply = client.chat(messages, **kwargs)
	    return reply


	def _openrouter_call(messages: list[dict], model: str, **kwargs) -> str:
	    """Make a single chat-completion request to OpenRouter.

	    Args:
	        messages: Chat messages in the OpenAI-compatible format.
	        model: OpenRouter model ID to query.
	        **kwargs: Extra request-body fields. They are merged last, so they
	            override the defaults set here (for example ``max_tokens``).

	    Returns:
	        The reply text. When the message carries a ``reasoning`` field, it is
	        prepended to the content inside a ``<think>`` block.

	    Raises:
	        ValueError: If ``OPENROUTER_API_KEY`` is unset, the response body has
	            no usable ``choices[0].message``, or the reply is empty.
	        httpx.HTTPStatusError: If OpenRouter answers with an error status.
	    """
	    api_key = os.getenv("OPENROUTER_API_KEY", "")
	    if not api_key:
	        raise ValueError("OPENROUTER_API_KEY is not set")

	    headers = {
	        "Authorization": f"Bearer {api_key}",
	        "HTTP-Referer": "https://huggingface.co/spaces/DevodG/Janus-backend",
	        "X-Title": "Janus",
	        "Content-Type": "application/json",
	    }
	    body = {"model": model, "messages": messages, "max_tokens": 4096, **kwargs}
	    r = httpx.post(
	        f"{OPENROUTER_BASE}/chat/completions",
	        headers=headers,
	        json=body,
	        timeout=TIMEOUT,
	    )
	    r.raise_for_status()
	    data = r.json()

	    # A successful status does not guarantee a completion: the body may carry
	    # an error payload or an empty ``choices`` list. Surface that as a
	    # descriptive ValueError instead of a bare KeyError/IndexError.
	    try:
	        msg_data = data["choices"][0]["message"]
	        content = msg_data.get("content") or ""
	        reasoning = msg_data.get("reasoning")
	    except (KeyError, IndexError, TypeError, AttributeError) as exc:
	        detail = data.get("error") if isinstance(data, dict) else None
	        raise ValueError(
	            f"Malformed response from {model}: {detail or repr(data)[:300]}"
	        ) from exc

	    # NOTE(review): reasoning is merged before the emptiness check, so a reply
	    # with reasoning but no content is returned as a bare <think> block -
	    # confirm this best-effort behavior is intended.
	    if reasoning:
	        content = f"<think>\n{reasoning}\n</think>\n\n{content}"
	    if not content:
	        raise ValueError(f"Empty response from {model}")
	    return content


	def _ollama_call(messages: list[dict], **kwargs) -> str:
	    """Fallback tier: chat with an Ollama server via its OpenAI-compatible endpoint.

	    ``kwargs`` is accepted but not used by this function. The model name is
	    taken from ``OLLAMA_CHAT_MODEL``, then ``OLLAMA_MODEL``, then the
	    built-in default.

	    Raises:
	        RuntimeError: If the reachability probe fails.
	        httpx.HTTPStatusError: If the server answers with an error status.
	    """
	    if not _ollama_is_reachable():
	        raise RuntimeError("Ollama server is not reachable")

	    server = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
	    # The chat endpoint lives under /v1, so drop a trailing "/api" segment.
	    if server.endswith("/api"):
	        server = server[: -len("/api")]

	    fallback_model = os.getenv("OLLAMA_MODEL", "qwen2.5-coder:3b")
	    chosen_model = os.getenv("OLLAMA_CHAT_MODEL", fallback_model)

	    payload = {"model": chosen_model, "messages": messages, "stream": False}
	    response = httpx.post(
	        f"{server}/v1/chat/completions",
	        json=payload,
	        timeout=OLLAMA_TIMEOUT,
	    )
	    response.raise_for_status()
	    reply = response.json()
	    return reply["choices"][0]["message"]["content"]


	def _call_with_retry(messages: list[dict], model: str, **kwargs) -> str:
	    """Call OpenRouter, retrying with backoff when it answers HTTP 429.

	    Up to ``MAX_RETRIES_PER_MODEL`` retries are made after the first attempt.
	    The wait before each retry is the ``Retry-After`` header when it is a
	    non-negative number of seconds (capped at 30), otherwise
	    ``BASE_BACKOFF * 2 ** attempt``.

	    Args:
	        messages: Chat messages in the OpenAI-compatible format.
	        model: OpenRouter model ID to query.
	        **kwargs: Passed through to ``_openrouter_call``.

	    Returns:
	        The reply text from ``_openrouter_call``.

	    Raises:
	        httpx.HTTPStatusError: For any non-429 error status, or for a 429
	            once the retries are exhausted.
	        Exception: Anything else ``_openrouter_call`` raises is not retried
	            and propagates unchanged.
	    """
	    # A misconfigured negative retry count still yields exactly one attempt.
	    max_retries = max(MAX_RETRIES_PER_MODEL, 0)
	    attempt = 0
	    while True:
	        try:
	            return _openrouter_call(messages, model, **kwargs)
	        except httpx.HTTPStatusError as e:
	            if e.response.status_code != 429 or attempt >= max_retries:
	                raise

	            wait = BASE_BACKOFF * (2 ** attempt)
	            retry_after = e.response.headers.get("retry-after")
	            if retry_after:
	                try:
	                    requested = float(retry_after)
	                except ValueError:
	                    # Retry-After may also be an HTTP-date; keep the backoff.
	                    requested = None
	                # The comparison rejects negatives and NaN, either of which
	                # would make time.sleep() raise ValueError.
	                if requested is not None and requested >= 0:
	                    wait = min(requested, 30)

	            logger.warning(
	                "Rate limited on %s (attempt %d/%d), waiting %.1fs...",
	                model,
	                attempt + 1,
	                max_retries + 1,
	                wait,
	            )
	            time.sleep(wait)
	            attempt += 1


	def call_model(messages: list[dict], **kwargs) -> str:
	    """Return a model reply for ``messages``, trying each tier in turn.

	    Order of attempts:
	      1. ``app.agents.smart_router.call_model`` (imported at call time).
	      2. Each model in ``FREE_MODEL_LADDER`` directly on OpenRouter, when
	         ``OPENROUTER_API_KEY`` is set.
	      3. Ollama, unless ``OLLAMA_ENABLED`` is set to something other than
	         "true" (case-insensitive).

	    Args:
	        messages: Chat messages in the OpenAI-compatible format.
	        **kwargs: Passed through to every tier.

	    Returns:
	        The raw reply text from the first tier that succeeds. Never None: a
	        tier that returns None is treated as failed.

	    Raises:
	        RuntimeError: If every tier fails; the message lists one line per
	            failure, including the smart router's.
	    """
	    errors = []

	    try:
	        from app.agents.smart_router import call_model as smart_call

	        result = smart_call(messages, **kwargs)
	    except Exception as e:
	        logger.error("Smart router failed: %s", e)
	        errors.append(f"Smart router: {e}")
	    else:
	        if result is not None:
	            return result
	        logger.error("Smart router returned None")
	        errors.append("Smart router: returned None")

	    # Direct OpenRouter fallback with fixed model list
	    if os.getenv("OPENROUTER_API_KEY", ""):
	        for model in FREE_MODEL_LADDER:
	            try:
	                result = _call_with_retry(messages, model, **kwargs)
	            except Exception as e2:
	                errors.append(f"OpenRouter [{model}]: {e2}")
	            else:
	                logger.info("OpenRouter direct succeeded: %s", model)
	                return result
	    else:
	        errors.append("OpenRouter: OPENROUTER_API_KEY is not set")

	    # Ollama last resort
	    if os.getenv("OLLAMA_ENABLED", "true").lower() == "true":
	        try:
	            result = _ollama_call(messages, **kwargs)
	        except Exception as e3:
	            errors.append(f"Ollama: {e3}")
	        else:
	            if result is not None:
	                return result
	            errors.append("Ollama: returned None")
	    else:
	        errors.append("Ollama: disabled")

	    raise RuntimeError("All model tiers failed:\n" + "\n".join(errors))


	def safe_parse(text: str) -> dict:
	    """Parse model output as JSON, tolerating fences, reasoning and chatter.

	    Markdown code fences are stripped first. Candidates are then tried in
	    this order, and the first one that decodes to a non-None value wins:
	      1. the whole cleaned text,
	      2. the span from the first ``{`` to the last ``}`` in it,
	      3. the cleaned text with ``<think>...</think>`` blocks removed,
	      4. the first-``{``-to-last-``}`` span of that.

	    Args:
	        text: Raw model output. None is treated as an empty string.

	    Returns:
	        The decoded JSON value. Valid JSON that is not an object (a list or
	        scalar) is returned as decoded. If nothing parses, or the JSON is
	        ``null``, returns ``{"error": "parse_failed", "raw": <first 800
	        chars of the input>}``. Never returns None.
	    """
	    raw = "" if text is None else str(text)
	    # Plain alternation: an escaped pipe would match a literal "|" and leave
	    # real fences in place.
	    cleaned = re.sub(r"```(?:json)?|```", "", raw).strip()
	    # Reasoning blocks can contain braces that defeat the greedy {...} search.
	    without_reasoning = re.sub(
	        r"<think>.*?</think>", "", cleaned, flags=re.DOTALL
	    ).strip()

	    candidates = []
	    for source in (cleaned, without_reasoning):
	        candidates.append(source)
	        match = re.search(r"\{.*\}", source, re.DOTALL)
	        if match:
	            candidates.append(match.group())

	    for candidate in candidates:
	        try:
	            parsed = json.loads(candidate)
	        except json.JSONDecodeError:
	            continue
	        # JSON ``null`` decodes to None, which the contract forbids returning.
	        if parsed is not None:
	            return parsed

	    return {"error": "parse_failed", "raw": raw[:800]}