# Source: Hugging Face Spaces — DevodG/Janus-backend (status: Running) · File size: 6,283 Bytes · revision 24f95f0

"""
Unified model client for Janus.
Smart router: Gemini → Groq → OpenRouter → Cloudflare → Ollama.
All tiers use the OpenAI-compatible messages format.
Includes retry-with-backoff for 429 rate limits.
"""

import os
import json
import re
import logging
import time
import httpx
from typing import Any

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Base URL of the OpenRouter API; "/chat/completions" is appended per request.
OPENROUTER_BASE = "https://openrouter.ai/api/v1"

# OpenRouter model IDs tried in list order by the direct-OpenRouter fallback in
# call_model(); the first model that yields a response is used.
# FIXED: replaced dead/renamed model IDs (all were returning HTTP 400)
FREE_MODEL_LADDER = [
    "deepseek/deepseek-r1:free",
    "google/gemini-2.0-flash-thinking-exp:free",
    "meta-llama/llama-3.3-70b-instruct:free",
    "google/gemma-3-27b-it:free",
    "nousresearch/hermes-3-llama-3.1-405b:free",
]

# Ollama base URL, read from the environment once at import time.
# NOTE(review): the Ollama helpers in this file re-read OLLAMA_BASE_URL on each
# call instead of using this constant — confirm whether anything still needs it.
OLLAMA_BASE = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
# httpx timeout (seconds) for OpenRouter chat-completion requests.
TIMEOUT = 90
# httpx timeout (seconds) for Ollama chat-completion requests.
OLLAMA_TIMEOUT = 30
# Number of extra attempts made per model after an HTTP 429 response.
MAX_RETRIES_PER_MODEL = 2
# First backoff delay in seconds; doubled on each subsequent retry attempt.
BASE_BACKOFF = 3
# httpx timeout (seconds) for the quick "is Ollama up?" probe.
OLLAMA_REACHABILITY_TIMEOUT = 1.5


def _ollama_is_reachable() -> bool:
    """
    Probe the Ollama server's ``/api/tags`` endpoint with a short timeout.

    The base URL is read from ``OLLAMA_BASE_URL`` (default
    ``http://localhost:11434``). Trailing slashes and an optional ``/api``
    suffix are normalised away first, so ``http://host``, ``http://host/``,
    ``http://host/api`` and ``http://host/api/`` all probe
    ``http://host/api/tags``.

    Returns:
        True when the server answers with any status below 500; False when
        it answers with a 5xx status or the request fails for any reason.
    """
    base = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434").rstrip("/")
    # Accept both the server root and the ".../api" form of the base URL.
    root = base[:-4] if base.endswith("/api") else base
    probe_url = f"{root}/api/tags"
    try:
        with httpx.Client(timeout=OLLAMA_REACHABILITY_TIMEOUT) as client:
            return client.get(probe_url).status_code < 500
    except Exception:
        # Deliberately best-effort: any failure (connection refused, timeout,
        # malformed URL) simply means "not reachable".
        return False


def _huggingface_call(messages: list[dict], **kwargs) -> str:
    """
    Send ``messages`` to the project's HuggingFace client and return its reply.

    Extra keyword arguments are forwarded unchanged to ``hf_client.chat``.
    """
    # Imported at call time rather than module load.
    # NOTE(review): presumably to avoid an import cycle or a hard dependency — confirm.
    from app.agents.huggingface import hf_client

    reply = hf_client.chat(messages, **kwargs)
    return reply


def _openrouter_call(messages: list[dict], model: str, **kwargs) -> str:
    """
    Make a single chat-completion request to OpenRouter.

    Args:
        messages: OpenAI-style chat messages.
        model: OpenRouter model ID.
        **kwargs: Extra request-body fields; they are merged last, so they
            can override the default ``max_tokens`` of 4096.

    Returns:
        The assistant's text. When the response carries a ``reasoning``
        field, it is prepended inside a ``<think>...</think>`` block.

    Raises:
        ValueError: If ``OPENROUTER_API_KEY`` is unset, the response body
            reports an error or has no usable ``choices`` entry, or the
            answer text is empty (even if reasoning text is present).
        httpx.HTTPStatusError: On a non-success HTTP status.
    """
    api_key = os.getenv("OPENROUTER_API_KEY", "")
    if not api_key:
        raise ValueError("OPENROUTER_API_KEY is not set")

    headers = {
        "Authorization": f"Bearer {api_key}",
        "HTTP-Referer": "https://huggingface.co/spaces/DevodG/Janus-backend",
        "X-Title": "Janus",
        "Content-Type": "application/json",
    }
    body = {"model": model, "messages": messages, "max_tokens": 4096, **kwargs}
    r = httpx.post(
        f"{OPENROUTER_BASE}/chat/completions",
        headers=headers,
        json=body,
        timeout=TIMEOUT,
    )
    r.raise_for_status()
    data = r.json()

    # A success status can still carry an error payload instead of choices;
    # surface it as a readable error rather than a bare KeyError.
    if isinstance(data, dict) and data.get("error"):
        raise ValueError(f"OpenRouter error for {model}: {data['error']}")
    try:
        msg_data = data["choices"][0]["message"]
    except (KeyError, IndexError, TypeError) as exc:
        raise ValueError(f"Malformed response from {model}") from exc

    content = msg_data.get("content") or ""
    # Check emptiness BEFORE adding the reasoning wrapper: a reply that holds
    # only reasoning has no actual answer and must count as empty.
    if not content:
        raise ValueError(f"Empty response from {model}")

    reasoning = msg_data.get("reasoning")
    if reasoning:
        content = f"<think>\n{reasoning}\n</think>\n\n{content}"
    return content


def _ollama_call(messages: list[dict], **kwargs) -> str:
    """
    Fallback: call a local Ollama server via its OpenAI-compatible endpoint.

    The base URL comes from ``OLLAMA_BASE_URL`` (trailing slashes and an
    optional ``/api`` suffix are stripped). The model comes from
    ``OLLAMA_CHAT_MODEL``, then ``OLLAMA_MODEL``, then ``qwen2.5-coder:3b``.

    Args:
        messages: OpenAI-style chat messages.
        **kwargs: Accepted for signature parity with the other tiers;
            currently not forwarded to Ollama.

    Returns:
        The non-empty assistant text.

    Raises:
        RuntimeError: If the reachability probe fails.
        ValueError: If the response contains no text.
        httpx.HTTPStatusError: On a non-success HTTP status.
    """
    if not _ollama_is_reachable():
        raise RuntimeError("Ollama server is not reachable")

    # Normalise so "http://host/", "http://host/api" and "http://host/api/"
    # all resolve to the same server root before "/v1/..." is appended.
    base = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434").rstrip("/")
    if base.endswith("/api"):
        base = base[:-4]

    ollama_model = os.getenv(
        "OLLAMA_CHAT_MODEL", os.getenv("OLLAMA_MODEL", "qwen2.5-coder:3b")
    )
    body = {"model": ollama_model, "messages": messages, "stream": False}
    r = httpx.post(f"{base}/v1/chat/completions", json=body, timeout=OLLAMA_TIMEOUT)
    r.raise_for_status()

    content = r.json()["choices"][0]["message"].get("content")
    # Mirror the OpenRouter tier: an empty or null reply is a failure, so the
    # router never hands None back to its caller.
    if not content:
        raise ValueError(f"Empty response from Ollama model {ollama_model}")
    return content


def _call_with_retry(messages: list[dict], model: str, **kwargs) -> str:
    """
    Call OpenRouter, retrying only on HTTP 429 (rate limit).

    Up to ``MAX_RETRIES_PER_MODEL`` retries are made after the first attempt.
    The wait before each retry is the server's ``Retry-After`` value in
    seconds, capped at 30, when that header is a valid non-negative number.
    Otherwise it is an exponential backoff of ``BASE_BACKOFF * 2**attempt``.

    Args:
        messages: OpenAI-style chat messages.
        model: OpenRouter model ID.
        **kwargs: Forwarded unchanged to ``_openrouter_call``.

    Returns:
        The text returned by ``_openrouter_call``.

    Raises:
        httpx.HTTPStatusError: For any non-429 status immediately, or for a
            429 once the retries are exhausted.
        Exception: Anything else raised by ``_openrouter_call`` propagates.
    """
    max_retries = max(MAX_RETRIES_PER_MODEL, 0)
    attempt = 0
    while True:
        try:
            return _openrouter_call(messages, model, **kwargs)
        except httpx.HTTPStatusError as e:
            if e.response.status_code != 429 or attempt >= max_retries:
                raise
            retry_after = e.response.headers.get("retry-after")

        # Default to exponential backoff; use the server hint only when it is
        # a sane number. Retry-After may also be an HTTP date, negative or
        # NaN, none of which may reach time.sleep().
        wait = BASE_BACKOFF * (2 ** attempt)
        if retry_after:
            try:
                hinted = float(retry_after)
            except ValueError:
                hinted = None
            if hinted is not None and hinted >= 0:
                wait = min(hinted, 30)

        logger.warning(
            "Rate limited on %s (attempt %d/%d), waiting %.1fs...",
            model,
            attempt + 1,
            max_retries + 1,
            wait,
        )
        time.sleep(wait)
        attempt += 1


def call_model(messages: list[dict], **kwargs) -> str:
    """
    Route a chat request through the available model tiers.

    Order of attempts:
      1. ``app.agents.smart_router.call_model`` (described by this module as
         Gemini → Groq → OpenRouter → Cloudflare → Ollama).
      2. Direct OpenRouter, walking ``FREE_MODEL_LADDER`` in order, if
         ``OPENROUTER_API_KEY`` is set.
      3. Local Ollama, unless ``OLLAMA_ENABLED`` is set to something other
         than ``"true"`` (case-insensitive).

    Args:
        messages: OpenAI-style chat messages.
        **kwargs: Forwarded to each tier.

    Returns:
        Raw text from the first tier that succeeds.

    Raises:
        RuntimeError: If every tier fails. The message lists each tier's
            failure, including the smart router's.
    """
    errors = []

    try:
        from app.agents.smart_router import call_model as smart_call
        return smart_call(messages, **kwargs)
    except Exception as e:
        logger.error(f"Smart router failed: {e}")
        # Keep the first tier's cause so the final error tells the whole story.
        errors.append(f"Smart router: {e}")

    # Direct OpenRouter fallback with fixed model list
    if os.getenv("OPENROUTER_API_KEY", ""):
        for model in FREE_MODEL_LADDER:
            try:
                result = _call_with_retry(messages, model, **kwargs)
            except Exception as e2:
                errors.append(f"OpenRouter [{model}]: {e2}")
                continue
            logger.info(f"OpenRouter direct succeeded: {model}")
            return result
    else:
        errors.append("OpenRouter: OPENROUTER_API_KEY is not set")

    # Ollama last resort
    if os.getenv("OLLAMA_ENABLED", "true").lower() == "true":
        try:
            return _ollama_call(messages, **kwargs)
        except Exception as e3:
            errors.append(f"Ollama: {e3}")
    else:
        errors.append("Ollama: disabled")

    raise RuntimeError("All model tiers failed:\n" + "\n".join(errors))


def safe_parse(text: str) -> dict:
    """
    Extract a JSON object from model output. Always returns a dict.

    Steps:
      1. Strip markdown code fences (```` ``` ```` / ```` ```json ````).
      2. Try to parse the whole text, then the text with any
         ``<think>...</think>`` reasoning blocks removed.
      3. Otherwise scan for the first embedded ``{...}`` that decodes as a
         JSON object, preferring the reasoning-stripped text.

    Only JSON *objects* count as success. A top-level ``null``, number,
    string or array is never returned as-is, so callers can rely on the
    ``dict`` return type.

    Args:
        text: Raw model output. Non-string input is tolerated and reported
            as a parse failure.

    Returns:
        The parsed object, or ``{"error": "parse_failed", "raw": <first 800
        characters of the input>}`` — NEVER None.
    """
    if not isinstance(text, str):
        raw = "" if text is None else str(text)[:800]
        return {"error": "parse_failed", "raw": raw}

    cleaned = re.sub(r"```(?:json)?", "", text).strip()

    # The fence-stripped text is tried first so valid JSON that merely
    # contains "<think>" inside a string value is never altered.
    candidates = [cleaned]
    without_reasoning = re.sub(
        r"<think>.*?</think>", "", cleaned, flags=re.DOTALL
    ).strip()
    if without_reasoning != cleaned:
        candidates.append(without_reasoning)

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed

    # Fallback: decode starting at each "{" and stop at the end of that
    # object, so trailing prose or stray braces cannot spoil the match.
    decoder = json.JSONDecoder()
    for candidate in reversed(candidates):
        start = candidate.find("{")
        while start != -1:
            try:
                parsed, _ = decoder.raw_decode(candidate, start)
            except json.JSONDecodeError:
                parsed = None
            if isinstance(parsed, dict):
                return parsed
            start = candidate.find("{", start + 1)

    return {"error": "parse_failed", "raw": text[:800]}