Spaces:

Cuong2004
/

math-solver

Sleeping

File size: 3,752 Bytes

395651c

import os
import json
import asyncio
import logging
from openai import AsyncOpenAI
from typing import List, Dict, Any, Optional
from app.url_utils import openai_compatible_api_key, sanitize_env

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)

class MultiLayerLLMClient:
    """Async OpenRouter chat client that falls back through a list of models.

    Configuration is read from environment variables at construction time:

    * ``OPENROUTER_MODEL_1`` .. ``OPENROUTER_MODEL_3`` -- ordered fallback
      sequence; unset or blank entries are skipped.
    * ``OPENROUTER_MODEL`` -- legacy single-model setting, used only when no
      numbered model is configured; defaults to ``DEFAULT_MODEL``.
    * ``OPENROUTER_API_KEY_1`` or ``OPENROUTER_API_KEY`` -- API key (the first
      non-empty one is used; there is no key rotation).

    Attributes:
        models: Model identifiers, in the order they are tried.
        client: The ``AsyncOpenAI`` client, or ``None`` when no API key was
            found (in which case every request raises ``ValueError``).
    """

    # Model used when neither numbered nor legacy model variables are set.
    DEFAULT_MODEL = "google/gemini-2.0-flash-001"
    # Highest N probed for OPENROUTER_MODEL_N.
    MAX_NUMBERED_MODELS = 3
    # Pause (seconds) before moving on to the next model after a failure.
    RETRY_DELAY_SECONDS = 1.0

    def __init__(self):
        """Load the model sequence and build the API client from the environment."""
        # 1. Models sequence loading
        self.models = self._load_models()

        # 2. Key selection (No rotation, always use the first available key)
        api_key = os.getenv("OPENROUTER_API_KEY_1") or os.getenv("OPENROUTER_API_KEY")
        if not api_key:
            logger.error("[LLM] No OpenRouter API key found.")
            self.client = None
            return

        self.client = AsyncOpenAI(
            api_key=openai_compatible_api_key(api_key),
            base_url="https://openrouter.ai/api/v1",
            timeout=60.0,
            default_headers={
                "HTTP-Referer": "https://mathsolver.ai",
                "X-Title": "MathSolver Backend",
            },
        )

    @classmethod
    def _load_models(cls, env=None) -> List[str]:
        """Return the ordered model fallback list described by *env*.

        Args:
            env: Mapping of environment variables to read from; defaults to
                ``os.environ``. Accepting a mapping keeps this helper pure.

        Returns:
            The non-blank ``OPENROUTER_MODEL_<n>`` values in numeric order, or
            a one-element list holding the legacy ``OPENROUTER_MODEL`` value
            (``DEFAULT_MODEL`` if that is unset or blank).
        """
        source = os.environ if env is None else env

        # Strip so that stray spaces/newlines in a configured value do not end
        # up inside the model identifier; blank values count as "not set".
        candidates = (
            (source.get(f"OPENROUTER_MODEL_{index}") or "").strip()
            for index in range(1, cls.MAX_NUMBERED_MODELS + 1)
        )
        numbered = [name for name in candidates if name]
        if numbered:
            return numbered

        # Fallback to legacy OPENROUTER_MODEL if no numbered models found
        legacy = (source.get("OPENROUTER_MODEL") or "").strip()
        return [legacy or cls.DEFAULT_MODEL]

    @staticmethod
    def _optional_params(
        response_format: Optional[Dict[str, Any]],
        extra: Dict[str, Any],
    ) -> Dict[str, Any]:
        """Merge caller kwargs with ``response_format``, omitting it when unset."""
        params = dict(extra)
        # Only forward response_format when the caller supplied one, instead
        # of passing an explicit None through to the SDK call.
        if response_format is not None:
            params["response_format"] = response_format
        return params

    async def chat_completions_create(
        self,
        messages: List[Dict[str, str]],
        response_format: Optional[Dict[str, Any]] = None,
        **kwargs
    ) -> str:
        """Request a chat completion, falling back through ``self.models``.

        Implements Model Fallback Sequence: Model 1 -> Model 2 -> Model 3.
        Always starts from Model 1 for every new call.

        Args:
            messages: Chat messages forwarded to the completion endpoint.
            response_format: Optional response-format spec; forwarded only
                when not ``None``.
            **kwargs: Extra keyword arguments forwarded unchanged to
                ``client.chat.completions.create``.

        Returns:
            The message content of the first choice from the first model that
            returns non-empty content.

        Raises:
            ValueError: If no client or no model is configured, or if the last
                model returned a response without choices or content.
            Exception: Whatever the last model's request raised, re-raised
                unchanged once every model has failed.
        """
        if not self.client:
            raise ValueError("No API client configured. Check your API keys.")

        # Snapshot so the attempt count stays consistent across awaits.
        models = tuple(self.models)
        if not models:
            # Without this guard the loop would not run and the coroutine
            # would silently return None despite the ``-> str`` contract.
            raise ValueError("No models configured for the fallback sequence.")

        params = self._optional_params(response_format, kwargs)
        total = len(models)

        for attempt_num, current_model in enumerate(models, start=1):
            try:
                logger.info(
                    "[LLM] Attempt %d/%d using Model: %s...",
                    attempt_num, total, current_model,
                )

                response = await self.client.chat.completions.create(
                    model=current_model,
                    messages=messages,
                    **params
                )

                if not response or not getattr(response, "choices", None):
                    raise ValueError(f"Invalid response structure from model {current_model}")

                content = response.choices[0].message.content
                if not content:
                    raise ValueError(f"Empty content from model {current_model}")

            except Exception as e:
                # Any failure (transport, API error, malformed/empty reply)
                # moves on to the next model in the sequence.
                logger.warning(
                    "[LLM] FAILED on attempt %d (%s): %s: %s",
                    attempt_num, current_model, type(e).__name__, e,
                )

                if attempt_num >= total:
                    logger.error("[LLM] FINAL FAILURE after %d models.", attempt_num)
                    # Bare raise keeps the original traceback intact.
                    raise

                logger.info("[LLM] Retrying next model in %ss...", self.RETRY_DELAY_SECONDS)
                await asyncio.sleep(self.RETRY_DELAY_SECONDS)

            else:
                logger.info("[LLM] SUCCESS on attempt %d (%s).", attempt_num, current_model)
                return content

# Lazily created module-wide client; stays None until first requested.
_llm_client = None


def get_llm_client() -> MultiLayerLLMClient:
    """Return the shared ``MultiLayerLLMClient``, creating it on first use.

    The instance is cached in the module-level ``_llm_client`` variable, so
    later calls return the same object.
    """
    global _llm_client
    if _llm_client is not None:
        return _llm_client

    _llm_client = MultiLayerLLMClient()
    return _llm_client