Spaces:
Sleeping
Sleeping
File size: 4,547 Bytes
import os
import time
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnableConfig
load_dotenv()
# Cached client handed out by get_llm(); remains None until it is first built.
_llm_instance = None

# Model identifiers to try, most preferred first
# (ranked on quality, reliability and speed).
MODEL_CHAIN = [
    "nvidia/nemotron-3-nano-30b-a3b:free",
    "stepfun/step-3.5-flash:free",
    "arcee-ai/trinity-large-preview:free",
    "google/gemma-3-27b-it:free",
    "meta-llama/llama-3.3-70b-instruct:free",
    "mistralai/mistral-small-3.1-24b-instruct:free",
]
def get_llm() -> ChatOpenAI:
    """Return the shared ``ChatOpenAI`` client, building it on first use.

    The client is created once, bound to the first entry of ``MODEL_CHAIN``
    and to the OpenRouter base URL, and then cached in the module-level
    ``_llm_instance``. Later calls return that cached object. This function
    itself never switches to another model.

    Returns:
        The cached ``ChatOpenAI`` instance.

    Raises:
        ValueError: If ``OPENROUTER_API_KEY`` is unset or empty.
    """
    global _llm_instance

    if _llm_instance is None:
        openrouter_key = os.getenv("OPENROUTER_API_KEY")
        if not openrouter_key:
            # Logger is imported lazily, only on this failure path.
            from src.core.logger import get_logger

            log = get_logger(__name__)
            log.error("OPENROUTER_API_KEY not found in environment")
            # Only variable *names* are logged, to help spot a misnamed secret.
            key_like_names = [
                name for name in os.environ if "API" in name or "KEY" in name
            ]
            log.error("Available key-like env vars: %s", key_like_names)
            raise ValueError("OPENROUTER_API_KEY not found. Check your secrets.")

        _llm_instance = ChatOpenAI(
            model=MODEL_CHAIN[0],
            api_key=openrouter_key,
            base_url="https://openrouter.ai/api/v1",
            temperature=0,
        )

    return _llm_instance
def get_structured_llm(max_tokens: int = 65536) -> ChatOpenAI:
    """Build a new ``ChatOpenAI`` client intended for structured output.

    Unlike ``get_llm`` the result is not cached: every call constructs a
    fresh client for the first entry of ``MODEL_CHAIN``. The large default
    ``max_tokens`` is meant to leave reasoning models room to think before
    emitting the complete structured JSON.

    Args:
        max_tokens: Token limit forwarded to the client.

    Returns:
        A newly constructed ``ChatOpenAI`` instance.

    Raises:
        ValueError: If ``OPENROUTER_API_KEY`` is unset or empty.
    """
    openrouter_key = os.getenv("OPENROUTER_API_KEY")
    if openrouter_key:
        client_options = {
            "model": MODEL_CHAIN[0],
            "api_key": openrouter_key,
            "base_url": "https://openrouter.ai/api/v1",
            "temperature": 0,
            "max_tokens": max_tokens,
            "request_timeout": 120,
        }
        return ChatOpenAI(**client_options)

    raise ValueError("OPENROUTER_API_KEY not found.")
def invoke_with_fallback(prompt: str, max_retries: int = 2, run_name: str = "llm_call") -> str:
    """Invoke the LLM, walking ``MODEL_CHAIN`` until one model answers.

    Each model gets up to ``max_retries`` attempts. An error whose text
    contains ``429`` (rate limit) or ``404`` (model unavailable) abandons
    that model immediately; any other error is retried after a short pause.
    Every call carries ``model:<id>`` and ``attempt:<n>`` tags, plus matching
    metadata, in its ``RunnableConfig`` so runs can be filtered by model.

    Args:
        prompt: Input passed unchanged to ``ChatOpenAI.invoke``.
        max_retries: Maximum attempts per model before moving on.
        run_name: Run name placed in the ``RunnableConfig``.

    Returns:
        The ``content`` of the first successful response.

    Raises:
        ValueError: If ``OPENROUTER_API_KEY`` is unset or empty.
        RuntimeError: If no model produced a response. The last underlying
            error, when there is one, is attached as ``__cause__``.
    """
    # Validate configuration first so a missing key fails fast.
    api_key = os.getenv("OPENROUTER_API_KEY")
    if not api_key:
        raise ValueError("OPENROUTER_API_KEY not found.")

    from src.core.logger import get_logger

    logger = get_logger(__name__)

    last_error = None
    last_model = None
    final_position = len(MODEL_CHAIN) - 1

    for position, model_id in enumerate(MODEL_CHAIN):
        last_model = model_id
        for attempt in range(1, max_retries + 1):
            try:
                llm = ChatOpenAI(
                    model=model_id,
                    api_key=api_key,
                    base_url="https://openrouter.ai/api/v1",
                    temperature=0,
                )
                # LangSmith: tag every call with model name + attempt number
                config = RunnableConfig(
                    run_name=run_name,
                    tags=[f"model:{model_id}", f"attempt:{attempt}"],
                    metadata={
                        "model_id": model_id,
                        "attempt": attempt,
                        "fallback_position": position,
                    },
                )
                response = llm.invoke(prompt, config=config)
            except Exception as exc:  # broad on purpose: any failure triggers fallback
                last_error = exc
                err_str = str(exc)
                if "429" in err_str:
                    logger.warning("Rate-limited on %s (attempt %d), trying next...", model_id, attempt)
                    # Pausing only helps when another model is left to try.
                    if position < final_position:
                        time.sleep(2)
                    break  # move to next model
                if "404" in err_str:
                    logger.warning("Model %s not available, skipping", model_id)
                    break  # move to next model
                logger.error("LLM error on %s: %s", model_id, exc)
                if attempt < max_retries:
                    time.sleep(1)
            else:
                logger.info("LLM response from %s (attempt %d)", model_id, attempt)
                return response.content

    # ``last_model`` equals MODEL_CHAIN[-1] whenever the chain is non-empty,
    # and avoids an IndexError masking the real failure when it is empty.
    raise RuntimeError(
        f"All {len(MODEL_CHAIN)} models failed. Last tried: {last_model}. Last error: {last_error}"
    ) from last_error
|