File size: 4,547 Bytes
a2cbcac
36b481d
a2cbcac
 
d2f5b87
a2cbcac
 
 
8ed954c
 
36b481d
 
 
 
 
 
 
 
 
 
8ed954c
 
36b481d
8ed954c
36b481d
 
a2cbcac
8ed954c
 
 
 
a2cbcac
8ed954c
a2cbcac
8ed954c
 
 
 
 
 
 
 
36b481d
5896d2e
 
8ed954c
a2cbcac
8ed954c
36b481d
 
195fc1b
645673f
 
195fc1b
 
645673f
 
 
 
 
 
 
 
 
 
 
c602f24
645673f
 
 
d2f5b87
36b481d
 
 
 
d2f5b87
 
 
36b481d
 
 
 
 
 
 
 
d2f5b87
 
36b481d
 
 
 
 
 
 
 
 
d2f5b87
 
 
 
 
 
 
 
 
 
 
 
 
36b481d
 
 
d2f5b87
36b481d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d2f5b87
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import os
import time
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnableConfig

# Populate os.environ from a local .env file (if any) at import time, so the
# os.getenv("OPENROUTER_API_KEY") lookups below can find the key.
load_dotenv()

# Process-wide cache for the client built by get_llm(); stays None until the
# first successful get_llm() call.
_llm_instance = None

# Ordered by preference: quality + reliability + speed
# Model ids are sent to the OpenRouter endpoint (see base_url in the functions
# below).  Index 0 is the model used by get_llm() and get_structured_llm();
# invoke_with_fallback() walks the whole list in order.
MODEL_CHAIN = [
    "nvidia/nemotron-3-nano-30b-a3b:free",
    "stepfun/step-3.5-flash:free",
    "arcee-ai/trinity-large-preview:free",
    "google/gemma-3-27b-it:free",
    "meta-llama/llama-3.3-70b-instruct:free",
    "mistralai/mistral-small-3.1-24b-instruct:free",
]


def get_llm() -> ChatOpenAI:
    """Return the shared ``ChatOpenAI`` client, creating it on first use.

    The client is built once for the primary model (``MODEL_CHAIN[0]``)
    against the OpenRouter endpoint with ``temperature=0`` and is then
    cached in the module-level ``_llm_instance``.  This function itself
    performs no model fallback; it always uses the first chain entry.

    Returns:
        The cached ``ChatOpenAI`` instance.

    Raises:
        ValueError: If ``OPENROUTER_API_KEY`` is unset or empty and no
            client has been cached yet.
    """
    global _llm_instance

    if _llm_instance is None:
        openrouter_key = os.getenv("OPENROUTER_API_KEY")

        if not openrouter_key:
            # Imported lazily, only on the failure path.
            from src.core.logger import get_logger

            log = get_logger(__name__)
            log.error("OPENROUTER_API_KEY not found in environment")
            # Log only the *names* of key-like variables to help diagnose a
            # misnamed secret; values are never logged.
            key_like_names = [
                name for name in os.environ if "API" in name or "KEY" in name
            ]
            log.error("Available key-like env vars: %s", key_like_names)
            raise ValueError("OPENROUTER_API_KEY not found. Check your secrets.")

        _llm_instance = ChatOpenAI(
            model=MODEL_CHAIN[0],
            api_key=openrouter_key,
            base_url="https://openrouter.ai/api/v1",
            temperature=0,
        )

    return _llm_instance


def get_structured_llm(max_tokens: int = 65536) -> ChatOpenAI:
    """Build a fresh ``ChatOpenAI`` client with a large output budget.

    Unlike ``get_llm`` the result is not cached: every call constructs a
    new client for the primary model (``MODEL_CHAIN[0]``) with
    ``temperature=0``, the given ``max_tokens`` and ``request_timeout=120``.
    The large default is intended to leave reasoning models room to think
    before emitting a complete structured JSON answer.

    Args:
        max_tokens: Value passed through as the client's ``max_tokens``.

    Returns:
        A newly constructed ``ChatOpenAI`` instance.

    Raises:
        ValueError: If ``OPENROUTER_API_KEY`` is unset or empty.
    """
    openrouter_key = os.getenv("OPENROUTER_API_KEY")
    if openrouter_key:
        client_options = {
            "model": MODEL_CHAIN[0],
            "api_key": openrouter_key,
            "base_url": "https://openrouter.ai/api/v1",
            "temperature": 0,
            "max_tokens": max_tokens,
            "request_timeout": 120,
        }
        return ChatOpenAI(**client_options)

    raise ValueError("OPENROUTER_API_KEY not found.")


def invoke_with_fallback(prompt: str, max_retries: int = 2, run_name: str = "llm_call") -> str:
    """Invoke the LLM, falling back through ``MODEL_CHAIN`` on failure.

    Models are tried in chain order.  For each model up to ``max_retries``
    attempts are made:

    * an error whose text contains ``"429"`` (rate limit) waits 2 seconds
      and moves on to the next model;
    * an error whose text contains ``"404"`` (model unavailable) moves on
      to the next model immediately;
    * any other error is logged and retried on the same model after a
      1 second pause, until the attempts for that model are used up.

    Every call is tagged ``model:<id>`` and ``attempt:<n>`` and carries
    ``model_id`` / ``attempt`` / ``fallback_position`` metadata so traces
    can be filtered per model.

    Args:
        prompt: Prompt passed unchanged to ``ChatOpenAI.invoke``.
        max_retries: Attempts per model.  Values below 1 are treated as 1
            so that every model is tried at least once.
        run_name: Run name attached to the call's ``RunnableConfig``.

    Returns:
        The ``content`` of the first successful response.

    Raises:
        ValueError: If ``OPENROUTER_API_KEY`` is unset or empty.
        RuntimeError: If no model produced a response; the last underlying
            exception (if any) is attached as ``__cause__``.
    """
    # Imported lazily, inside the function, as in the rest of this module.
    from src.core.logger import get_logger
    logger = get_logger(__name__)

    api_key = os.getenv("OPENROUTER_API_KEY")
    if not api_key:
        raise ValueError("OPENROUTER_API_KEY not found.")

    # A non-positive retry count would otherwise skip every model and report
    # "all models failed" without a single request having been made.
    attempts_per_model = max(1, max_retries)
    last_error = None

    for position, model_id in enumerate(MODEL_CHAIN):
        for attempt in range(1, attempts_per_model + 1):
            try:
                llm = ChatOpenAI(
                    model=model_id,
                    api_key=api_key,
                    base_url="https://openrouter.ai/api/v1",
                    temperature=0,
                )

                # LangSmith: tag every call with model name + attempt number
                config = RunnableConfig(
                    run_name=run_name,
                    tags=[f"model:{model_id}", f"attempt:{attempt}"],
                    metadata={
                        "model_id": model_id,
                        "attempt": attempt,
                        "fallback_position": position,
                    },
                )

                response = llm.invoke(prompt, config=config)
            except Exception as exc:
                # Broad on purpose: any provider/transport failure should
                # trigger retry/fallback rather than abort the whole chain.
                last_error = exc
                err_str = str(exc)
                if "429" in err_str:
                    logger.warning("Rate-limited on %s (attempt %d), trying next...", model_id, attempt)
                    time.sleep(2)
                    break  # move to next model
                if "404" in err_str:
                    logger.warning("Model %s not available, skipping", model_id)
                    break  # move to next model
                logger.error("LLM error on %s: %s", model_id, exc)
                if attempt < attempts_per_model:
                    time.sleep(1)  # brief pause before retrying the same model
            else:
                logger.info("LLM response from %s (attempt %d)", model_id, attempt)
                return response.content

    # Guard the lookup so an empty chain reports a RuntimeError instead of
    # an unrelated IndexError.
    last_tried = MODEL_CHAIN[-1] if MODEL_CHAIN else None
    raise RuntimeError(
        f"All {len(MODEL_CHAIN)} models failed. Last tried: {last_tried}. Last error: {last_error}"
    ) from last_error