Pawan Mane committed on
Commit
4cc24b5
·
1 Parent(s): 8986591

LLM Changes

Browse files
Files changed (1) hide show
  1. app/utils/llm.py +123 -11
app/utils/llm.py CHANGED
@@ -1,27 +1,139 @@
1
  """
2
  app/utils/llm.py
3
  ────────────────
4
- LLM singleton factory.
5
- Import `llm` and `llm_with_tools` from here β€” never instantiate ChatGroq elsewhere.
 
 
 
 
 
 
 
 
6
  """
7
 
 
 
8
  from langchain_groq import ChatGroq
9
  from app.config import settings
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
- def _build_llm() -> ChatGroq:
 
 
 
 
 
 
 
13
  return ChatGroq(
14
- model=settings.LLM_MODEL,
15
  temperature=settings.LLM_TEMPERATURE,
16
  api_key=settings.GROQ_API_KEY,
17
  )
18
 
19
 
20
- # Plain LLM β€” used by router, evaluator, memory summariser
21
- llm = _build_llm()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
- # Lazy-bound version with tools (tools are registered after this module loads)
24
- # Call get_llm_with_tools() after tools are imported.
25
- def get_llm_with_tools(tools: list) -> ChatGroq:
26
- """Return an LLM instance with the given tools bound."""
27
- return llm.bind_tools(tools)
 
1
  """
2
  app/utils/llm.py
3
  ────────────────
4
+ LLM singleton with automatic model fallback chain.
5
+
6
+ When a model hits its rate limit (429), the client transparently
7
+ tries the next model in the FALLBACK_MODELS list.
8
+
9
+ Fallback order (separate daily token quotas on Groq free tier):
10
+ 1. Primary model from config (default: llama-3.3-70b-versatile, 500k TPD)
11
+ 2. llama-3.1-8b-instant (500k TPD)
12
+ 3. openai/gpt-oss-120b (100k TPD)
13
+ 4. meta-llama/llama-4-scout-17b-16e-instruct (100k TPD)
14
  """
15
 
16
+ import re
17
+ import time
18
  from langchain_groq import ChatGroq
19
  from app.config import settings
20
 
21
# ── Fallback chain ─────────────────────────────────────────────────────────
# Primary is whatever LLM_MODEL is set to in .env / HF Secrets.
# The rest are tried in order when the current one is rate-limited.
# dict.fromkeys deduplicates while preserving insertion order — the primary
# model from config may coincide with one of the hard-coded fallbacks, and we
# must not leave a throwaway `seen` set in the module namespace.
FALLBACK_MODELS = list(dict.fromkeys([
    settings.LLM_MODEL,
    "llama-3.1-8b-instant",
    "openai/gpt-oss-120b",
    "meta-llama/llama-4-scout-17b-16e-instruct",
]))
33
+
34
+ _RATE_LIMIT_RE = re.compile(r'try again in\s+(?:(\d+)m)?(?:([\d.]+)s)?', re.IGNORECASE)
35
+
36
+
37
+ def _is_rate_limit(error: Exception) -> bool:
38
+ return "429" in str(error) or "rate_limit_exceeded" in str(error)
39
+
40
 
41
+ def _parse_wait(error: Exception) -> float:
42
+ m = _RATE_LIMIT_RE.search(str(error))
43
+ if m:
44
+ return float(m.group(1) or 0) * 60 + float(m.group(2) or 0)
45
+ return 30.0
46
+
47
+
48
def _build(model: str) -> ChatGroq:
    """Create a fresh ChatGroq client for *model*.

    Temperature and API key always come from the shared settings object, so
    clients built for different fallback models differ only in the model name.
    """
    params = {
        "model": model,
        "temperature": settings.LLM_TEMPERATURE,
        "api_key": settings.GROQ_API_KEY,
    }
    return ChatGroq(**params)
54
 
55
 
56
# ── FallbackLLM wrapper ────────────────────────────────────────────────────

class FallbackLLM:
    """
    Drop-in replacement for a ChatGroq instance.
    On 429, switches to the next model in the chain automatically.
    Remembers which model is currently active across calls.

    NOTE(review): not thread-safe — _index/_client are mutated without a
    lock; confirm all callers share one event loop / thread.
    """

    def __init__(self):
        # Position in FALLBACK_MODELS of the currently active model.
        self._index: int = 0  # index into FALLBACK_MODELS
        self._client = _build(FALLBACK_MODELS[0])
        print(f"[LLM] Active model: {FALLBACK_MODELS[0]}")

    @property
    def current_model(self) -> str:
        # Name of the model backing self._client right now.
        return FALLBACK_MODELS[self._index]

    def _next_model(self, error: Exception) -> bool:
        """Switch to next model. Returns False if all exhausted."""
        # Parse the server-suggested wait even when we are not going to sleep,
        # so the log line can report it.
        wait = _parse_wait(error)
        print(f"[LLM] ⚠ {self.current_model} rate-limited — trying next model (wait would be {wait:.0f}s)")

        self._index += 1
        if self._index >= len(FALLBACK_MODELS):
            self._index = 0  # full rotation — wait on primary
            mins, secs = int(wait // 60), int(wait % 60)
            print(f"[LLM] All models exhausted. Waiting {mins}m {secs}s for {self.current_model}...")
            # Block the calling thread until the primary's quota window should
            # have passed (+2s slack), then rebuild the primary client so the
            # NEXT call starts from model 0 again.
            time.sleep(wait + 2)
            self._client = _build(FALLBACK_MODELS[0])
            return False

        # Still models left in the chain: rebuild the client for the next one.
        self._client = _build(FALLBACK_MODELS[self._index])
        print(f"[LLM] ✓ Switched to: {self.current_model}")
        return True

    def invoke(self, messages, **kwargs):
        """Proxy to the active client's invoke(), rotating models on 429.

        Loops until some model answers. When the whole chain is exhausted,
        _next_model() has already slept out the rate-limit window, and the
        final 429 is re-raised to the caller.
        """
        while True:
            try:
                return self._client.invoke(messages, **kwargs)
            except Exception as e:
                if _is_rate_limit(e):
                    exhausted = not self._next_model(e)
                    if exhausted:
                        raise  # re-raise after waiting on primary
                else:
                    # Non-rate-limit errors are never retried here.
                    raise

    def bind_tools(self, tools):
        """Return a bound-tools version that also falls back on rate limit."""
        return FallbackLLMWithTools(self, tools)

    # Passthrough for any other ChatGroq attributes callers might use.
    # Only triggered when normal attribute lookup fails, so _client/_index
    # themselves never recurse through here.
    def __getattr__(self, name):
        return getattr(self._client, name)
111
+
112
+
113
class FallbackLLMWithTools:
    """Wraps FallbackLLM for tool-calling routes."""

    def __init__(self, parent: FallbackLLM, tools: list):
        # Keep a handle on the parent so we always bind against whichever
        # underlying client is currently active.
        self._parent = parent
        self._tools = tools

    def invoke(self, messages, **kwargs):
        """Invoke with tools bound, reusing the parent's fallback rotation."""
        while True:
            try:
                # Re-bind every attempt: the parent's client object is
                # replaced whenever a fallback model is selected.
                client = self._parent._client.bind_tools(self._tools)
                return client.invoke(messages, **kwargs)
            except Exception as err:
                if not _is_rate_limit(err):
                    raise
                if not self._parent._next_model(err):
                    # Chain exhausted — parent already waited on primary.
                    raise
132
+
133
+
134
# ── Singletons ─────────────────────────────────────────────────────────────

# Shared module-level instance — import `llm` from here instead of building
# ChatGroq elsewhere; constructing FallbackLLM also prints the active model.
llm = FallbackLLM()


def get_llm_with_tools(tools: list) -> FallbackLLMWithTools:
    """Return the singleton LLM with *tools* bound.

    Call this AFTER the tool modules have been imported, so the list passed
    in is complete; binding is cheap and done per call.
    """
    return llm.bind_tools(tools)