precison9 committed on
Commit
153ac71
·
verified ·
1 Parent(s): c8f4f41

Add llm_client.py — unified multi-provider LLM client

Browse files
Files changed (1) hide show
  1. multeclaw/llm_client.py +381 -0
multeclaw/llm_client.py ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Multeclaw LLM Client — unified interface across OpenAI, Anthropic, HuggingFace, Groq, Ollama.
3
+ Uses native SDKs for maximum control, with LiteLLM as fallback router.
4
+ """
5
+
6
+ import os
7
+ import json
8
+ import time
9
+ import traceback
10
+ from typing import Generator, Optional, Any
11
+ from dataclasses import dataclass
12
+
13
+ from multeclaw.config import Provider, ModelDef, MODEL_REGISTRY
14
+
15
+
16
@dataclass
class LLMResponse:
    """Provider-agnostic result of a single LLM call.

    The first three fields are required; everything else has a default so a
    response can be built from just the text, model and provider.
    """

    content: str
    model: str
    provider: str
    finish_reason: str = "stop"
    input_tokens: int = 0
    output_tokens: int = 0
    latency_ms: float = 0.0
    # None unless an error description was attached to the response.
    error: Optional[str] = None

    @property
    def total_tokens(self) -> int:
        """Return the sum of ``input_tokens`` and ``output_tokens``."""
        consumed, produced = self.input_tokens, self.output_tokens
        return consumed + produced
31
+
32
+
33
class MultiModelClient:
    """
    Unified LLM client supporting multiple providers.

    A model name is looked up in ``MODEL_REGISTRY`` and the request is
    dispatched to the matching backend (OpenAI, Anthropic, HuggingFace, Groq
    or Ollama). ``complete`` returns a standardized ``LLMResponse`` and
    ``stream`` yields text chunks; both report failures in-band (through
    ``LLMResponse.error`` or an error chunk) instead of raising.

    SDK clients are created lazily and cached per provider in
    ``self._clients``; ``set_api_key`` and ``set_ollama_url`` evict the
    affected cache entry so the next call rebuilds it.
    """

    # Groq exposes an OpenAI-compatible endpoint, so the OpenAI SDK is reused.
    _GROQ_BASE_URL = "https://api.groq.com/openai/v1"

    def __init__(self):
        self._clients: dict[str, Any] = {}
        self._api_keys: dict[str, str] = {}
        self._ollama_url: str = "http://localhost:11434"
        self._load_env_keys()

    # ─── Key Management ────────────────────────────────────────────────────
    def _load_env_keys(self):
        """Load API keys from environment variables (unset/empty ones are skipped)."""
        mappings = {
            "openai": "OPENAI_API_KEY",
            "anthropic": "ANTHROPIC_API_KEY",
            "huggingface": "HF_TOKEN",
            "groq": "GROQ_API_KEY",
        }
        for provider, env_var in mappings.items():
            key = os.environ.get(env_var, "")
            if key:
                self._api_keys[provider] = key

    def set_api_key(self, provider: str, key: str):
        """Set an API key for a provider, re-initializing its client."""
        self._api_keys[provider] = key
        self._clients.pop(provider, None)  # Force re-init

    def set_ollama_url(self, url: str):
        """Point the client at a different Ollama server and drop the cached client."""
        self._ollama_url = url
        self._clients.pop("ollama", None)

    def get_available_models(self) -> list[str]:
        """Return registry model names whose provider has a non-empty API key.

        Ollama models are always included because they need no key.
        """
        available = []
        for name, model_def in MODEL_REGISTRY.items():
            provider = model_def.provider.value
            if provider == "ollama" or self._api_keys.get(provider):
                available.append(name)
        return available

    def check_connections(self) -> dict[str, dict]:
        """Report a status entry for every provider with a key, plus Ollama.

        Only OpenAI is actually contacted (``models.list()``); for Anthropic,
        HuggingFace and Groq the result merely confirms that a key is set.
        Ollama is probed via ``GET {ollama_url}/api/tags``.

        Returns:
            Mapping of provider name to ``{"status": ..., "models": ...}``.
        """
        results = {}
        for provider_name, key in self._api_keys.items():
            try:
                if provider_name == "openai":
                    import openai
                    c = openai.OpenAI(api_key=key, timeout=10)
                    c.models.list()
                    results[provider_name] = {"status": "✅ Connected", "models": "Available"}
                elif provider_name == "anthropic":
                    results[provider_name] = {"status": "✅ Key Set", "models": "Available"}
                elif provider_name == "huggingface":
                    from huggingface_hub import InferenceClient
                    # Constructed only to surface import/argument errors; no request is sent.
                    InferenceClient(api_key=key, timeout=10)
                    results[provider_name] = {"status": "✅ Key Set", "models": "Available"}
                elif provider_name == "groq":
                    results[provider_name] = {"status": "✅ Key Set", "models": "Available"}
            except Exception as e:
                results[provider_name] = {"status": f"❌ Error: {str(e)[:80]}", "models": "Unavailable"}

        # Check Ollama
        try:
            import httpx
            r = httpx.get(f"{self._ollama_url}/api/tags", timeout=5)
            if r.status_code == 200:
                models = [m["name"] for m in r.json().get("models", [])]
                results["ollama"] = {"status": "✅ Running", "models": ", ".join(models[:5]) or "None"}
            else:
                results["ollama"] = {"status": "⚠️ Responded but error", "models": "Unknown"}
        except Exception:
            # Ollama is optional: any failure is reported as "not running".
            results["ollama"] = {"status": "⚪ Not running (optional)", "models": "N/A"}

        return results

    # ─── Client Initialization ─────────────────────────────────────────────
    def _get_openai_client(self):
        """Return the cached OpenAI client, creating it on first use."""
        if "openai" not in self._clients:
            import openai
            self._clients["openai"] = openai.OpenAI(
                api_key=self._api_keys.get("openai", ""),
                timeout=120,
            )
        return self._clients["openai"]

    def _get_anthropic_client(self):
        """Return the cached Anthropic client, creating it on first use."""
        if "anthropic" not in self._clients:
            import anthropic
            self._clients["anthropic"] = anthropic.Anthropic(
                api_key=self._api_keys.get("anthropic", ""),
                timeout=120,
            )
        return self._clients["anthropic"]

    def _get_hf_client(self):
        """Return the cached HuggingFace InferenceClient, creating it on first use."""
        if "huggingface" not in self._clients:
            from huggingface_hub import InferenceClient
            self._clients["huggingface"] = InferenceClient(
                provider="novita",
                api_key=self._api_keys.get("huggingface", ""),
                timeout=120,
            )
        return self._clients["huggingface"]

    def _get_groq_client(self):
        """Return the cached Groq client (OpenAI SDK pointed at Groq's endpoint)."""
        if "groq" not in self._clients:
            import openai
            self._clients["groq"] = openai.OpenAI(
                api_key=self._api_keys.get("groq", ""),
                base_url=self._GROQ_BASE_URL,
                timeout=60,
            )
        return self._clients["groq"]

    def _get_ollama_client(self):
        """Return the cached Ollama client (OpenAI SDK pointed at ``{ollama_url}/v1``)."""
        if "ollama" not in self._clients:
            import openai
            self._clients["ollama"] = openai.OpenAI(
                api_key="ollama",  # placeholder value, as in the original code
                base_url=f"{self._ollama_url}/v1",
                timeout=120,
            )
        return self._clients["ollama"]

    # ─── Completion (Non-streaming) ────────────────────────────────────────
    def complete(
        self,
        model_name: str,
        messages: list[dict],
        system_prompt: str = "",
        temperature: float = 0.7,
        max_tokens: int = 4096,
        tools: Optional[list] = None,
    ) -> "LLMResponse":
        """
        Send a completion request to the appropriate provider.

        Args:
            model_name: Key into ``MODEL_REGISTRY``.
            messages: Chat history as ``{"role": ..., "content": ...}`` dicts.
            system_prompt: Optional system prompt; when given it replaces any
                system-role messages in ``messages``.
            temperature: Sampling temperature passed to the provider.
            max_tokens: Output token limit passed to the provider.
            tools: Optional tool definitions; forwarded only to the OpenAI and
                Anthropic backends.

        Returns:
            A standardized ``LLMResponse``. Unknown models, unsupported
            providers and any exception raised by a backend are reported via
            ``LLMResponse.error`` rather than raised.
        """
        if model_name not in MODEL_REGISTRY:
            return LLMResponse(content="", model=model_name, provider="unknown",
                               error=f"Unknown model: {model_name}")

        model_def = MODEL_REGISTRY[model_name]
        provider = model_def.provider
        start = time.time()

        try:
            if provider == Provider.OPENAI:
                return self._complete_openai(model_def, messages, system_prompt, temperature, max_tokens, tools, start)
            elif provider == Provider.ANTHROPIC:
                return self._complete_anthropic(model_def, messages, system_prompt, temperature, max_tokens, tools, start)
            elif provider == Provider.HUGGINGFACE:
                return self._complete_hf(model_def, messages, system_prompt, temperature, max_tokens, start)
            elif provider == Provider.GROQ:
                return self._complete_groq(model_def, messages, system_prompt, temperature, max_tokens, start)
            elif provider == Provider.OLLAMA:
                return self._complete_ollama(model_def, messages, system_prompt, temperature, max_tokens, start)
            else:
                return LLMResponse(content="", model=model_name, provider=provider.value,
                                   error=f"Unsupported provider: {provider}")
        except Exception as e:
            # Top-level boundary: convert any backend failure into an error response.
            return LLMResponse(
                content="", model=model_name, provider=provider.value,
                error=f"{type(e).__name__}: {str(e)}",
                latency_ms=(time.time() - start) * 1000,
            )

    def _complete_openai(self, model_def, messages, system_prompt, temperature, max_tokens, tools, start):
        """Run a chat completion on OpenAI; tool calls are returned as JSON in ``content``."""
        client = self._get_openai_client()
        kwargs = dict(
            model=model_def.model_id,
            messages=self._build_openai_messages(messages, system_prompt),
            temperature=temperature,
            max_tokens=max_tokens,
        )
        if tools:
            kwargs["tools"] = [{"type": "function", "function": t} for t in tools]
            kwargs["tool_choice"] = "auto"
        resp = client.chat.completions.create(**kwargs)

        # Handle tool calls: they replace the text content with a JSON dump.
        content = None
        requested = resp.choices[0].message.tool_calls
        if requested:
            tool_calls = [{"name": tc.function.name, "arguments": tc.function.arguments} for tc in requested]
            content = json.dumps({"tool_calls": tool_calls}, indent=2)
        return self._chat_response(resp, model_def, "openai", start, content=content)

    def _complete_anthropic(self, model_def, messages, system_prompt, temperature, max_tokens, tools, start):
        """Run a completion on Anthropic; text and tool_use blocks are concatenated into ``content``."""
        client = self._get_anthropic_client()
        kwargs = self._anthropic_kwargs(model_def, messages, system_prompt, temperature, max_tokens)
        if tools:
            kwargs["tools"] = [{"name": t["name"], "description": t["description"], "input_schema": t["parameters"]} for t in tools]
        resp = client.messages.create(**kwargs)

        parts = []
        for block in resp.content:
            if hasattr(block, "text"):
                parts.append(block.text)
            elif block.type == "tool_use":
                parts.append(json.dumps({"tool_use": {"name": block.name, "input": block.input, "id": block.id}}, indent=2))
        return LLMResponse(
            content="".join(parts), model=model_def.model_id, provider="anthropic",
            finish_reason=resp.stop_reason or "end_turn",
            input_tokens=resp.usage.input_tokens if resp.usage else 0,
            output_tokens=resp.usage.output_tokens if resp.usage else 0,
            latency_ms=(time.time() - start) * 1000,
        )

    def _complete_hf(self, model_def, messages, system_prompt, temperature, max_tokens, start):
        """Run a chat completion through the HuggingFace InferenceClient."""
        client = self._get_hf_client()
        msgs = self._build_openai_messages(messages, system_prompt)
        # Temperature is floored at 0.01 for this backend, as in the original code.
        resp = client.chat_completion(model=model_def.model_id, messages=msgs, max_tokens=max_tokens, temperature=max(temperature, 0.01))
        return self._chat_response(resp, model_def, "huggingface", start)

    def _complete_groq(self, model_def, messages, system_prompt, temperature, max_tokens, start):
        """Groq uses OpenAI-compatible API."""
        client = self._get_groq_client()
        msgs = self._build_openai_messages(messages, system_prompt)
        resp = client.chat.completions.create(model=model_def.model_id, messages=msgs, temperature=temperature, max_tokens=max_tokens)
        return self._chat_response(resp, model_def, "groq", start)

    def _complete_ollama(self, model_def, messages, system_prompt, temperature, max_tokens, start):
        """Ollama uses OpenAI-compatible API."""
        client = self._get_ollama_client()
        msgs = self._build_openai_messages(messages, system_prompt)
        resp = client.chat.completions.create(model=model_def.model_id, messages=msgs, temperature=temperature, max_tokens=max_tokens)
        return self._chat_response(resp, model_def, "ollama", start)

    # ─── Streaming Completion ──────────────────────────────────────────────
    def stream(
        self,
        model_name: str,
        messages: list[dict],
        system_prompt: str = "",
        temperature: float = 0.7,
        max_tokens: int = 4096,
    ) -> Generator[str, None, None]:
        """
        Stream a completion. Yields partial text chunks.

        Unknown models, unsupported providers and backend exceptions are
        reported by yielding a single "❌ ..." text chunk instead of raising,
        so a consumer can render them inline.
        """
        if model_name not in MODEL_REGISTRY:
            yield f"❌ Unknown model: {model_name}"
            return

        model_def = MODEL_REGISTRY[model_name]
        provider = model_def.provider

        try:
            if provider == Provider.OPENAI:
                yield from self._stream_openai(model_def, messages, system_prompt, temperature, max_tokens)
            elif provider == Provider.ANTHROPIC:
                yield from self._stream_anthropic(model_def, messages, system_prompt, temperature, max_tokens)
            elif provider == Provider.HUGGINGFACE:
                yield from self._stream_hf(model_def, messages, system_prompt, temperature, max_tokens)
            elif provider == Provider.GROQ:
                yield from self._stream_groq(model_def, messages, system_prompt, temperature, max_tokens)
            elif provider == Provider.OLLAMA:
                yield from self._stream_ollama(model_def, messages, system_prompt, temperature, max_tokens)
            else:
                yield f"❌ Unsupported provider for streaming: {provider}"
        except Exception as e:
            yield f"\n\n❌ **Streaming Error** ({type(e).__name__}): {str(e)}"

    def _stream_openai(self, model_def, messages, system_prompt, temperature, max_tokens):
        """Stream text deltas from OpenAI."""
        yield from self._stream_chat(self._get_openai_client(), model_def, messages, system_prompt, temperature, max_tokens)

    def _stream_anthropic(self, model_def, messages, system_prompt, temperature, max_tokens):
        """Stream text deltas from Anthropic using the SDK's ``messages.stream`` helper."""
        client = self._get_anthropic_client()
        kwargs = self._anthropic_kwargs(model_def, messages, system_prompt, temperature, max_tokens)
        with client.messages.stream(**kwargs) as stream:
            for text in stream.text_stream:
                yield text

    def _stream_hf(self, model_def, messages, system_prompt, temperature, max_tokens):
        """Stream text deltas from the HuggingFace InferenceClient."""
        client = self._get_hf_client()
        msgs = self._build_openai_messages(messages, system_prompt)
        stream = client.chat_completion(
            model=model_def.model_id, messages=msgs, max_tokens=max_tokens,
            temperature=max(temperature, 0.01), stream=True,
        )
        yield from self._iter_chunk_text(stream)

    def _stream_groq(self, model_def, messages, system_prompt, temperature, max_tokens):
        """Stream text deltas from Groq (OpenAI-compatible API)."""
        yield from self._stream_chat(self._get_groq_client(), model_def, messages, system_prompt, temperature, max_tokens)

    def _stream_ollama(self, model_def, messages, system_prompt, temperature, max_tokens):
        """Stream text deltas from Ollama (OpenAI-compatible API)."""
        yield from self._stream_chat(self._get_ollama_client(), model_def, messages, system_prompt, temperature, max_tokens)

    def _stream_chat(self, client, model_def, messages, system_prompt, temperature, max_tokens):
        """Issue a streaming chat completion on an OpenAI-SDK client and yield its text deltas."""
        stream = client.chat.completions.create(
            model=model_def.model_id,
            messages=self._build_openai_messages(messages, system_prompt),
            temperature=temperature,
            max_tokens=max_tokens,
            stream=True,
        )
        yield from self._iter_chunk_text(stream)

    # ─── Helpers ───────────────────────────────────────────────────────────
    @staticmethod
    def _iter_chunk_text(stream):
        """Yield the non-empty ``delta.content`` of each chunk in an OpenAI-shaped stream.

        Chunks whose ``choices`` list is empty are skipped instead of raising
        ``IndexError``; chunks whose delta carries no text are skipped too.
        """
        for chunk in stream:
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta.content
            if delta:
                yield delta

    @staticmethod
    def _chat_response(resp, model_def, provider, start, content=None):
        """Build an ``LLMResponse`` from an OpenAI-shaped chat completion.

        Args:
            resp: Response exposing ``choices[0].message``/``finish_reason`` and ``usage``.
            model_def: Registry entry; only ``model_id`` is read.
            provider: Provider label stored on the response.
            start: ``time.time()`` value taken when the request began.
            content: Overrides the message text when not ``None``.
        """
        choice = resp.choices[0]
        usage = resp.usage
        if content is None:
            content = choice.message.content or ""
        return LLMResponse(
            content=content, model=model_def.model_id, provider=provider,
            finish_reason=choice.finish_reason or "stop",
            input_tokens=usage.prompt_tokens if usage else 0,
            output_tokens=usage.completion_tokens if usage else 0,
            latency_ms=(time.time() - start) * 1000,
        )

    @staticmethod
    def _anthropic_kwargs(model_def, messages, system_prompt, temperature, max_tokens):
        """Build the keyword arguments shared by Anthropic ``create`` and ``stream``.

        Anthropic takes the system prompt as a top-level parameter, so
        system-role messages are removed from ``messages``. ``system_prompt``
        wins when given; otherwise the string contents of the removed system
        messages are joined and used, so the instruction is not lost.
        """
        system_texts = []
        chat = []
        for m in messages:
            if m.get("role") == "system":
                if isinstance(m.get("content"), str):
                    system_texts.append(m["content"])
            else:
                chat.append(m)
        kwargs = dict(model=model_def.model_id, messages=chat, max_tokens=max_tokens, temperature=temperature)
        system = system_prompt or "\n\n".join(system_texts)
        if system:
            kwargs["system"] = system
        return kwargs

    @staticmethod
    def _build_openai_messages(messages: list[dict], system_prompt: str = "") -> list[dict]:
        """Build OpenAI-format message list with system prompt prepended.

        Each message is reduced to its ``role`` and ``content`` keys. When
        ``system_prompt`` is given, system-role messages in ``messages`` are
        dropped to avoid duplicates; when it is empty they are kept.
        """
        msgs = []
        if system_prompt:
            msgs.append({"role": "system", "content": system_prompt})
        for m in messages:
            if system_prompt and m.get("role") == "system":
                continue  # Avoid duplicate system messages
            msgs.append({"role": m["role"], "content": m["content"]})
        return msgs