Spaces:
Running
Running
| """ | |
| Multi-Model LLM Client (Python) — All FREE on NVIDIA NIM | |
| 3 models, 1 provider, 1 API key, $0 cost: | |
| 1. MiniMax M2.7 — Best reasoning, 4M context, built-in CoT | |
| 2. LLaMA 3.3 70B — Reliable fallback | |
| 3. LLaMA 3.1 8B — Fast, simple tasks | |
| 4. Deterministic — Zero-LLM fallback | |
| """ | |
| import time | |
| import json | |
| import hashlib | |
| import logging | |
| from typing import Optional | |
| from openai import AsyncOpenAI | |
| from config import settings | |
logger = logging.getLogger(__name__)

# ─── Model configs (ALL on NVIDIA NIM) ───────────────────────
# Order matters: call_llm indexes this list with `model_index`, starting at 0
# and moving to the next entry whenever the current model fails.
MODEL_CONFIGS = [
    dict(
        name="MiniMax M2.7",
        model="minimaxai/minimax-m2.7",
        max_context=4_000_000,
        best_for="profiling, scoring, complex reasoning",
    ),
    dict(
        name="LLaMA 3.3 70B",
        model="meta/llama-3.3-70b-instruct",
        max_context=128_000,
        best_for="general tasks, reliable fallback",
    ),
    dict(
        name="LLaMA 3.1 8B",
        model="meta/llama-3.1-8b-instruct",
        max_context=128_000,
        best_for="email classification, simple checks",
    ),
]
# ─── Shared client (single provider) ─────────────────────────
# Module-level cache; populated by the first get_client() call.
_client: Optional[AsyncOpenAI] = None


def get_client() -> AsyncOpenAI:
    """Return the shared ``AsyncOpenAI`` client, creating it on first use.

    The client is built from ``settings.NVIDIA_NIM_BASE_URL`` and
    ``settings.NVIDIA_API_KEY`` and then reused by every later call.
    """
    global _client
    if _client is not None:
        return _client
    _client = AsyncOpenAI(
        base_url=settings.NVIDIA_NIM_BASE_URL,
        api_key=settings.NVIDIA_API_KEY,
    )
    return _client
# ─── Main LLM call ───────────────────────────────────────────
# How many times one model is retried after an HTTP 429 before the next
# model in MODEL_CONFIGS is tried.
_MAX_RATE_LIMIT_RETRIES = 3


async def call_llm(
    operation: str,
    system_prompt: str,
    user_prompt: str,
    model_index: int = 0,
    temperature: float = 0.2,
    max_tokens: int = 1024,
    json_mode: bool = True,
    trace_id: str = "",
    company_id: Optional[str] = None,
) -> dict:
    """Call LLM with fallback: MiniMax → LLaMA 70B → LLaMA 8B → Deterministic.

    Models are tried in ``MODEL_CONFIGS`` order starting at ``model_index``.
    A model is abandoned when the request raises, or when ``json_mode`` is on
    and the reply cannot be parsed as JSON. A rate-limit error (HTTP 429)
    waits 10 seconds and retries the same model, at most
    ``_MAX_RATE_LIMIT_RETRIES`` times, so a persistent rate limit can no
    longer retry forever.

    Args:
        operation: Label used in log messages and in the trace record.
        system_prompt: Content of the ``system`` message.
        user_prompt: Content of the ``user`` message.
        model_index: Index into ``MODEL_CONFIGS`` of the first model to try.
        temperature: Sampling temperature sent with the request.
        max_tokens: Completion token limit sent with the request.
        json_mode: When true, request a JSON object response and parse it.
        trace_id: Identifier forwarded to the trace record.
        company_id: Optional identifier forwarded to the trace record.

    Returns:
        A dict with keys ``content``, ``reasoning``, ``parsed``, ``model``,
        ``provider``, ``tokens``, ``latency_ms`` and ``fallback_used``.
        ``fallback_used`` is True only for the deterministic fallback that is
        returned once every model has failed.
    """
    rate_limit_retries = 0

    while model_index < len(MODEL_CONFIGS):
        config = MODEL_CONFIGS[model_index]
        client = get_client()
        # Monotonic clock: latency must not be affected by wall-clock changes.
        start = time.perf_counter()
        try:
            kwargs = {
                "model": config["model"],
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                "temperature": temperature,
                "max_tokens": max_tokens,
                "top_p": 0.9,
            }
            if json_mode:
                kwargs["response_format"] = {"type": "json_object"}

            response = await client.chat.completions.create(**kwargs)
            message = response.choices[0].message
            content = message.content or ""
            # Not every model returns this attribute, hence getattr.
            reasoning = getattr(message, "reasoning_content", None)
            usage = response.usage
            latency_ms = int((time.perf_counter() - start) * 1000)

            parsed = _safe_parse_json(content) if json_mode else None
            if json_mode and parsed is None:
                logger.warning(f"JSON parse failed on {config['name']} β next model")
                model_index += 1
                rate_limit_retries = 0
                continue

            result = {
                "content": content,
                "reasoning": reasoning,
                "parsed": parsed,
                "model": config["name"],
                "provider": "nvidia",
                "tokens": {
                    "prompt": usage.prompt_tokens if usage else 0,
                    "completion": usage.completion_tokens if usage else 0,
                    "total": usage.total_tokens if usage else 0,
                },
                "latency_ms": latency_ms,
                "fallback_used": False,
            }
            if reasoning:
                logger.debug(f"MiniMax reasoning: {reasoning[:150]}...")
            await _log_trace(trace_id, operation, config["name"], result, True, company_id)
            return result
        except Exception as e:
            # Deliberately broad: any failure of this model moves on to the
            # next one instead of propagating to the caller.
            error_msg = str(e)
            rate_limited = (
                getattr(e, "status_code", None) == 429 or "429" in error_msg
            )
            if rate_limited and rate_limit_retries < _MAX_RATE_LIMIT_RETRIES:
                rate_limit_retries += 1
                logger.warning(f"Rate limited on {config['name']} β waiting 10s")
                await _async_sleep(10)
                continue
            logger.warning(f"{config['name']} failed ({error_msg[:80]}) β next model")
            model_index += 1
            rate_limit_retries = 0

    logger.error(f"ALL models failed for {operation} β deterministic fallback")
    return _deterministic_fallback()
def _deterministic_fallback() -> dict:
    """Build the empty result returned when no model produced an answer.

    It has the same keys as a successful ``call_llm`` result, with empty
    content, zero token counts and ``fallback_used`` set to True.
    """
    zero_usage = dict.fromkeys(("prompt", "completion", "total"), 0)
    placeholder = dict(
        content="",
        reasoning=None,
        parsed=None,
        model="deterministic_fallback",
        provider="none",
        tokens=zero_usage,
        latency_ms=0,
        fallback_used=True,
    )
    return placeholder
# ─── Self-consistency check ──────────────────────────────────
async def call_with_consistency(
    operation: str,
    system_prompt: str,
    user_prompt: str,
    trace_id: str = "",
    company_id: str = None,
) -> dict:
    """Run ``call_llm`` and attach a consistency verdict to its result.

    The returned dict is the first call's result plus ``is_consistent`` and
    ``consistency_score``:

    * operations other than "profile" / "score": consistent, score 1.0;
    * deterministic fallback result: consistent, score 0.5;
    * MiniMax M2.7 result that carries reasoning: consistent, score 0.95;
    * otherwise a second call at temperature 0.4 is made and both parsed
      outputs are compared; consistent when the score is at least 0.75.
    """
    trace_args = dict(trace_id=trace_id, company_id=company_id)
    first = await call_llm(
        operation, system_prompt, user_prompt, temperature=0.1, **trace_args
    )

    def with_verdict(consistent, score):
        # Copy of the first result extended with the two verdict fields.
        return dict(first, is_consistent=consistent, consistency_score=score)

    needs_check = operation in ("profile", "score")
    if not needs_check:
        return with_verdict(True, 1.0)

    if first.get("fallback_used"):
        return with_verdict(True, 0.5)

    # MiniMax with reasoning = inherently more consistent
    from_minimax = first.get("model") == "MiniMax M2.7"
    if from_minimax and first.get("reasoning"):
        return with_verdict(True, 0.95)

    second = await call_llm(
        operation, system_prompt, user_prompt, temperature=0.4, **trace_args
    )
    agreement = _compare_outputs(first.get("parsed"), second.get("parsed"))
    return with_verdict(agreement >= 0.75, agreement)
def _compare_outputs(
    a: Optional[dict],
    b: Optional[dict],
    score_tolerance: float = 10,
) -> float:
    """Score how closely two parsed LLM outputs agree, from 0.0 to 1.0.

    Categorical fields ("ai_readiness", "tier", "service_match") are compared
    when both outputs contain them and must be equal. Numeric fields
    ("total_score", "company_fit") are compared when both values are numbers
    and must differ by at most ``score_tolerance``.

    Args:
        a: First parsed output, or None when parsing failed.
        b: Second parsed output, or None when parsing failed.
        score_tolerance: Largest absolute difference at which two numeric
            fields still count as matching.

    Returns:
        0.5 when either output is missing, empty, or not a dict; 1.0 when no
        field could be compared; otherwise matching fields / compared fields.
    """
    # A non-dict (e.g. a parsed JSON array) has no .get(); treat it like a
    # missing output instead of raising AttributeError.
    if not isinstance(a, dict) or not isinstance(b, dict) or not a or not b:
        return 0.5

    outcomes = []
    for key in ("ai_readiness", "tier", "service_match"):
        if key in a and key in b:
            outcomes.append(bool(a[key] == b[key]))

    for key in ("total_score", "company_fit"):
        av, bv = a.get(key), b.get(key)
        if isinstance(av, (int, float)) and isinstance(bv, (int, float)):
            outcomes.append(abs(av - bv) <= score_tolerance)

    if not outcomes:
        return 1.0
    return sum(outcomes) / len(outcomes)
# ─── Helpers ─────────────────────────────────────────────────
def _safe_parse_json(text: str) -> Optional[dict]:
    """Extract a JSON object from model output, or return None.

    Handles three shapes: bare JSON, JSON inside a Markdown code fence
    (with or without a ``json`` tag), and JSON embedded in surrounding prose
    (the span from the first ``{`` to the last ``}`` is tried).

    Args:
        text: Raw model output.

    Returns:
        The decoded object as a dict. None when ``text`` is not a string,
        nothing decodes, or the decoded value is valid JSON but not an object
        (array, number, string, ...), so callers can rely on getting a dict.
    """
    import re  # local import: `re` is not in the file's visible import block

    if not isinstance(text, str):
        return None

    content = text.strip()
    if "```json" in content:
        content = content.split("```json")[1].split("```")[0].strip()
    elif "```" in content:
        content = content.split("```")[1].split("```")[0].strip()

    try:
        parsed = json.loads(content)
    except json.JSONDecodeError:
        # Fall back to the outermost brace-delimited span inside the text.
        match = re.search(r'\{[\s\S]*\}', content)
        if match is None:
            return None
        try:
            parsed = json.loads(match.group())
        except json.JSONDecodeError:
            return None

    return parsed if isinstance(parsed, dict) else None
def _insert_trace_row(row: dict) -> None:
    """Synchronously insert one row into the ``llm_traces`` Supabase table."""
    from supabase import create_client

    sb = create_client(settings.SUPABASE_URL, settings.SUPABASE_SERVICE_ROLE_KEY)
    sb.table("llm_traces").insert(row).execute()


async def _log_trace(trace_id, operation, model, result, success, company_id):
    """Record one LLM call in the ``llm_traces`` table (best effort).

    The insert uses a synchronous client, so it runs in the default executor
    and does not block the event loop while the request is in flight. Any
    failure is logged at debug level and swallowed; tracing never breaks the
    caller.

    Args:
        trace_id: Identifier stored with the row.
        operation: Operation label stored with the row.
        model: Display name of the model that produced ``result``.
        result: ``call_llm`` result dict; a falsy value is recorded as zero
            tokens, zero latency and ``fallback_used`` True.
        success: Stored as the row's ``success`` flag.
        company_id: Stored as the row's ``company_id``.
    """
    import asyncio  # local import, matching how this file imports asyncio

    try:
        row = {
            "trace_id": trace_id,
            "operation": operation,
            "model": model,
            "provider": "nvidia",
            "prompt_tokens": result["tokens"]["prompt"] if result else 0,
            "completion_tokens": result["tokens"]["completion"] if result else 0,
            "total_tokens": result["tokens"]["total"] if result else 0,
            "latency_ms": result.get("latency_ms", 0) if result else 0,
            "success": success,
            "fallback_used": result.get("fallback_used", True) if result else True,
            "company_id": company_id,
        }
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(None, _insert_trace_row, row)
    except Exception as e:
        logger.debug(f"Trace log failed (non-critical): {e}")
async def _async_sleep(seconds: int):
    """Suspend the current coroutine for ``seconds`` seconds."""
    from asyncio import sleep

    await sleep(seconds)