/**
 * Multi-Model LLM Client — All FREE on NVIDIA NIM
 *
 * 3 models, 1 provider, 1 API key, $0 cost:
 *
 *   Priority 1: MiniMax M2.7   → Best reasoning, 4M context, built-in CoT
 *   Priority 2: LLaMA 3.3 70B  → Reliable, proven, 128K context
 *   Priority 3: LLaMA 3.1 8B   → Fast, cheap, for simple tasks
 *   Priority 4: Deterministic  → Zero LLM, zero hallucination
 *
 * All on:  https://integrate.api.nvidia.com/v1
 * All use: same NVIDIA_API_KEY
 *
 * MiniMax M2.7 special feature:
 *   Response includes `reasoning_content` field — chain-of-thought
 *   reasoning happens AUTOMATICALLY inside the model.
 *   We don't need to prompt "think step by step" — it does it natively.
 */

import axios, { AxiosError } from "axios";
import { createHash } from "crypto";
import { getEnv } from "../config/env";
import { getSupabaseClient } from "../supabase/client";
import { logger } from "../utils/logger";

// ─── Types ───────────────────────────────────────────────────

/** Input for a single LLM call. */
export interface LLMRequest {
  /** Logical operation name; used in logs, traces and consistency gating. */
  operation: string;
  /** Index into the model list to start from: 0=MiniMax, 1=LLaMA70B, 2=LLaMA8B. */
  modelIndex?: number;
  systemPrompt: string;
  userPrompt: string;
  /** Sampling temperature; defaults to 0.2 when omitted. */
  temperature?: number;
  /** Completion token limit; defaults to 1024 when omitted. */
  maxTokens?: number;
  /** When true, requests a JSON object and parses the reply into `parsed`. */
  jsonMode?: boolean;
  traceId: string;
  companyId?: string;
}

/** Result of an LLM call, or of the deterministic fallback. */
export interface LLMResponse {
  content: string;
  /** MiniMax's built-in chain-of-thought (`reasoning_content`), if present. */
  reasoning: string | null;
  /** Parsed JSON object when `jsonMode` was requested, otherwise null. */
  parsed: Record<string, unknown> | null;
  model: string;
  provider: string;
  tokens: { prompt: number; completion: number; total: number };
  latencyMs: number;
  grounded: boolean;
  /** True only when the deterministic (no-LLM) fallback produced this response. */
  fallbackUsed: boolean;
}

// ─── Model configs (ALL on NVIDIA NIM, ALL FREE) ─────────────

interface ModelConfig {
  name: string;
  model: string;
  maxContext: number;
  bestFor: string;
}

const MODEL_CONFIGS: readonly ModelConfig[] = [
  {
    name: "MiniMax M2.7",
    model: "minimaxai/minimax-m2.7",
    maxContext: 4_000_000, // 4M tokens!
    bestFor: "profiling, scoring, complex reasoning",
  },
  {
    name: "LLaMA 3.3 70B",
    model: "meta/llama-3.3-70b-instruct",
    maxContext: 128_000,
    bestFor: "general tasks, reliable fallback",
  },
  {
    name: "LLaMA 3.1 8B",
    model: "meta/llama-3.1-8b-instruct",
    maxContext: 128_000,
    bestFor: "email classification, simple checks",
  },
];

/** Named indices into the model list, for use as `LLMRequest.modelIndex`. */
export const MODELS = {
  MINIMAX: 0, // Primary — best reasoning
  LLAMA_70B: 1, // Fallback — reliable
  LLAMA_8B: 2, // Fast — simple tasks
  FAST: 2, // alias
} as const;

/** How many times one model is retried after HTTP 429 before moving on. */
const MAX_RATE_LIMIT_RETRIES = 3;
/** Wait used when the `retry-after` header is missing or unparseable. */
const DEFAULT_RETRY_AFTER_SECONDS = 5;
/** Upper bound on a single rate-limit wait, whatever the server asks for. */
const MAX_RETRY_AFTER_SECONDS = 60;

type Env = ReturnType<typeof getEnv>;

/** The subset of the chat-completions response body that this module reads. */
interface ChatCompletionResponse {
  choices?: Array<{
    message?: { content?: string | null; reasoning_content?: string | null };
  }>;
  usage?: {
    prompt_tokens?: number;
    completion_tokens?: number;
    total_tokens?: number;
  };
}

// ─── Main LLM call ──────────────────────────────────────────

/**
 * Calls the model at `request.modelIndex` (default 0) and walks down the
 * model list on failure, ending in the deterministic fallback.
 *
 * A model is abandoned for the next one when the HTTP call fails, when its
 * rate-limit retries are exhausted, or when `jsonMode` is set and the reply
 * does not parse to a JSON object.
 *
 * @param request - Prompts and options for the call.
 * @returns The first successful model response, or the deterministic
 *   fallback (`fallbackUsed: true`) when no model succeeded or the start
 *   index does not refer to a configured model.
 */
export async function callLLM(request: LLMRequest): Promise<LLMResponse> {
  const env = getEnv();
  const firstIndex = request.modelIndex ?? 0;

  for (let index = firstIndex; index < MODEL_CONFIGS.length; index++) {
    const config = MODEL_CONFIGS[index];
    // Negative or non-integer indices have no config; treat them like an
    // exhausted list instead of dereferencing undefined.
    if (!config) break;

    const result = await attemptModel(request, config, env);
    if (result) return result;
  }

  return deterministicFallback(request);
}

/**
 * Runs the request against one model, retrying a bounded number of times on
 * HTTP 429.
 *
 * @returns The response on success, or null when the caller should move on
 *   to the next model.
 */
async function attemptModel(
  request: LLMRequest,
  config: ModelConfig,
  env: Env
): Promise<LLMResponse | null> {
  const body: Record<string, unknown> = {
    model: config.model,
    messages: [
      { role: "system", content: request.systemPrompt },
      { role: "user", content: request.userPrompt },
    ],
    temperature: request.temperature ?? 0.2,
    max_tokens: request.maxTokens ?? 1024,
    top_p: 0.9,
  };
  if (request.jsonMode) {
    body.response_format = { type: "json_object" };
  }

  for (let attempt = 0; attempt <= MAX_RATE_LIMIT_RETRIES; attempt++) {
    const startTime = Date.now();

    try {
      const response = await axios.post<ChatCompletionResponse>(
        `${env.NVIDIA_NIM_BASE_URL}/chat/completions`,
        body,
        {
          headers: {
            Authorization: `Bearer ${env.NVIDIA_API_KEY}`,
            "Content-Type": "application/json",
          },
          timeout: 90_000, // MiniMax can take longer for reasoning
        }
      );

      const data = response.data;
      const message = data.choices?.[0]?.message;
      const content = message?.content ?? "";
      const reasoning = message?.reasoning_content ?? null; // MiniMax CoT
      const latencyMs = Date.now() - startTime;

      let parsed: Record<string, unknown> | null = null;
      if (request.jsonMode) {
        parsed = safeParseJSON(content);
        if (!parsed) {
          logger.warn(
            { operation: request.operation, model: config.name },
            "JSON parse failed → next model"
          );
          return null;
        }
      }

      const result: LLMResponse = {
        content,
        reasoning,
        parsed,
        model: config.name,
        provider: "nvidia",
        tokens: {
          // Each counter defaults separately so a partial `usage` object
          // cannot leak undefined into the response.
          prompt: data.usage?.prompt_tokens ?? 0,
          completion: data.usage?.completion_tokens ?? 0,
          total: data.usage?.total_tokens ?? 0,
        },
        latencyMs,
        grounded: true,
        fallbackUsed: false,
      };

      // Log MiniMax reasoning if present
      if (reasoning) {
        logger.debug(
          { operation: request.operation, reasoning: reasoning.slice(0, 200) },
          "MiniMax reasoning captured"
        );
      }

      await logLLMTrace(request, result, true, config);
      return result;
    } catch (err) {
      const httpResponse = err instanceof AxiosError ? err.response : undefined;
      const status = httpResponse?.status;

      if (status === 429) {
        // Out of retries for this model: stop looping and fall through.
        if (attempt === MAX_RATE_LIMIT_RETRIES) break;

        const retryAfter = parseRetryAfterSeconds(
          httpResponse?.headers?.["retry-after"]
        );
        logger.warn({ model: config.name, retryAfter }, "Rate limited → waiting");
        await sleep(retryAfter * 1000);
        continue;
      }

      if (status === 503 || status === 500) {
        logger.warn(
          { model: config.name, status },
          `${config.name} unavailable → next`
        );
        return null;
      }

      logger.error(
        { model: config.name, err: String(err).slice(0, 200) },
        "LLM call failed → next"
      );
      return null;
    }
  }

  logger.warn(
    { model: config.name, retries: MAX_RATE_LIMIT_RETRIES },
    "Rate limit retries exhausted → next"
  );
  return null;
}

/**
 * Converts a `retry-after` header value into a wait in seconds.
 *
 * Accepts the delay-seconds form and the HTTP-date form. Anything missing,
 * unparseable or negative yields the default wait, and the result is capped
 * so one header cannot stall the caller indefinitely.
 */
function parseRetryAfterSeconds(header: unknown): number {
  let seconds = Number.NaN;

  if (typeof header === "number") {
    seconds = header;
  } else if (typeof header === "string") {
    const value = header.trim();
    if (/^\d+$/.test(value)) {
      seconds = Number.parseInt(value, 10);
    } else {
      const retryAt = Date.parse(value);
      if (!Number.isNaN(retryAt)) {
        seconds = Math.ceil((retryAt - Date.now()) / 1000);
      }
    }
  }

  if (!Number.isFinite(seconds) || seconds < 0) {
    return DEFAULT_RETRY_AFTER_SECONDS;
  }
  return Math.min(seconds, MAX_RETRY_AFTER_SECONDS);
}

/**
 * Builds the empty, non-LLM response returned once no model could answer.
 * Logs at error level; the result has `fallbackUsed: true` and
 * `grounded: false`.
 */
function deterministicFallback(request: LLMRequest): LLMResponse {
  logger.error(
    { operation: request.operation },
    "ALL models failed → deterministic fallback"
  );
  return {
    content: "",
    reasoning: null,
    parsed: null,
    model: "deterministic_fallback",
    provider: "none",
    tokens: { prompt: 0, completion: 0, total: 0 },
    latencyMs: 0,
    grounded: false,
    fallbackUsed: true,
  };
}

// ─── Self-consistency check ──────────────────────────────────
// NOTE: MiniMax has built-in reasoning → consistency is higher
// We still do dual-temperature check for critical operations

/**
 * Calls the LLM at temperature 0.1 and, for the "profile" and "score"
 * operations, judges how consistent the answer is.
 *
 * - Other operations: reported consistent with score 1.0, no second call.
 * - Deterministic fallback: reported consistent with score 0.5.
 * - MiniMax response carrying reasoning: trusted, score 0.95, no second call.
 * - Otherwise: a second call at temperature 0.4 is compared with the first;
 *   consistent when the score is at least 0.75.
 *
 * @param request - The request to run; its `temperature` is overridden.
 * @returns The primary response plus the consistency verdict and score.
 */
export async function callLLMWithConsistencyCheck(
  request: LLMRequest
): Promise<{ primary: LLMResponse; isConsistent: boolean; consistencyScore: number }> {
  const primary = await callLLM({ ...request, temperature: 0.1 });

  if (!["profile", "score"].includes(request.operation)) {
    return { primary, isConsistent: true, consistencyScore: 1.0 };
  }

  if (primary.fallbackUsed) {
    return { primary, isConsistent: true, consistencyScore: 0.5 };
  }

  // MiniMax has reasoning → inherently more consistent
  // Only do consistency check with LLaMA models
  if (primary.model === "MiniMax M2.7" && primary.reasoning) {
    // MiniMax showed its reasoning → trust it more
    return { primary, isConsistent: true, consistencyScore: 0.95 };
  }

  const secondary = await callLLM({ ...request, temperature: 0.4 });
  const score = compareOutputs(primary, secondary);
  return { primary, isConsistent: score >= 0.75, consistencyScore: score };
}

/** Fields that must match exactly between two parsed outputs. */
const EXACT_MATCH_KEYS = ["ai_readiness", "tier", "service_match"] as const;
/** Numeric fields that may differ by up to `NUMERIC_TOLERANCE`. */
const NUMERIC_MATCH_KEYS = ["total_score", "company_fit"] as const;
const NUMERIC_TOLERANCE = 10;

/**
 * Scores agreement between two parsed responses as matches / comparable
 * fields.
 *
 * @returns 0.5 when either response has no parsed object, 1.0 when no field
 *   is comparable, otherwise the fraction of comparable fields that agree.
 */
function compareOutputs(a: LLMResponse, b: LLMResponse): number {
  const left = a.parsed;
  const right = b.parsed;
  if (!left || !right) return 0.5;

  let matches = 0;
  let total = 0;

  for (const key of EXACT_MATCH_KEYS) {
    if (key in left && key in right) {
      total++;
      if (left[key] === right[key]) matches++;
    }
  }

  for (const key of NUMERIC_MATCH_KEYS) {
    const leftValue = left[key];
    const rightValue = right[key];
    if (typeof leftValue === "number" && typeof rightValue === "number") {
      total++;
      if (Math.abs(leftValue - rightValue) <= NUMERIC_TOLERANCE) matches++;
    }
  }

  return total === 0 ? 1.0 : matches / total;
}

// ─── Trace logging ───────────────────────────────────────────

/**
 * Best-effort insert of one row into the `llm_traces` table. Failures are
 * logged at warn level and never propagate to the caller.
 *
 * Only hashes of the first 200 characters of the user prompt and of the
 * response content are stored, not the text itself.
 */
async function logLLMTrace(
  request: LLMRequest,
  response: LLMResponse | null,
  success: boolean,
  config?: ModelConfig
): Promise<void> {
  try {
    const db = getSupabaseClient();
    // NOTE(review): assumes the client is supabase-js, which reports insert
    // failures via the returned `error` field rather than throwing — confirm.
    const { error } = await db.from("llm_traces").insert({
      trace_id: request.traceId,
      operation: request.operation,
      model: response?.model ?? config?.name ?? "unknown",
      provider: "nvidia",
      prompt_tokens: response?.tokens.prompt ?? 0,
      completion_tokens: response?.tokens.completion ?? 0,
      total_tokens: response?.tokens.total ?? 0,
      latency_ms: response?.latencyMs ?? 0,
      success,
      fallback_used: response?.fallbackUsed ?? true,
      company_id: request.companyId ?? null,
      input_hash: hashText(request.userPrompt.slice(0, 200)),
      output_hash: response ? hashText(response.content.slice(0, 200)) : null,
    });
    if (error) {
      logger.warn({ err: error }, "Trace log failed — non-critical");
    }
  } catch (err) {
    logger.warn({ err }, "Trace log failed — non-critical");
  }
}

// ─── Helpers ─────────────────────────────────────────────────

const CODE_FENCE = "```";
const JSON_CODE_FENCE = "```json";

/**
 * Returns the text inside the first Markdown code fence (preferring a
 * ```json fence), trimmed; returns the input unchanged when it has no fence.
 * An unclosed fence yields everything after the opening marker.
 */
function stripCodeFence(text: string): string {
  let start = text.indexOf(JSON_CODE_FENCE);
  if (start >= 0) {
    start += JSON_CODE_FENCE.length;
  } else {
    start = text.indexOf(CODE_FENCE);
    if (start < 0) return text;
    start += CODE_FENCE.length;
  }

  const end = text.indexOf(CODE_FENCE, start);
  return (end >= 0 ? text.slice(start, end) : text.slice(start)).trim();
}

/** True for non-null, non-array objects. */
function isPlainObject(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

/**
 * Leniently parses model output as a JSON object.
 *
 * Strips a surrounding code fence, then parses the text; when that fails,
 * retries on the span from the first `{` to the last `}`. Valid JSON that is
 * not an object (primitive, array, null) is rejected, because callers use
 * the `in` operator on the result and that throws on primitives.
 *
 * @returns The parsed object, or null when no JSON object could be read.
 */
function safeParseJSON(text: string): Record<string, unknown> | null {
  const content = stripCodeFence(text.trim());

  try {
    const value: unknown = JSON.parse(content);
    return isPlainObject(value) ? value : null;
  } catch {
    const match = content.match(/\{[\s\S]*\}/);
    if (!match) return null;
    try {
      const value: unknown = JSON.parse(match[0]);
      return isPlainObject(value) ? value : null;
    } catch {
      return null;
    }
  }
}

/** First 16 hex characters of the SHA-256 digest of `text`. */
function hashText(text: string): string {
  return createHash("sha256").update(text).digest("hex").slice(0, 16);
}

/** Resolves after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}