Spaces:
Running
Running
/**
 * Multi-Model LLM Client — All FREE on NVIDIA NIM
 *
 * 3 models, 1 provider, 1 API key, $0 cost:
 *
 *   Priority 1: MiniMax M2.7  — Best reasoning, 4M context, built-in CoT
 *   Priority 2: LLaMA 3.3 70B — Reliable, proven, 128K context
 *   Priority 3: LLaMA 3.1 8B  — Fast, cheap, for simple tasks
 *   Priority 4: Deterministic — Zero LLM, zero hallucination
 *
 * All on:  https://integrate.api.nvidia.com/v1
 * All use: same NVIDIA_API_KEY
 *
 * MiniMax M2.7 special feature:
 *   Response includes a `reasoning_content` field — chain-of-thought
 *   reasoning happens AUTOMATICALLY inside the model.
 *   We don't need to prompt "think step by step" — it does it natively.
 */
| import axios, { AxiosError } from "axios"; | |
| import { createHash } from "crypto"; | |
| import { getEnv } from "../config/env"; | |
| import { getSupabaseClient } from "../supabase/client"; | |
| import { logger } from "../utils/logger"; | |
// ─── Types ───────────────────────────────────────────────────
/**
 * Input to {@link callLLM}: one chat-completion request plus the metadata
 * used for logging and trace persistence.
 */
export interface LLMRequest {
  /** Logical operation name; logged, stored in traces, and checked by the consistency helper. */
  operation: string;
  /** Index of the first model to try: 0=MiniMax, 1=LLaMA70B, 2=LLaMA8B. Defaults to 0. */
  modelIndex?: number;
  /** Content of the `system` chat message. */
  systemPrompt: string;
  /** Content of the `user` chat message. */
  userPrompt: string;
  /** Sampling temperature; `callLLM` uses 0.2 when omitted. */
  temperature?: number;
  /** Completion token limit; `callLLM` uses 1024 when omitted. */
  maxTokens?: number;
  /** When true, a JSON-object response format is requested and the reply is parsed. */
  jsonMode?: boolean;
  /** Correlation id written to the `llm_traces` table. */
  traceId: string;
  /** Optional company id written to the `llm_traces` table. */
  companyId?: string;
}
/** Normalised result of an LLM call, or of the deterministic fallback. */
export interface LLMResponse {
  /** Text of the first choice's message; empty string when none was returned. */
  content: string;
  /** The message's `reasoning_content` (MiniMax's built-in chain-of-thought), or null. */
  reasoning: string | null;
  /** Parsed JSON when the request used `jsonMode`; otherwise null. */
  parsed: Record<string, unknown> | null;
  /** Display name of the model that answered, or "deterministic_fallback". */
  model: string;
  /** Provider label: "nvidia" for model answers, "none" for the fallback. */
  provider: string;
  /** Token usage as reported by the API (all zeros for the fallback). */
  tokens: { prompt: number; completion: number; total: number };
  /** Milliseconds between sending the request and receiving the response. */
  latencyMs: number;
  /** True for model answers; false for the deterministic fallback. */
  grounded: boolean;
  /** True only when the deterministic fallback produced this response. */
  fallbackUsed: boolean;
}
// ─── Model configs (ALL on NVIDIA NIM, ALL FREE) ─────────────
/** Static description of one hosted model. */
interface ModelConfig {
  /** Human-readable name; also reported as `LLMResponse.model`. */
  name: string;
  /** Model identifier sent in the request body's `model` field. */
  model: string;
  /** Context window size in tokens. */
  maxContext: number;
  /** Free-text note on the workloads this model suits. */
  bestFor: string;
}

/**
 * Models in priority order. The array position is the `modelIndex` used by
 * `callLLM`, so the order here must stay in sync with the `MODELS` constants.
 */
const MODEL_CONFIGS: ModelConfig[] = [
  // 0: primary (4M-token context)
  {
    name: "MiniMax M2.7",
    model: "minimaxai/minimax-m2.7",
    maxContext: 4_000_000,
    bestFor: "profiling, scoring, complex reasoning",
  },
  // 1: first fallback
  {
    name: "LLaMA 3.3 70B",
    model: "meta/llama-3.3-70b-instruct",
    maxContext: 128_000,
    bestFor: "general tasks, reliable fallback",
  },
  // 2: small, fast model
  {
    name: "LLaMA 3.1 8B",
    model: "meta/llama-3.1-8b-instruct",
    maxContext: 128_000,
    bestFor: "email classification, simple checks",
  },
];
/**
 * Named `modelIndex` values for callers. Each number is a position in the
 * model configuration array.
 */
export const MODELS = {
  /** Primary model — best reasoning. */
  MINIMAX: 0,
  /** Fallback model — reliable. */
  LLAMA_70B: 1,
  /** Fast model — simple tasks. */
  LLAMA_8B: 2,
  /** Alias of `LLAMA_8B`. */
  FAST: 2,
} as const;
// ─── Main LLM call ───────────────────────────────────────────
/** How many times one model is retried after HTTP 429 before moving to the next model. */
const MAX_RATE_LIMIT_RETRIES = 3;
/** Wait used when a 429 response carries no usable `Retry-After` header. */
const DEFAULT_RETRY_AFTER_SECONDS = 5;
/** Upper bound on a single rate-limit wait, whatever the server asks for. */
const MAX_RETRY_AFTER_SECONDS = 60;

/** The parts of the chat-completions response body that this client reads. */
interface ChatCompletionPayload {
  choices?: Array<{ message?: { content?: unknown; reasoning_content?: unknown } }>;
  usage?: { prompt_tokens?: number; completion_tokens?: number; total_tokens?: number };
}

/** Result of a single HTTP attempt against one model. */
type AttemptOutcome =
  | { kind: "success"; result: LLMResponse }
  | { kind: "rate_limited"; retryAfterSeconds: number }
  | { kind: "next_model" };

/**
 * Converts a `Retry-After` header value into a wait in seconds.
 *
 * Accepts both forms of the header (delay-seconds and HTTP-date). Missing or
 * unparseable values yield the default; the result is always a finite number
 * between 0 and `MAX_RETRY_AFTER_SECONDS`.
 */
function parseRetryAfterSeconds(headerValue: unknown): number {
  const raw: unknown = Array.isArray(headerValue) ? headerValue[0] : headerValue;
  if (raw === undefined || raw === null) return DEFAULT_RETRY_AFTER_SECONDS;

  const text = String(raw).trim();
  if (text === "") return DEFAULT_RETRY_AFTER_SECONDS;

  let seconds = Number(text);
  if (!Number.isFinite(seconds)) {
    // Not a plain number: try the HTTP-date form and convert it to a delay.
    const dateMs = Date.parse(text);
    seconds = Number.isNaN(dateMs)
      ? DEFAULT_RETRY_AFTER_SECONDS
      : Math.ceil((dateMs - Date.now()) / 1000);
  }
  return Math.min(Math.max(seconds, 0), MAX_RETRY_AFTER_SECONDS);
}

/**
 * Performs exactly one chat-completion request against `config` and classifies
 * the outcome. Never throws: every failure maps to `rate_limited` or `next_model`.
 */
async function attemptModel(
  request: LLMRequest,
  config: ModelConfig,
  env: ReturnType<typeof getEnv>
): Promise<AttemptOutcome> {
  const startTime = Date.now();
  const body: Record<string, unknown> = {
    model: config.model,
    messages: [
      { role: "system", content: request.systemPrompt },
      { role: "user", content: request.userPrompt },
    ],
    temperature: request.temperature ?? 0.2,
    max_tokens: request.maxTokens ?? 1024,
    top_p: 0.9,
  };
  if (request.jsonMode) {
    body.response_format = { type: "json_object" };
  }

  try {
    const response = await axios.post<ChatCompletionPayload>(
      `${env.NVIDIA_NIM_BASE_URL}/chat/completions`,
      body,
      {
        headers: {
          Authorization: `Bearer ${env.NVIDIA_API_KEY}`,
          "Content-Type": "application/json",
        },
        timeout: 90_000, // MiniMax can take longer for reasoning
      }
    );
    const latencyMs = Date.now() - startTime;

    // Coerce to the declared LLMResponse types; the API payload is untyped at runtime.
    const message = response.data?.choices?.[0]?.message;
    const content = typeof message?.content === "string" ? message.content : "";
    const reasoning =
      typeof message?.reasoning_content === "string" ? message.reasoning_content : null; // MiniMax CoT
    const usage = response.data?.usage;

    let parsed: Record<string, unknown> | null = null;
    if (request.jsonMode) {
      parsed = safeParseJSON(content);
      if (!parsed) {
        logger.warn({ operation: request.operation, model: config.name }, "JSON parse failed β next model");
        return { kind: "next_model" };
      }
    }

    const result: LLMResponse = {
      content,
      reasoning,
      parsed,
      model: config.name,
      provider: "nvidia",
      tokens: {
        prompt: usage?.prompt_tokens ?? 0,
        completion: usage?.completion_tokens ?? 0,
        total: usage?.total_tokens ?? 0,
      },
      latencyMs,
      grounded: true,
      // Stays false even when a lower-priority model answered; only the
      // deterministic fallback sets this flag.
      fallbackUsed: false,
    };

    // Log MiniMax reasoning if present
    if (reasoning) {
      logger.debug(
        { operation: request.operation, reasoning: reasoning.slice(0, 200) },
        "MiniMax reasoning captured"
      );
    }
    await logLLMTrace(request, result, true, config);
    return { kind: "success", result };
  } catch (err) {
    if (err instanceof AxiosError) {
      const status = err.response?.status;
      if (status === 429) {
        return {
          kind: "rate_limited",
          retryAfterSeconds: parseRetryAfterSeconds(err.response?.headers?.["retry-after"]),
        };
      }
      if (status === 503 || status === 500) {
        logger.warn({ model: config.name, status }, `${config.name} unavailable β next`);
        return { kind: "next_model" };
      }
    }
    logger.error({ model: config.name, err: String(err).slice(0, 200) }, "LLM call failed β next");
    return { kind: "next_model" };
  }
}

/**
 * Sends a chat-completion request, walking the model priority list until one
 * model produces a usable answer.
 *
 * Starting at `request.modelIndex` (default 0), each model is tried in turn:
 * - HTTP 429: wait for the (capped) `Retry-After` delay and retry the same
 *   model, at most `MAX_RATE_LIMIT_RETRIES` times, then move on.
 * - HTTP 500/503, any other error, or unparseable JSON in `jsonMode`: move on
 *   to the next model.
 *
 * When the list is exhausted (or `modelIndex` is past its end) the
 * deterministic fallback response is returned. A negative or non-integer
 * `modelIndex` is treated as 0.
 *
 * @param request - Prompts, sampling options and trace metadata.
 * @returns The first successful model response, or the deterministic fallback.
 *   Transport and parsing failures are handled internally and never rejected.
 */
export async function callLLM(request: LLMRequest): Promise<LLMResponse> {
  const env = getEnv();
  const requestedIndex = request.modelIndex ?? 0;
  const startIndex =
    Number.isInteger(requestedIndex) && requestedIndex >= 0 ? requestedIndex : 0;

  for (const config of MODEL_CONFIGS.slice(startIndex)) {
    for (let rateLimitRetries = 0; ; rateLimitRetries++) {
      const outcome = await attemptModel(request, config, env);
      if (outcome.kind === "success") return outcome.result;
      if (outcome.kind === "next_model") break;

      // Rate limited: retry this model a bounded number of times.
      if (rateLimitRetries >= MAX_RATE_LIMIT_RETRIES) {
        logger.warn(
          { model: config.name, retries: rateLimitRetries },
          "Rate limit retries exhausted - next"
        );
        break;
      }
      logger.warn({ model: config.name, retryAfter: outcome.retryAfterSeconds }, "Rate limited β waiting");
      await sleep(outcome.retryAfterSeconds * 1000);
    }
  }
  return deterministicFallback(request);
}
/**
 * Builds the response returned when no model produced a usable answer:
 * empty content, zero usage, `grounded: false` and `fallbackUsed: true`.
 * Logs an error naming the operation before returning.
 */
function deterministicFallback(request: LLMRequest): LLMResponse {
  const { operation } = request;
  logger.error({ operation }, "ALL models failed β deterministic fallback");

  const emptyResponse: LLMResponse = {
    content: "",
    reasoning: null,
    parsed: null,
    model: "deterministic_fallback",
    provider: "none",
    tokens: { prompt: 0, completion: 0, total: 0 },
    latencyMs: 0,
    grounded: false,
    fallbackUsed: true,
  };
  return emptyResponse;
}
// ─── Self-consistency check ──────────────────────────────────
// NOTE: MiniMax has built-in reasoning — consistency is higher.
// We still do a dual-temperature check for critical operations.
/**
 * Calls the LLM at temperature 0.1 and, for critical operations, samples a
 * second answer at temperature 0.4 to estimate how stable the output is.
 *
 * Short-circuits (no second call):
 * - operation is not "profile" or "score": score 1.0
 * - the primary is the deterministic fallback: score 0.5
 * - the primary came from MiniMax and carries reasoning: score 0.95
 *
 * Otherwise the second sample is requested starting from the model that
 * actually produced the primary answer, so that the comparison measures
 * same-model stability and a model that already failed is not retried.
 *
 * @param request - The request to run; its `temperature` is overridden.
 * @returns The primary response, the agreement score, and whether that score
 *   reaches the 0.75 threshold.
 */
export async function callLLMWithConsistencyCheck(
  request: LLMRequest
): Promise<{ primary: LLMResponse; isConsistent: boolean; consistencyScore: number }> {
  const primary = await callLLM({ ...request, temperature: 0.1 });

  if (!["profile", "score"].includes(request.operation)) {
    return { primary, isConsistent: true, consistencyScore: 1.0 };
  }
  if (primary.fallbackUsed) {
    return { primary, isConsistent: true, consistencyScore: 0.5 };
  }
  // MiniMax showed its reasoning — trust it more and skip the second sample.
  // The dual-temperature check is only run for answers without reasoning.
  if (primary.model === "MiniMax M2.7" && primary.reasoning) {
    return { primary, isConsistent: true, consistencyScore: 0.95 };
  }

  // Pin the second sample to the model that answered the primary call. If the
  // name cannot be resolved, fall back to the caller's requested index.
  const primaryModelIndex = MODEL_CONFIGS.findIndex((cfg) => cfg.name === primary.model);
  const secondary = await callLLM({
    ...request,
    temperature: 0.4,
    modelIndex: primaryModelIndex >= 0 ? primaryModelIndex : request.modelIndex,
  });

  const score = compareOutputs(primary, secondary);
  return { primary, isConsistent: score >= 0.75, consistencyScore: score };
}
/**
 * Scores agreement between two parsed responses as matches / comparisons.
 *
 * - Returns 0.5 when either response has no parsed JSON.
 * - "ai_readiness", "tier" and "service_match" are compared with strict
 *   equality, and only when the key is present in both objects.
 * - "total_score" and "company_fit" are compared only when both values are
 *   numbers, and match when they differ by at most 10.
 * - Returns 1.0 when no field could be compared.
 */
function compareOutputs(a: LLMResponse, b: LLMResponse): number {
  const left = a.parsed;
  const right = b.parsed;
  if (!left || !right) return 0.5;

  const exactVerdicts = ["ai_readiness", "tier", "service_match"]
    .filter((field) => field in left && field in right)
    .map((field) => left[field] === right[field]);

  const numericVerdicts = ["total_score", "company_fit"].flatMap((field) => {
    const lhs = left[field];
    const rhs = right[field];
    return typeof lhs === "number" && typeof rhs === "number"
      ? [Math.abs(lhs - rhs) <= 10]
      : [];
  });

  const verdicts = [...exactVerdicts, ...numericVerdicts];
  if (verdicts.length === 0) return 1.0;

  const agreed = verdicts.filter((ok) => ok).length;
  return agreed / verdicts.length;
}
// ─── Trace logging ───────────────────────────────────────────
/**
 * Best-effort persistence of one LLM call into the `llm_traces` table.
 *
 * Never throws: both thrown exceptions and the `error` object returned by the
 * Supabase client are logged as warnings, so trace failures cannot break the
 * calling LLM flow.
 *
 * @param request - The originating request (trace id, operation, company id, prompt).
 * @param response - The response to record, or null when there is none.
 * @param success - Whether the call is recorded as successful.
 * @param config - Model configuration, used for the model name when `response` is null.
 */
async function logLLMTrace(
  request: LLMRequest,
  response: LLMResponse | null,
  success: boolean,
  config?: ModelConfig
): Promise<void> {
  try {
    const db = getSupabaseClient();
    // The Supabase client reports database failures through the returned
    // `error` field rather than by throwing, so it must be checked explicitly.
    const { error } = await db.from("llm_traces").insert({
      trace_id: request.traceId,
      operation: request.operation,
      model: response?.model ?? config?.name ?? "unknown",
      provider: response?.provider ?? "nvidia",
      prompt_tokens: response?.tokens.prompt ?? 0,
      completion_tokens: response?.tokens.completion ?? 0,
      total_tokens: response?.tokens.total ?? 0,
      latency_ms: response?.latencyMs ?? 0,
      success,
      fallback_used: response?.fallbackUsed ?? true,
      company_id: request.companyId ?? null,
      // NOTE(review): only the first 200 characters are hashed, so prompts that
      // share a prefix produce the same hash — confirm this is intended.
      input_hash: hashText(request.userPrompt.slice(0, 200)),
      output_hash: response ? hashText(response.content.slice(0, 200)) : null,
    });
    if (error) {
      logger.warn({ err: error }, "Trace log failed β non-critical");
    }
  } catch (err) {
    logger.warn({ err }, "Trace log failed β non-critical");
  }
}
// ─── Helpers ─────────────────────────────────────────────────
/**
 * Leniently extracts a JSON object from model output.
 *
 * Steps: trim; if the text contains a Markdown code fence, keep only the
 * contents of the first fence (a missing closing fence is tolerated); parse
 * the result; if that does not yield an object, parse the span from the first
 * `{` to the last `}`.
 *
 * Only plain objects are accepted. Valid JSON that is a primitive, `null` or an
 * array returns null, because callers treat the result as a key/value record
 * (for example with the `in` operator, which throws on primitives).
 *
 * @param text - Raw model output.
 * @returns The parsed object, or null when no JSON object could be extracted.
 */
function safeParseJSON(text: string): Record<string, unknown> | null {
  const isPlainObject = (value: unknown): value is Record<string, unknown> =>
    typeof value === "object" && value !== null && !Array.isArray(value);

  const tryParse = (candidate: string): unknown => {
    try {
      return JSON.parse(candidate) as unknown;
    } catch {
      return undefined;
    }
  };

  let content = text.trim();

  // Text after the first `marker`, up to the next fence (or the end of input).
  const insideFence = (marker: string): string => {
    const rest = content.slice(content.indexOf(marker) + marker.length);
    const close = rest.indexOf("```");
    return (close === -1 ? rest : rest.slice(0, close)).trim();
  };
  if (content.includes("```json")) content = insideFence("```json");
  else if (content.includes("```")) content = insideFence("```");

  const direct = tryParse(content);
  if (isPlainObject(direct)) return direct;

  // Fall back to the outermost brace-delimited span (JSON embedded in prose).
  const braced = content.match(/\{[\s\S]*\}/);
  if (braced) {
    const embedded = tryParse(braced[0]);
    if (isPlainObject(embedded)) return embedded;
  }
  return null;
}
/**
 * Returns a short fingerprint of `text`: the first 16 hexadecimal characters
 * of its SHA-256 digest.
 */
function hashText(text: string): string {
  const hasher = createHash("sha256");
  hasher.update(text);
  const fullHex = hasher.digest("hex");
  return fullHex.slice(0, 16);
}
/** Returns a promise that resolves (with no value) once a `setTimeout` of `ms` milliseconds fires. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}