clienttarget / src /shared /llm /nvidia-client.ts
iDevBuddy
feat: Phase 1 — AI Client Acquisition System
bd28470
/**
 * Multi-Model LLM Client — All FREE on NVIDIA NIM
 *
 * 3 models, 1 provider, 1 API key, $0 cost:
 *
 * Priority 1: MiniMax M2.7  → Best reasoning, 4M context, built-in CoT
 * Priority 2: LLaMA 3.3 70B → Reliable, proven, 128K context
 * Priority 3: LLaMA 3.1 8B  → Fast, cheap, for simple tasks
 * Priority 4: Deterministic → Zero LLM, zero hallucination
 *
 * All on: https://integrate.api.nvidia.com/v1
 * All use: same NVIDIA_API_KEY
 *
 * MiniMax M2.7 special feature:
 * Response includes `reasoning_content` field — chain-of-thought
 * reasoning happens AUTOMATICALLY inside the model.
 * We don't need to prompt "think step by step" — it does it natively.
 */
import axios, { AxiosError } from "axios";
import { createHash } from "crypto";
import { isDeepStrictEqual } from "util";
import { getEnv } from "../config/env";
import { getSupabaseClient } from "../supabase/client";
import { logger } from "../utils/logger";
// ─── Types ───────────────────────────────────────────────────
/**
 * Input for a single LLM call made through callLLM or callLLMWithConsistencyCheck.
 */
export interface LLMRequest {
/**
 * Label for the call. It is written to log entries and to the trace row, and
 * callLLMWithConsistencyCheck runs its dual-temperature check only for "profile" and "score".
 */
operation: string;
/** Index into MODEL_CONFIGS to start from; callLLM uses 0 when omitted. */
modelIndex?: number; // 0=MiniMax, 1=LLaMA70B, 2=LLaMA8B
/** Sent as the "system" message. */
systemPrompt: string;
/** Sent as the "user" message; its first 200 characters are hashed into the trace row. */
userPrompt: string;
/** Sampling temperature; callLLM uses 0.2 when omitted. */
temperature?: number;
/** Sent as max_tokens; callLLM uses 1024 when omitted. */
maxTokens?: number;
/** When true, requests a json_object response format and parses the reply into LLMResponse.parsed. */
jsonMode?: boolean;
/** Stored as trace_id on the trace row. */
traceId: string;
/** Stored as company_id on the trace row (null when omitted). */
companyId?: string;
}
/**
 * Result of an LLM call, whether it came from a model or from the deterministic fallback.
 */
export interface LLMResponse {
/** Message content of the first choice; empty string when absent or on deterministic fallback. */
content: string;
/** The reply's reasoning_content field when present, otherwise null. */
reasoning: string | null; // MiniMax's built-in chain-of-thought
/** Parsed JSON from content when jsonMode was requested; null otherwise. */
parsed: Record<string, unknown> | null;
/** Display name of the model that answered, or "deterministic_fallback". */
model: string;
/** "nvidia" for model replies, "none" for the deterministic fallback. */
provider: string;
/** Token counts copied from the API's usage block; zeros when it is missing. */
tokens: { prompt: number; completion: number; total: number };
/** Wall-clock milliseconds for the model attempt that answered; 0 for the deterministic fallback. */
latencyMs: number;
/** true on replies built from a model response, false on the deterministic fallback. */
grounded: boolean;
/** true only on the deterministic fallback; stays false when a lower-priority model answered. */
fallbackUsed: boolean;
}
// ─── Model configs (ALL on NVIDIA NIM, ALL FREE) ─────────────
/**
 * Static description of one model entry in MODEL_CONFIGS.
 */
interface ModelConfig {
/** Human-readable label; reported as LLMResponse.model and used in log entries. */
name: string;
/** Model identifier sent in the "model" field of the request body. */
model: string;
/** Context size in tokens (informational; not read by any code visible in this file). */
maxContext: number;
/** Free-text note on intended use (informational; not read by any code visible in this file). */
bestFor: string;
}
/**
 * Models in fallback-priority order. callLLM starts at LLMRequest.modelIndex (default 0)
 * and moves to the next entry when an attempt fails; running past the end of this
 * array yields the deterministic fallback. The positions must stay in sync with MODELS.
 */
const MODEL_CONFIGS: ModelConfig[] = [
{
name: "MiniMax M2.7",
model: "minimaxai/minimax-m2.7",
maxContext: 4_000_000, // 4M tokens!
bestFor: "profiling, scoring, complex reasoning",
},
{
name: "LLaMA 3.3 70B",
model: "meta/llama-3.3-70b-instruct",
maxContext: 128_000,
bestFor: "general tasks, reliable fallback",
},
{
name: "LLaMA 3.1 8B",
model: "meta/llama-3.1-8b-instruct",
maxContext: 128_000,
bestFor: "email classification, simple checks",
},
];
/**
 * Named positions in MODEL_CONFIGS, intended as values for LLMRequest.modelIndex.
 */
export const MODELS = {
MINIMAX: 0, // Primary — best reasoning
LLAMA_70B: 1, // Fallback — reliable
LLAMA_8B: 2, // Fast — simple tasks
FAST: 2, // alias of LLAMA_8B
} as const;
// ─── Main LLM call ──────────────────────────────────────────
/** Maximum number of 429 retries against one model before falling through to the next. */
const MAX_RATE_LIMIT_RETRIES = 3;
/** Wait (seconds) applied when the Retry-After header is absent or unparseable. */
const DEFAULT_RETRY_AFTER_SECONDS = 5;
/** Ceiling (seconds) on a single rate-limit wait, whatever the server asks for. */
const MAX_RETRY_AFTER_SECONDS = 60;

/**
 * Sends a chat-completion request to NVIDIA NIM, falling through the models in
 * MODEL_CONFIGS until one returns a usable answer.
 *
 * Starts at `request.modelIndex` (default 0). A model is abandoned for the next one when
 * the HTTP call fails, when it keeps answering 429 after MAX_RATE_LIMIT_RETRIES waits,
 * or when `jsonMode` is set and the reply cannot be parsed as a JSON object. When no
 * model is left (or the index does not address a configured model) the deterministic
 * fallback response is returned instead.
 *
 * @param request - Prompts and options for the call.
 * @returns The first usable model response, or the deterministic fallback.
 *          Request failures are absorbed; only errors raised by getEnv() propagate.
 */
export async function callLLM(request: LLMRequest): Promise<LLMResponse> {
  return attemptLLMCall(request, request.modelIndex ?? 0, 0);
}

/**
 * Converts a Retry-After header value (delay in seconds or an HTTP-date) into a
 * bounded number of seconds to wait.
 */
function resolveRetryAfterSeconds(headerValue: unknown): number {
  const raw = String(headerValue ?? "").trim();
  let seconds = Number.NaN;
  if (raw !== "") {
    const numeric = Number(raw);
    if (Number.isFinite(numeric)) {
      seconds = numeric;
    } else {
      // Retry-After may also be an HTTP-date; turn it into a delay from now.
      const retryAt = Date.parse(raw);
      if (!Number.isNaN(retryAt)) {
        seconds = Math.ceil((retryAt - Date.now()) / 1000);
      }
    }
  }
  if (!Number.isFinite(seconds)) {
    return DEFAULT_RETRY_AFTER_SECONDS;
  }
  return Math.min(Math.max(seconds, 0), MAX_RETRY_AFTER_SECONDS);
}

/**
 * Runs one attempt against the model at `modelIndex`, recursing to retry after a
 * rate limit or to move on to the next model.
 */
async function attemptLLMCall(
  request: LLMRequest,
  modelIndex: number,
  rateLimitRetries: number
): Promise<LLMResponse> {
  const env = getEnv();
  // Covers indices past the end as well as negative / fractional / NaN indices.
  const config = MODEL_CONFIGS[modelIndex];
  if (!config) {
    return deterministicFallback(request);
  }

  const startTime = Date.now();
  const body: Record<string, unknown> = {
    model: config.model,
    messages: [
      { role: "system", content: request.systemPrompt },
      { role: "user", content: request.userPrompt },
    ],
    temperature: request.temperature ?? 0.2,
    max_tokens: request.maxTokens ?? 1024,
    top_p: 0.9,
  };
  if (request.jsonMode) {
    body.response_format = { type: "json_object" };
  }

  try {
    const response = await axios.post(
      `${env.NVIDIA_NIM_BASE_URL}/chat/completions`,
      body,
      {
        headers: {
          Authorization: `Bearer ${env.NVIDIA_API_KEY}`,
          "Content-Type": "application/json",
        },
        timeout: 90_000, // MiniMax can take longer for reasoning
      }
    );
    const data = response.data;
    const message = data.choices?.[0]?.message;
    const content = message?.content ?? "";
    const reasoning = message?.reasoning_content ?? null; // MiniMax CoT
    const usage = data.usage ?? { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 };
    const latencyMs = Date.now() - startTime;

    let parsed: Record<string, unknown> | null = null;
    if (request.jsonMode) {
      parsed = safeParseJSON(content);
      if (!parsed) {
        logger.warn({ operation: request.operation, model: config.name }, "JSON parse failed → next model");
        return attemptLLMCall(request, modelIndex + 1, 0);
      }
    }

    const result: LLMResponse = {
      content,
      reasoning,
      parsed,
      model: config.name,
      provider: "nvidia",
      tokens: {
        prompt: usage.prompt_tokens,
        completion: usage.completion_tokens,
        total: usage.total_tokens,
      },
      latencyMs,
      grounded: true,
      fallbackUsed: false,
    };

    // Log MiniMax reasoning if present
    if (reasoning) {
      logger.debug({ operation: request.operation, reasoning: reasoning.slice(0, 200) },
        "MiniMax reasoning captured");
    }
    await logLLMTrace(request, result, true, config);
    return result;
  } catch (err) {
    if (err instanceof AxiosError) {
      const status = err.response?.status;
      if (status === 429) {
        if (rateLimitRetries < MAX_RATE_LIMIT_RETRIES) {
          const retryAfter = resolveRetryAfterSeconds(err.response?.headers?.["retry-after"]);
          logger.warn({ model: config.name, retryAfter }, "Rate limited → waiting");
          await sleep(retryAfter * 1000);
          return attemptLLMCall(request, modelIndex, rateLimitRetries + 1);
        }
        // Still throttled after the allowed waits: stop hammering this model.
        logger.warn({ model: config.name, retries: rateLimitRetries }, "Rate limit retries exhausted → next");
        return attemptLLMCall(request, modelIndex + 1, 0);
      }
      if (status === 503 || status === 500) {
        logger.warn({ model: config.name, status }, `${config.name} unavailable → next`);
        return attemptLLMCall(request, modelIndex + 1, 0);
      }
    }
    logger.error({ model: config.name, err: String(err).slice(0, 200) }, "LLM call failed → next");
    return attemptLLMCall(request, modelIndex + 1, 0);
  }
}
/**
 * Builds the last-resort response returned when no model produced a usable answer.
 *
 * Logs an error tagged with the request's operation and returns an empty,
 * ungrounded response flagged with `fallbackUsed: true`; callers use that flag to
 * switch to their non-LLM path.
 *
 * @param request - The request that could not be served; only `operation` is read.
 * @returns An empty response with model "deterministic_fallback" and provider "none".
 */
function deterministicFallback(request: LLMRequest): LLMResponse {
  logger.error({ operation: request.operation }, "ALL models failed → deterministic fallback");
  return {
    content: "",
    reasoning: null,
    parsed: null,
    model: "deterministic_fallback",
    provider: "none",
    tokens: { prompt: 0, completion: 0, total: 0 },
    latencyMs: 0,
    grounded: false,
    fallbackUsed: true,
  };
}
// ─── Self-consistency check ──────────────────────────────────
// NOTE: MiniMax has built-in reasoning → consistency is higher.
// We still do a dual-temperature check for critical operations.
/**
 * Calls the LLM at temperature 0.1 and, for critical operations, re-runs the same
 * prompt at temperature 0.4 to see whether the two answers agree.
 *
 * Shortcuts that skip the second call:
 * - operations other than "profile" / "score" → consistent, score 1.0
 * - the primary is the deterministic fallback → consistent, score 0.5
 * - the primary came from MiniMax M2.7 with reasoning attached → consistent, score 0.95
 *
 * Otherwise the second call is pinned to the model that produced the primary answer,
 * so the comparison is same-model at two temperatures and a model that already failed
 * during the primary call is not tried again.
 *
 * @param request - The request to run; its `temperature` is overridden for both calls.
 * @returns The primary response, the agreement score, and whether it reached 0.75.
 */
export async function callLLMWithConsistencyCheck(
  request: LLMRequest
): Promise<{ primary: LLMResponse; isConsistent: boolean; consistencyScore: number }> {
  const primary = await callLLM({ ...request, temperature: 0.1 });

  if (!["profile", "score"].includes(request.operation)) {
    return { primary, isConsistent: true, consistencyScore: 1.0 };
  }
  if (primary.fallbackUsed) {
    return { primary, isConsistent: true, consistencyScore: 0.5 };
  }

  // MiniMax has reasoning → inherently more consistent.
  // Only do the consistency check with LLaMA models.
  if (primary.model === "MiniMax M2.7" && primary.reasoning) {
    // MiniMax showed its reasoning → trust it more
    return { primary, isConsistent: true, consistencyScore: 0.95 };
  }

  // Start the second run at the model that actually answered the first one.
  const primaryModelIndex = MODEL_CONFIGS.findIndex((candidate) => candidate.name === primary.model);
  const secondary = await callLLM({
    ...request,
    temperature: 0.4,
    modelIndex: primaryModelIndex >= 0 ? primaryModelIndex : request.modelIndex,
  });

  const score = compareOutputs(primary, secondary);
  return { primary, isConsistent: score >= 0.75, consistencyScore: score };
}
/**
 * Scores how closely two parsed LLM outputs agree, as matches / comparable fields.
 *
 * Categorical fields ("ai_readiness", "tier", "service_match") are compared when both
 * outputs contain the key; values match when they are structurally equal, so arrays
 * and nested objects from two separate JSON parses can agree. Numeric fields
 * ("total_score", "company_fit") are compared when both are numbers and match when
 * they differ by at most 10.
 *
 * @param a - First response.
 * @param b - Second response.
 * @returns 0.5 when either side has no parsed object, 1.0 when no field is
 *          comparable, otherwise the fraction of comparable fields that match.
 */
function compareOutputs(a: LLMResponse, b: LLMResponse): number {
  const left = a.parsed;
  const right = b.parsed;
  // `in` throws on primitives, so anything that is not an object is "not comparable".
  if (!left || !right || typeof left !== "object" || typeof right !== "object") {
    return 0.5;
  }

  let matches = 0;
  let total = 0;

  for (const key of ["ai_readiness", "tier", "service_match"]) {
    if (key in left && key in right) {
      total++;
      const leftValue = left[key];
      const rightValue = right[key];
      // `===` keeps primitive semantics; the deep check covers arrays/objects.
      if (leftValue === rightValue || isDeepStrictEqual(leftValue, rightValue)) matches++;
    }
  }

  for (const key of ["total_score", "company_fit"]) {
    const leftValue = left[key];
    const rightValue = right[key];
    if (typeof leftValue === "number" && typeof rightValue === "number") {
      total++;
      if (Math.abs(leftValue - rightValue) <= 10) matches++;
    }
  }

  return total === 0 ? 1.0 : matches / total;
}
// ─── Trace logging ───────────────────────────────────────────
/**
 * Best-effort insert of one row into the `llm_traces` table describing an LLM call.
 *
 * Never throws: both thrown exceptions and the `error` returned by the Supabase
 * insert are logged as warnings and otherwise ignored, so tracing cannot break the
 * caller.
 *
 * @param request - The request that was sent; supplies trace id, operation, company id
 *                  and the prompt prefix that is hashed.
 * @param response - The response that was produced, or null when there was none.
 * @param success - Whether the call is recorded as successful.
 * @param config - Model config used for the call; its name is the model label when
 *                 `response` is null.
 */
async function logLLMTrace(
  request: LLMRequest,
  response: LLMResponse | null,
  success: boolean,
  config?: ModelConfig
): Promise<void> {
  try {
    const db = getSupabaseClient();
    // The Supabase client reports failures through `error` instead of throwing.
    const { error } = await db.from("llm_traces").insert({
      trace_id: request.traceId,
      operation: request.operation,
      model: response?.model ?? config?.name ?? "unknown",
      provider: response?.provider ?? "nvidia",
      prompt_tokens: response?.tokens.prompt ?? 0,
      completion_tokens: response?.tokens.completion ?? 0,
      total_tokens: response?.tokens.total ?? 0,
      latency_ms: response?.latencyMs ?? 0,
      success,
      fallback_used: response?.fallbackUsed ?? true,
      company_id: request.companyId ?? null,
      // Only short hashes of the first 200 characters are stored, not the text itself.
      input_hash: hashText(request.userPrompt.slice(0, 200)),
      output_hash: response ? hashText(response.content.slice(0, 200)) : null,
    });
    if (error) {
      logger.warn({ err: error }, "Trace log failed — non-critical");
    }
  } catch (err) {
    logger.warn({ err }, "Trace log failed — non-critical");
  }
}
// ─── Helpers ─────────────────────────────────────────────────
/**
 * Extracts a JSON object from model output without throwing.
 *
 * Strips a Markdown code fence (```json or plain ```) when present, then tries to
 * parse the remainder. If that fails, falls back to the outermost `{ ... }` span in
 * the text. Only plain objects are accepted: valid JSON that is an array, string,
 * number, boolean or null yields `null`, so the declared return type holds.
 *
 * @param text - Raw model output.
 * @returns The parsed object, or null when no JSON object could be recovered.
 */
function safeParseJSON(text: string): Record<string, unknown> | null {
  const isPlainObject = (value: unknown): value is Record<string, unknown> =>
    typeof value === "object" && value !== null && !Array.isArray(value);

  let content = text.trim();
  const fenceMarker = content.includes("```json")
    ? "```json"
    : content.includes("```")
      ? "```"
      : null;
  if (fenceMarker !== null) {
    // Keep what sits between the opening fence and the next fence (or end of text).
    const afterFence = content.split(fenceMarker)[1] ?? "";
    content = (afterFence.split("```")[0] ?? "").trim();
  }

  try {
    const direct: unknown = JSON.parse(content);
    return isPlainObject(direct) ? direct : null;
  } catch {
    // Not valid JSON as a whole: look for an object embedded in surrounding prose.
    const match = content.match(/\{[\s\S]*\}/);
    if (!match) return null;
    try {
      const embedded: unknown = JSON.parse(match[0]);
      return isPlainObject(embedded) ? embedded : null;
    } catch {
      return null;
    }
  }
}
/**
 * Returns a short fingerprint of `text`: the first 16 hex characters of its
 * SHA-256 digest.
 */
function hashText(text: string): string {
  const hasher = createHash("sha256");
  hasher.update(text);
  const hexDigest = hasher.digest("hex");
  return hexDigest.substring(0, 16);
}
/**
 * Returns a promise that resolves once a `setTimeout` of `ms` milliseconds fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}