Spaces:

dexakif
/

clienttarget

Running

clienttarget / src /shared /llm /nvidia-client.ts

iDevBuddy

feat: Phase 1 — AI Client Acquisition System

bd28470 4 days ago

10.2 kB

	/**
	* Multi-Model LLM Client — All FREE on NVIDIA NIM
	*
	* 3 models, 1 provider, 1 API key, $0 cost:
	*
	* Priority 1: MiniMax M2.7 → Best reasoning, 4M context, built-in CoT
	* Priority 2: LLaMA 3.3 70B → Reliable, proven, 128K context
	* Priority 3: LLaMA 3.1 8B → Fast, cheap, for simple tasks
	* Priority 4: Deterministic → Zero LLM, zero hallucination
	*
	* All on: https://integrate.api.nvidia.com/v1
	* All use: same NVIDIA_API_KEY
	*
	* MiniMax M2.7 special feature:
	* Response includes `reasoning_content` field — chain-of-thought
	* reasoning happens AUTOMATICALLY inside the model.
	* We don't need to prompt "think step by step" — it does it natively.
	*/

	import axios, { AxiosError } from "axios";
	import { createHash } from "crypto";
	import { getEnv } from "../config/env";
	import { getSupabaseClient } from "../supabase/client";
	import { logger } from "../utils/logger";

	// ─── Types ───────────────────────────────────────────────────

	/**
	 * Input accepted by `callLLM` and `callLLMWithConsistencyCheck`.
	 */
	export interface LLMRequest {
	  /**
	   * Label for the call. It is attached to log entries and stored in the
	   * `llm_traces` row; the values "profile" and "score" additionally enable
	   * the second pass in `callLLMWithConsistencyCheck`.
	   */
	  operation: string;
	  /** Correlation id stored as `trace_id` in the `llm_traces` row. */
	  traceId: string;
	  /** Text sent as the `system` message. */
	  systemPrompt: string;
	  /** Text sent as the `user` message. */
	  userPrompt: string;

	  /** Model to start with: 0=MiniMax, 1=LLaMA70B, 2=LLaMA8B. `callLLM` uses 0 when omitted. */
	  modelIndex?: number;
	  /** Sampling temperature; `callLLM` sends 0.2 when omitted. */
	  temperature?: number;
	  /** Sent as `max_tokens`; `callLLM` sends 1024 when omitted. */
	  maxTokens?: number;
	  /** When true, a JSON object response is requested and the content is parsed into `parsed`. */
	  jsonMode?: boolean;
	  /** Stored as `company_id` in the `llm_traces` row (null when omitted). */
	  companyId?: string;
	}

	/**
	 * Result returned by `callLLM`, whether it came from a model or from the
	 * deterministic fallback.
	 */
	export interface LLMResponse {
	  /** Message text from the first choice; empty string when the response carried none. */
	  content: string;
	  /** MiniMax's built-in chain-of-thought (`reasoning_content`), or null when absent. */
	  reasoning: string | null;
	  /** Parsed JSON object when the request used `jsonMode`; null otherwise. */
	  parsed: Record<string, unknown> | null;
	  /** Display name of the model that answered, or "deterministic_fallback". */
	  model: string;
	  /** "nvidia" for model responses, "none" for the deterministic fallback. */
	  provider: string;
	  /** Token usage from the API response; all zeros when usage was not reported. */
	  tokens: { prompt: number; completion: number; total: number };
	  /** Milliseconds between sending the request and receiving the response; 0 for the fallback. */
	  latencyMs: number;
	  /** Set to true for model responses and false for the deterministic fallback. */
	  grounded: boolean;
	  /** Set to true only on the deterministic fallback response. */
	  fallbackUsed: boolean;
	}

	// ─── Model configs (ALL on NVIDIA NIM, ALL FREE) ─────────────

	/** Static description of one entry in the model priority list. */
	interface ModelConfig {
	  /** Human-readable name; reported as `model` in responses and logs. */
	  name: string;
	  /** Model identifier sent in the `model` field of the chat-completions request. */
	  model: string;
	  /** Context window size in tokens (informational; not read elsewhere in this file). */
	  maxContext: number;
	  /** Short note on the kind of work this model is intended for. */
	  bestFor: string;
	}

	/**
	 * Models in priority order. The array position is the `modelIndex` used by
	 * `callLLM`, which moves to the next position when a model fails.
	 * Columns: display name, API model id, context size (tokens), intended use.
	 */
	const MODEL_CONFIGS: ModelConfig[] = (
	  [
	    ["MiniMax M2.7", "minimaxai/minimax-m2.7", 4_000_000, "profiling, scoring, complex reasoning"],
	    ["LLaMA 3.3 70B", "meta/llama-3.3-70b-instruct", 128_000, "general tasks, reliable fallback"],
	    ["LLaMA 3.1 8B", "meta/llama-3.1-8b-instruct", 128_000, "email classification, simple checks"],
	  ] as const
	).map(([name, model, maxContext, bestFor]) => ({ name, model, maxContext, bestFor }));

	/**
	 * Named `modelIndex` values for callers, matching the positions in the
	 * model priority list.
	 */
	export const MODELS = {
	  /** Primary model — best reasoning. */
	  MINIMAX: 0,
	  /** Fallback model — reliable. */
	  LLAMA_70B: 1,
	  /** Fast model — simple tasks. */
	  LLAMA_8B: 2,
	  /** Alias for the fast model (same index as LLAMA_8B). */
	  FAST: 2,
	} as const;

	// ─── Main LLM call ──────────────────────────────────────────

	/** Maximum number of HTTP 429 retries against one model before moving on to the next model. */
	const MAX_RATE_LIMIT_RETRIES = 3;

	/** Wait in seconds used when `Retry-After` is missing or is not a non-negative number of seconds. */
	const DEFAULT_RETRY_AFTER_SECONDS = 5;

	/** Upper bound in seconds on a single rate-limit wait, so one call cannot stall for an arbitrary time. */
	const MAX_RETRY_AFTER_SECONDS = 60;

	/**
	 * Sends one chat-completion request and walks down the model priority list
	 * until a model returns a usable answer.
	 *
	 * Starting at `request.modelIndex` (default 0), each model is tried in turn:
	 * - HTTP 429: wait for `Retry-After` seconds and retry the same model, at most
	 *   `MAX_RATE_LIMIT_RETRIES` times; then move to the next model.
	 * - Any other request failure, or unparseable JSON when `jsonMode` is set:
	 *   move to the next model.
	 * - No model left (or a `modelIndex` that does not address a model):
	 *   return the deterministic fallback response.
	 *
	 * A successful response is written to the trace log before it is returned.
	 *
	 * @param request Prompts, sampling options and trace metadata for the call.
	 * @returns The model's response, or the deterministic fallback response.
	 *   Request failures are not thrown; anything `getEnv()` throws is not caught here.
	 */
	export async function callLLM(request: LLMRequest): Promise<LLMResponse> {
	  const env = getEnv();
	  let modelIndex = request.modelIndex ?? 0;
	  let rateLimitRetries = 0;

	  for (;;) {
	    // Covers indexes past the end as well as negative or fractional ones.
	    const config = MODEL_CONFIGS[modelIndex];
	    if (!config) {
	      return deterministicFallback(request);
	    }

	    const startTime = Date.now();

	    const body: Record<string, unknown> = {
	      model: config.model,
	      messages: [
	        { role: "system", content: request.systemPrompt },
	        { role: "user", content: request.userPrompt },
	      ],
	      temperature: request.temperature ?? 0.2,
	      max_tokens: request.maxTokens ?? 1024,
	      top_p: 0.9,
	    };

	    if (request.jsonMode) {
	      body.response_format = { type: "json_object" };
	    }

	    try {
	      const response = await axios.post(
	        `${env.NVIDIA_NIM_BASE_URL}/chat/completions`,
	        body,
	        {
	          headers: {
	            Authorization: `Bearer ${env.NVIDIA_API_KEY}`,
	            "Content-Type": "application/json",
	          },
	          timeout: 90_000, // MiniMax can take longer for reasoning
	        }
	      );

	      const data = response.data;
	      const message = data.choices?.[0]?.message;
	      const content = message?.content ?? "";
	      const reasoning = message?.reasoning_content ?? null; // MiniMax CoT
	      const usage = data.usage ?? { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 };
	      const latencyMs = Date.now() - startTime;

	      let parsed: Record<string, unknown> | null = null;
	      if (request.jsonMode) {
	        parsed = safeParseJSON(content);
	        if (!parsed) {
	          logger.warn({ operation: request.operation, model: config.name }, "JSON parse failed → next model");
	          modelIndex += 1;
	          rateLimitRetries = 0;
	          continue;
	        }
	      }

	      const result: LLMResponse = {
	        content,
	        reasoning,
	        parsed,
	        model: config.name,
	        provider: "nvidia",
	        tokens: {
	          prompt: usage.prompt_tokens,
	          completion: usage.completion_tokens,
	          total: usage.total_tokens,
	        },
	        latencyMs,
	        grounded: true,
	        fallbackUsed: false,
	      };

	      // Log MiniMax reasoning if present
	      if (reasoning) {
	        logger.debug(
	          { operation: request.operation, reasoning: reasoning.slice(0, 200) },
	          "MiniMax reasoning captured"
	        );
	      }

	      await logLLMTrace(request, result, true, config);
	      return result;
	    } catch (err) {
	      const errorResponse = err instanceof AxiosError ? err.response : undefined;
	      const status = errorResponse?.status;

	      if (status === 429 && rateLimitRetries < MAX_RATE_LIMIT_RETRIES) {
	        // Retry-After may be absent or an HTTP date; parseInt then yields NaN,
	        // which must not reach sleep() (a NaN delay fires immediately).
	        const requested = Number.parseInt(String(errorResponse?.headers?.["retry-after"] ?? ""), 10);
	        const retryAfter =
	          Number.isFinite(requested) && requested >= 0
	            ? Math.min(requested, MAX_RETRY_AFTER_SECONDS)
	            : DEFAULT_RETRY_AFTER_SECONDS;

	        rateLimitRetries += 1;
	        logger.warn({ model: config.name, retryAfter }, "Rate limited → waiting");
	        await sleep(retryAfter * 1000);
	        continue; // same model again
	      }

	      if (status === 429) {
	        logger.warn({ model: config.name, retries: rateLimitRetries }, "Rate limit retries exhausted → next");
	      } else if (status === 503 || status === 500) {
	        logger.warn({ model: config.name, status }, `${config.name} unavailable → next`);
	      } else {
	        logger.error({ model: config.name, err: String(err).slice(0, 200) }, "LLM call failed → next");
	      }

	      modelIndex += 1;
	      rateLimitRetries = 0;
	    }
	  }
	}

	/**
	 * Builds the response used when no model produced an answer. `callLLM`
	 * returns it once the model index has moved outside the model list.
	 * Logs the operation at error level; the response has empty content, zero
	 * token counts, `grounded: false` and `fallbackUsed: true`.
	 *
	 * @param request The request that could not be served; only `operation` is read.
	 * @returns A placeholder response marked as the deterministic fallback.
	 */
	function deterministicFallback(request: LLMRequest): LLMResponse {
	  const { operation } = request;
	  logger.error({ operation }, "ALL models failed → deterministic fallback");

	  const emptyResponse: LLMResponse = {
	    content: "",
	    reasoning: null,
	    parsed: null,
	    model: "deterministic_fallback",
	    provider: "none",
	    tokens: { prompt: 0, completion: 0, total: 0 },
	    latencyMs: 0,
	    grounded: false,
	    fallbackUsed: true,
	  };
	  return emptyResponse;
	}

	// ─── Self-consistency check ──────────────────────────────────
	// NOTE: MiniMax has built-in reasoning → consistency is higher
	// We still do dual-temperature check for critical operations

	/**
	 * Runs `callLLM` at temperature 0.1 and, for the "profile" and "score"
	 * operations, estimates how stable the answer is.
	 *
	 * Outcomes, checked in this order:
	 * - any other operation: consistent, score 1.0 (no second call);
	 * - primary is the deterministic fallback: consistent, score 0.5;
	 * - primary came from "MiniMax M2.7" with reasoning attached: consistent, score 0.95;
	 * - otherwise: a second call at temperature 0.4 is compared with the primary
	 *   and the result is consistent when the score is at least 0.75.
	 *
	 * @param request The request to run; its `temperature` is overridden.
	 * @returns The primary response together with the consistency verdict and score.
	 */
	export async function callLLMWithConsistencyCheck(
	  request: LLMRequest
	): Promise<{ primary: LLMResponse; isConsistent: boolean; consistencyScore: number }> {
	  const primary = await callLLM({ ...request, temperature: 0.1 });

	  // Every exit returns the same primary response; only the verdict differs.
	  const verdict = (consistencyScore: number, isConsistent = true) => ({
	    primary,
	    isConsistent,
	    consistencyScore,
	  });

	  const isCriticalOperation = request.operation === "profile" || request.operation === "score";
	  if (!isCriticalOperation) {
	    return verdict(1.0);
	  }

	  if (primary.fallbackUsed) {
	    return verdict(0.5);
	  }

	  // MiniMax showed its reasoning → trust it more and skip the second call.
	  const minimaxShowedReasoning = primary.model === "MiniMax M2.7" && Boolean(primary.reasoning);
	  if (minimaxShowedReasoning) {
	    return verdict(0.95);
	  }

	  const secondary = await callLLM({ ...request, temperature: 0.4, modelIndex: request.modelIndex });
	  const agreement = compareOutputs(primary, secondary);
	  return verdict(agreement, agreement >= 0.75);
	}

	/**
	 * Scores how closely two parsed responses agree, as the fraction of
	 * comparable fields that match.
	 *
	 * - "ai_readiness", "tier", "service_match": counted when the key exists in
	 *   both objects; they match when the values are strictly equal (`===`).
	 * - "total_score", "company_fit": counted when both values are numbers;
	 *   they match when they differ by at most 10.
	 *
	 * @param a First response.
	 * @param b Second response.
	 * @returns 0.5 when either response has no parsed object, 1.0 when no field
	 *   was comparable, otherwise matches / comparable fields (0 to 1).
	 */
	function compareOutputs(a: LLMResponse, b: LLMResponse): number {
	  const left = a.parsed;
	  const right = b.parsed;
	  // Nothing structured to compare on at least one side: report a neutral score.
	  if (!left || !right) return 0.5;

	  let matches = 0;
	  let total = 0;

	  for (const key of ["ai_readiness", "tier", "service_match"]) {
	    if (key in left && key in right) {
	      total++;
	      if (left[key] === right[key]) matches++;
	    }
	  }

	  for (const key of ["total_score", "company_fit"]) {
	    const leftValue = left[key];
	    const rightValue = right[key];
	    if (typeof leftValue === "number" && typeof rightValue === "number") {
	      total++;
	      if (Math.abs(leftValue - rightValue) <= 10) matches++;
	    }
	  }

	  return total === 0 ? 1.0 : matches / total;
	}

	// ─── Trace logging ───────────────────────────────────────────

	/**
	 * Writes one row describing an LLM call to the `llm_traces` table.
	 *
	 * Best-effort: a failure (thrown, or reported through the `error` field of
	 * the insert result) is logged as a warning and never propagated.
	 * Prompts and outputs are not stored; only a short hash of the first 200
	 * characters of each is.
	 *
	 * @param request The request that was sent.
	 * @param response The response received, or null when there was none.
	 * @param success Whether the call is recorded as successful.
	 * @param config Model configuration, used for the model name when `response` is null.
	 */
	async function logLLMTrace(
	  request: LLMRequest,
	  response: LLMResponse | null,
	  success: boolean,
	  config?: ModelConfig
	): Promise<void> {
	  try {
	    const db = getSupabaseClient();
	    const result = await db.from("llm_traces").insert({
	      trace_id: request.traceId,
	      operation: request.operation,
	      model: response?.model ?? config?.name ?? "unknown",
	      provider: "nvidia",
	      prompt_tokens: response?.tokens.prompt ?? 0,
	      completion_tokens: response?.tokens.completion ?? 0,
	      total_tokens: response?.tokens.total ?? 0,
	      latency_ms: response?.latencyMs ?? 0,
	      success,
	      fallback_used: response?.fallbackUsed ?? true,
	      company_id: request.companyId ?? null,
	      input_hash: hashText(request.userPrompt.slice(0, 200)),
	      output_hash: response ? hashText(response.content.slice(0, 200)) : null,
	    });

	    // The Supabase client reports failed inserts via `error` instead of throwing,
	    // so the catch below alone would let them pass unnoticed.
	    if (result?.error) {
	      logger.warn({ err: result.error }, "Trace log failed — non-critical");
	    }
	  } catch (err) {
	    logger.warn({ err }, "Trace log failed — non-critical");
	  }
	}

	// ─── Helpers ─────────────────────────────────────────────────

	/**
	 * Extracts a JSON object from model output without throwing.
	 *
	 * Steps:
	 * 1. Trim; if the text contains a ```json or ``` fence, keep only what
	 *    follows the first fence up to the next ``` (or the end of the text).
	 * 2. Parse the result as JSON.
	 * 3. If that throws, parse the span from the first `{` to the last `}`.
	 *
	 * Only plain objects are returned. Valid JSON that is a primitive, `null`
	 * or an array yields null, so callers can safely use `key in result`.
	 *
	 * @param text Raw model output.
	 * @returns The parsed object, or null when no JSON object could be extracted.
	 */
	function safeParseJSON(text: string): Record<string, unknown> | null {
	  const asRecord = (value: unknown): Record<string, unknown> | null =>
	    typeof value === "object" && value !== null && !Array.isArray(value)
	      ? (value as Record<string, unknown>)
	      : null;

	  let content = text.trim();
	  const fence = content.includes("```json") ? "```json" : content.includes("```") ? "```" : null;
	  if (fence !== null) {
	    const afterFence = content.split(fence)[1] ?? "";
	    content = (afterFence.split("```")[0] ?? "").trim();
	  }

	  try {
	    return asRecord(JSON.parse(content));
	  } catch {
	    // The model may have wrapped the object in prose: try the outermost braces.
	    const match = content.match(/\{[\s\S]*\}/);
	    if (!match) return null;
	    try {
	      return asRecord(JSON.parse(match[0]));
	    } catch {
	      return null;
	    }
	  }
	}

	/**
	 * Produces a short fingerprint of a string.
	 *
	 * @param text Text to fingerprint.
	 * @returns The first 16 hexadecimal characters of the text's SHA-256 digest.
	 */
	function hashText(text: string): string {
	  const hasher = createHash("sha256");
	  hasher.update(text);
	  const hexDigest = hasher.digest("hex");
	  return hexDigest.substring(0, 16);
	}

	/**
	 * Returns a promise that resolves after a timer of the given length fires.
	 *
	 * @param ms Delay passed to `setTimeout`, in milliseconds.
	 * @returns A promise that resolves with no value.
	 */
	function sleep(ms: number): Promise<void> {
	  return new Promise<void>((done) => {
	    setTimeout(done, ms);
	  });
	}