// Source: Hugging Face Space "dexakif/clienttarget" (status: Running) · File size: 10,202 Bytes · commit bd28470

/**
 * Multi-Model LLM Client — All FREE on NVIDIA NIM
 * 
 * 3 models, 1 provider, 1 API key, $0 cost:
 * 
 * Priority 1: MiniMax M2.7     → Best reasoning, 4M context, built-in CoT
 * Priority 2: LLaMA 3.3 70B    → Reliable, proven, 128K context
 * Priority 3: LLaMA 3.1 8B     → Fast, cheap, for simple tasks
 * Priority 4: Deterministic     → Zero LLM, zero hallucination
 * 
 * All on: https://integrate.api.nvidia.com/v1
 * All use: same NVIDIA_API_KEY
 * 
 * MiniMax M2.7 special feature:
 *   Response includes `reasoning_content` field — chain-of-thought
 *   reasoning happens AUTOMATICALLY inside the model.
 *   We don't need to prompt "think step by step" — it does it natively.
 */

import axios, { AxiosError } from "axios";
import { createHash } from "crypto";
import { getEnv } from "../config/env";
import { getSupabaseClient } from "../supabase/client";
import { logger } from "../utils/logger";

// ─── Types ───────────────────────────────────────────────────

/**
 * Input for a single LLM call made through `callLLM`.
 */
export interface LLMRequest {
  /**
   * Label for the call. Written to logs and to the `llm_traces` row; the
   * consistency check only samples a second time for "profile" and "score".
   */
  operation: string;
  /**
   * Index into MODEL_CONFIGS of the first model to try. Defaults to 0; an
   * index at or past the end of the list yields the deterministic fallback.
   */
  modelIndex?: number;       // 0=MiniMax, 1=LLaMA70B, 2=LLaMA8B
  /** Sent as the `system` message. */
  systemPrompt: string;
  /** Sent as the `user` message; its first 200 characters are hashed into the trace row. */
  userPrompt: string;
  /** Sampling temperature; defaults to 0.2 when omitted. */
  temperature?: number;
  /** Sent as `max_tokens`; defaults to 1024 when omitted. */
  maxTokens?: number;
  /**
   * When true, the request asks for `response_format: json_object` and the
   * reply is parsed; a reply that cannot be parsed moves on to the next model.
   */
  jsonMode?: boolean;
  /** Stored as `trace_id` in the `llm_traces` row. */
  traceId: string;
  /** Stored as `company_id` in the `llm_traces` row (null when omitted). */
  companyId?: string;
}

/**
 * Result of `callLLM`: either a model reply or the empty deterministic
 * fallback produced when no model could answer.
 */
export interface LLMResponse {
  /** Text of the first choice's message; empty string when absent or on fallback. */
  content: string;
  /** The message's `reasoning_content` field when present, otherwise null. */
  reasoning: string | null;   // MiniMax's built-in chain-of-thought
  /** Parsed JSON of `content` when the request used `jsonMode`; otherwise null. */
  parsed: Record<string, unknown> | null;
  /** Display name of the model that answered, or "deterministic_fallback". */
  model: string;
  /** "nvidia" for model replies, "none" for the deterministic fallback. */
  provider: string;
  /** Token counts copied from the API `usage` object (zeros when it is missing). */
  tokens: { prompt: number; completion: number; total: number };
  /** Milliseconds from just before the HTTP request until the reply was read; 0 on fallback. */
  latencyMs: number;
  /** True for model replies, false for the deterministic fallback. */
  grounded: boolean;
  /**
   * True only for the deterministic fallback. A reply from a lower-priority
   * model after earlier models failed still reports false.
   */
  fallbackUsed: boolean;
}

// ─── Model configs (ALL on NVIDIA NIM, ALL FREE) ─────────────

/** Static description of one model reachable through the NVIDIA NIM endpoint. */
interface ModelConfig {
  /** Human-readable name; used in logs and reported as `LLMResponse.model`. */
  name: string;
  /** Model identifier sent in the request body's `model` field. */
  model: string;
  /** Context window in tokens. Informational: not read by the code in this file. */
  maxContext: number;
  /** Free-text hint about suitable workloads. Informational: not read by the code in this file. */
  bestFor: string;
}

/**
 * Models in fallback-priority order. `callLLM` starts at the requested index
 * and advances to the next entry whenever a model fails, so the array order
 * is the fallback chain; the positions must stay in sync with `MODELS`.
 */
const MODEL_CONFIGS: ModelConfig[] = [
  {
    name: "MiniMax M2.7",
    model: "minimaxai/minimax-m2.7",
    maxContext: 4_000_000,         // 4M tokens!
    bestFor: "profiling, scoring, complex reasoning",
  },
  {
    name: "LLaMA 3.3 70B",
    model: "meta/llama-3.3-70b-instruct",
    maxContext: 128_000,
    bestFor: "general tasks, reliable fallback",
  },
  {
    name: "LLaMA 3.1 8B",
    model: "meta/llama-3.1-8b-instruct",
    maxContext: 128_000,
    bestFor: "email classification, simple checks",
  },
];

/**
 * Named indices into MODEL_CONFIGS, intended as values for
 * `LLMRequest.modelIndex`. `FAST` and `LLAMA_8B` refer to the same entry.
 */
export const MODELS = {
  MINIMAX: 0,       // Primary — best reasoning
  LLAMA_70B: 1,     // Fallback — reliable
  LLAMA_8B: 2,      // Fast — simple tasks
  FAST: 2,          // alias
} as const;

// ─── Main LLM call ──────────────────────────────────────────

/** Maximum number of HTTP 429 retries against one model before moving to the next. */
const MAX_RATE_LIMIT_RETRIES = 3;

/** Wait, in seconds, used when the `Retry-After` header is missing or unparseable. */
const DEFAULT_RETRY_AFTER_SECONDS = 5;

/** Upper bound, in seconds, on a single rate-limit wait so one call cannot stall indefinitely. */
const MAX_RETRY_AFTER_SECONDS = 60;

/**
 * Converts a `Retry-After` header value into a wait in seconds.
 *
 * The header may carry either a number of seconds or an HTTP date. Anything
 * missing or unparseable yields the default wait; the result is clamped to
 * the range 0..MAX_RETRY_AFTER_SECONDS so it is always a finite number.
 */
function parseRetryAfterSeconds(headerValue: unknown): number {
  if (headerValue === undefined || headerValue === null) {
    return DEFAULT_RETRY_AFTER_SECONDS;
  }

  const raw = String(headerValue).trim();
  let seconds = DEFAULT_RETRY_AFTER_SECONDS;

  if (/^\d+$/.test(raw)) {
    seconds = Number(raw);
  } else {
    const retryAtMs = Date.parse(raw);
    if (!Number.isNaN(retryAtMs)) {
      seconds = Math.ceil((retryAtMs - Date.now()) / 1000);
    }
  }

  return Math.min(Math.max(seconds, 0), MAX_RETRY_AFTER_SECONDS);
}

/**
 * Sends a chat-completion request, walking the model fallback chain.
 *
 * Starts at `request.modelIndex` (default 0) and tries each entry of
 * MODEL_CONFIGS in order. A model is abandoned for the next one when the
 * request fails, when it stays rate limited after MAX_RATE_LIMIT_RETRIES
 * retries, or when `jsonMode` is set and the reply cannot be parsed as JSON.
 * When no model is left — or the index does not refer to a configured model —
 * the deterministic fallback response is returned instead.
 *
 * @param request Prompts and options for the call.
 * @returns The first successful model reply, or the deterministic fallback.
 *          Model failures are logged and never thrown to the caller.
 */
export async function callLLM(request: LLMRequest): Promise<LLMResponse> {
  const env = getEnv();

  let modelIndex = request.modelIndex ?? 0;
  let rateLimitRetries = 0;

  for (;;) {
    // Covers indices past the end of the chain as well as negative or
    // non-integer indices, which would otherwise crash on `config.model`.
    const config = MODEL_CONFIGS[modelIndex];
    if (!config) {
      return deterministicFallback(request);
    }

    const startTime = Date.now();

    const body: Record<string, unknown> = {
      model: config.model,
      messages: [
        { role: "system", content: request.systemPrompt },
        { role: "user", content: request.userPrompt },
      ],
      temperature: request.temperature ?? 0.2,
      max_tokens: request.maxTokens ?? 1024,
      top_p: 0.9,
    };

    if (request.jsonMode) {
      body.response_format = { type: "json_object" };
    }

    try {
      const response = await axios.post(
        `${env.NVIDIA_NIM_BASE_URL}/chat/completions`,
        body,
        {
          headers: {
            Authorization: `Bearer ${env.NVIDIA_API_KEY}`,
            "Content-Type": "application/json",
          },
          timeout: 90_000, // MiniMax can take longer for reasoning
        }
      );

      const data = response.data;
      const message = data.choices?.[0]?.message;
      const content = message?.content ?? "";
      const reasoning = message?.reasoning_content ?? null; // MiniMax CoT
      const usage = data.usage ?? { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 };
      const latencyMs = Date.now() - startTime;

      let parsed: Record<string, unknown> | null = null;
      if (request.jsonMode) {
        parsed = safeParseJSON(content);
        if (!parsed) {
          logger.warn({ operation: request.operation, model: config.name }, "JSON parse failed → next model");
          modelIndex += 1;
          rateLimitRetries = 0;
          continue;
        }
      }

      const result: LLMResponse = {
        content,
        reasoning,
        parsed,
        model: config.name,
        provider: "nvidia",
        tokens: {
          prompt: usage.prompt_tokens,
          completion: usage.completion_tokens,
          total: usage.total_tokens,
        },
        latencyMs,
        grounded: true,
        // Stays false even when an earlier model failed; only the
        // deterministic fallback reports true.
        fallbackUsed: false,
      };

      // Log MiniMax reasoning if present
      if (reasoning) {
        logger.debug({ operation: request.operation, reasoning: reasoning.slice(0, 200) },
          "MiniMax reasoning captured");
      }

      await logLLMTrace(request, result, true, config);
      return result;

    } catch (err) {
      if (err instanceof AxiosError) {
        const status = err.response?.status;

        // Retry the same model a bounded number of times; once the budget is
        // spent the error falls through to the generic "next model" path.
        if (status === 429 && rateLimitRetries < MAX_RATE_LIMIT_RETRIES) {
          const retryAfter = parseRetryAfterSeconds(err.response?.headers["retry-after"]);
          rateLimitRetries += 1;
          logger.warn({ model: config.name, retryAfter }, "Rate limited → waiting");
          await sleep(retryAfter * 1000);
          continue;
        }

        if (status === 503 || status === 500) {
          logger.warn({ model: config.name, status }, `${config.name} unavailable → next`);
          modelIndex += 1;
          rateLimitRetries = 0;
          continue;
        }
      }

      logger.error({ model: config.name, err: String(err).slice(0, 200) }, "LLM call failed → next");
      modelIndex += 1;
      rateLimitRetries = 0;
    }
  }
}

/**
 * Builds the response returned when no model could answer.
 *
 * Logs the failure for the given operation and returns an empty, ungrounded
 * response flagged with `fallbackUsed: true`.
 *
 * @param request The request that could not be served; only `operation` is read.
 * @returns An empty response attributed to "deterministic_fallback".
 */
function deterministicFallback(request: LLMRequest): LLMResponse {
  const { operation } = request;
  logger.error({ operation }, "ALL models failed → deterministic fallback");

  const noTokens = { prompt: 0, completion: 0, total: 0 };
  const emptyResponse: LLMResponse = {
    content: "",
    reasoning: null,
    parsed: null,
    model: "deterministic_fallback",
    provider: "none",
    tokens: noTokens,
    latencyMs: 0,
    grounded: false,
    fallbackUsed: true,
  };
  return emptyResponse;
}

// ─── Self-consistency check ──────────────────────────────────
// NOTE: MiniMax has built-in reasoning → consistency is higher
// We still do dual-temperature check for critical operations

/**
 * Calls the LLM at temperature 0.1 and, for critical operations, samples a
 * second time at temperature 0.4 to measure how stable the answer is.
 *
 * The second sample is skipped (and the result reported as consistent) when:
 * - the operation is not "profile" or "score" (score 1.0),
 * - the primary call ended in the deterministic fallback (score 0.5),
 * - MiniMax M2.7 answered and returned reasoning (score 0.95).
 *
 * @param request The request to run; its `temperature` is overridden.
 * @returns The primary response plus the consistency score; the result counts
 *          as consistent when the score is at least 0.75.
 */
export async function callLLMWithConsistencyCheck(
  request: LLMRequest
): Promise<{ primary: LLMResponse; isConsistent: boolean; consistencyScore: number }> {
  const primary = await callLLM({ ...request, temperature: 0.1 });

  // Shorthand for the early exits, all of which report a consistent result.
  const trusted = (consistencyScore: number) => ({
    primary,
    isConsistent: true,
    consistencyScore,
  });

  const isCriticalOperation = ["profile", "score"].includes(request.operation);
  if (!isCriticalOperation) {
    return trusted(1.0);
  }

  if (primary.fallbackUsed) {
    return trusted(0.5);
  }

  // A MiniMax answer that exposes its reasoning is trusted without resampling;
  // only answers without that signal get the second sample below.
  const answeredByMiniMax = primary.model === "MiniMax M2.7";
  if (answeredByMiniMax && primary.reasoning) {
    return trusted(0.95);
  }

  const resampleRequest: LLMRequest = {
    ...request,
    temperature: 0.4,
    modelIndex: request.modelIndex,
  };
  const secondary = await callLLM(resampleRequest);
  const consistencyScore = compareOutputs(primary, secondary);

  return { primary, isConsistent: consistencyScore >= 0.75, consistencyScore };
}

/**
 * Scores how closely two parsed responses agree, from 0 to 1.
 *
 * Categorical fields ("ai_readiness", "tier", "service_match") are compared
 * when present in both outputs and must be strictly equal. Numeric fields
 * ("total_score", "company_fit") are compared when both are numbers and
 * agree when they differ by at most 10.
 *
 * @returns 0.5 when either response has no parsed JSON, 1.0 when no field
 *          could be compared, otherwise the fraction of compared fields that agree.
 */
function compareOutputs(a: LLMResponse, b: LLMResponse): number {
  const left = a.parsed;
  const right = b.parsed;
  if (!left || !right) {
    return 0.5;
  }

  const categoricalKeys = ["ai_readiness", "tier", "service_match"];
  const numericKeys = ["total_score", "company_fit"];

  // One entry per field that could be compared: true when the two sides agree.
  const agreements: boolean[] = [];

  categoricalKeys.forEach((field) => {
    if (field in left && field in right) {
      agreements.push(left[field] === right[field]);
    }
  });

  numericKeys.forEach((field) => {
    const leftValue = left[field];
    const rightValue = right[field];
    if (typeof leftValue === "number" && typeof rightValue === "number") {
      agreements.push(Math.abs(leftValue - rightValue) <= 10);
    }
  });

  if (agreements.length === 0) {
    return 1.0;
  }
  const agreed = agreements.filter((ok) => ok).length;
  return agreed / agreements.length;
}

// ─── Trace logging ───────────────────────────────────────────

/**
 * Records one LLM call in the `llm_traces` table. Best effort: any failure is
 * logged as a warning and never propagated to the caller.
 *
 * Prompt and output are not stored; only a short hash of the first 200
 * characters of each is written.
 *
 * @param request  The request that was sent (trace id, operation, company id, prompt).
 * @param response The response to record, or null when there is none; missing
 *                 values are stored as zeros and `fallback_used` as true.
 * @param success  Whether the call succeeded; stored as-is.
 * @param config   Model config used for the call; supplies the model name when
 *                 `response` is null.
 */
async function logLLMTrace(
  request: LLMRequest,
  response: LLMResponse | null,
  success: boolean,
  config?: ModelConfig
): Promise<void> {
  try {
    const db = getSupabaseClient();
    // The Supabase client reports query failures through the returned `error`
    // field rather than by throwing, so it has to be checked explicitly.
    const { error } = await db.from("llm_traces").insert({
      trace_id: request.traceId,
      operation: request.operation,
      model: response?.model ?? config?.name ?? "unknown",
      provider: "nvidia",
      prompt_tokens: response?.tokens.prompt ?? 0,
      completion_tokens: response?.tokens.completion ?? 0,
      total_tokens: response?.tokens.total ?? 0,
      latency_ms: response?.latencyMs ?? 0,
      success,
      fallback_used: response?.fallbackUsed ?? true,
      company_id: request.companyId ?? null,
      input_hash: hashText(request.userPrompt.slice(0, 200)),
      output_hash: response ? hashText(response.content.slice(0, 200)) : null,
    });

    if (error) {
      logger.warn({ err: error }, "Trace log failed — non-critical");
    }
  } catch (err) {
    // Covers client construction failures and network-level exceptions.
    logger.warn({ err }, "Trace log failed — non-critical");
  }
}

// ─── Helpers ─────────────────────────────────────────────────

/**
 * Extracts a JSON object from model output without ever throwing.
 *
 * Steps:
 * 1. If the text contains a Markdown code fence (preferring "```json"), only
 *    the fence's content is considered; an unclosed fence runs to the end.
 * 2. The candidate is parsed as JSON.
 * 3. If that fails, the span from the first "{" to the last "}" is parsed.
 *
 * Only plain objects are accepted. Valid JSON that is not an object (number,
 * string, boolean, null, array) returns null, so callers can rely on the
 * declared return type when reading keys.
 *
 * @param text Raw model output.
 * @returns The parsed object, or null when no JSON object could be extracted.
 */
function safeParseJSON(text: string): Record<string, unknown> | null {
  const isPlainObject = (value: unknown): value is Record<string, unknown> =>
    typeof value === "object" && value !== null && !Array.isArray(value);

  let content = text.trim();

  const fence = content.includes("```json") ? "```json" : "```";
  const fenceStart = content.indexOf(fence);
  if (fenceStart !== -1) {
    const afterFence = content.slice(fenceStart + fence.length);
    const fenceEnd = afterFence.indexOf("```");
    content = (fenceEnd === -1 ? afterFence : afterFence.slice(0, fenceEnd)).trim();
  }

  let value: unknown;
  try {
    value = JSON.parse(content);
  } catch {
    // Not pure JSON: fall back to the outermost brace-delimited span.
    const match = content.match(/\{[\s\S]*\}/);
    if (!match) return null;
    try {
      value = JSON.parse(match[0]);
    } catch {
      return null;
    }
  }

  return isPlainObject(value) ? value : null;
}

/**
 * Returns a short fingerprint of `text`: the first 16 hexadecimal characters
 * of its SHA-256 digest.
 */
function hashText(text: string): string {
  const hasher = createHash("sha256");
  hasher.update(text);
  const hexDigest = hasher.digest("hex");
  return hexDigest.substring(0, 16);
}

/**
 * Returns a promise that resolves after `ms` milliseconds.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}