// RAM-compatibility math for running a GGUF quant on a given MacBook config.
// Estimates are heuristic — the HF API doesn't expose n_layers / n_kv_heads
// per quant, so KV cache is approximated from param count and context length.

import type { MacBookConfig } from './macbooks';

/** One GGUF file (a single quantization) offered for a model. */
export interface Quant {
  /** File path; a `/` in it is treated by `llamaServerSnippet` as "lives in a subdirectory". */
  path: string;
  /** File size in bytes (`checkCompat` divides it by 1024³ to get GB). */
  size: number;
  /** Not read in this file; presumably `size` pre-converted to GB — confirm units with the producer. */
  sizeGB: number;
  /** Quantization label such as `Q4_K_M`; matched case-insensitively against the bits-per-weight table. */
  quant: string;
  /** Not read in this file; presumably marks a model split across several files — TODO confirm whether `size` covers all shards. */
  sharded: boolean;
}

/** Metadata for one model and the quant files it offers. */
export interface ModelEntry {
  /** Identifier interpolated into `llama-server -hf <id>`; expected in `org/repo` form per the snippet comment. */
  id: string;
  // `author` through `pipeline_tag` are not read in this file; presumably
  // listing metadata from the upstream API — verify against the producer.
  author: string;
  name: string;
  downloads: number;
  likes: number;
  pipeline_tag: string | null;
  /** Parameter count in billions; when `null`, `checkCompat` infers it from the quant's file size. */
  params_b: number | null;
  /** Not read in this file. */
  arch: string | null;
  /** Not read in this file; note that `checkCompat` takes its context length as a separate argument. */
  context_length: number | null;
  /** Not read in this file. */
  tags: string[];
  /** Quant files available for this model; `bestQuantForModel` ranks them by `size`. */
  quants: Quant[];
}

// Outcome of a compatibility check, based on required / usable memory:
// 'fits' at <= 85%, 'tight' above 85% up to 100%, 'wont_fit' above 100%.
// 'unknown' is part of the type but is never produced by checkCompat in this file.
export type Verdict = 'fits' | 'tight' | 'wont_fit' | 'unknown';

/** Result of `checkCompat`: the verdict plus the numbers it was derived from. */
export interface CompatResult {
  /** Overall classification; see `Verdict` for the utilization thresholds. */
  verdict: Verdict;
  /** Estimated memory needed in GB: model file + KV cache estimate + fixed runtime overhead. */
  requiredGB: number;
  /** Memory budget in GB after applying the OS reserve and the GPU wired-memory cap. */
  usableGB: number;
  /** `requiredGB / usableGB`; values above 1 mean the model does not fit. */
  utilization: number; // 0-1+
  /** Bandwidth-bound tokens/sec estimate rounded to one decimal; `null` when no estimate is produced (e.g. the quant won't fit). */
  estTokensPerSec: number | null;
  /** Human-readable one-line explanation of the verdict. */
  reason: string;
}

// Bytes per "GB" as used throughout this module (binary: 1024 * 1024 * 1024).
const GB = 1024 * 1024 * 1024;

/**
 * Rough FP16 KV-cache footprint in GB.
 *
 * Scales linearly with both model size and context length, anchored at
 * ~0.13 GB per billion parameters at a 4096-token context. Real values can
 * differ by 2-3x depending on architecture (GQA shrinks the cache).
 *
 * @param params_b Parameter count in billions.
 * @param ctx Context length in tokens.
 * @returns Estimated KV-cache size in GB.
 */
function estimateKVCacheGB(params_b: number, ctx: number): number {
  const gbPerBillionAt4k = 0.13;
  const contextScale = ctx / 4096;
  return gbPerBillionAt4k * params_b * contextScale;
}

/**
 * Heuristic amount of RAM (GB) assumed to be held back for macOS itself.
 *
 * Tiers, by total installed RAM:
 * - up to 8 GB   -> 3 GB
 * - up to 16 GB  -> 3.5 GB
 * - up to 32 GB  -> 4 GB
 * - above 32 GB  -> 5 GB
 *
 * @param totalGB Total system RAM in GB.
 * @returns Reserved GB for the matching tier.
 */
function osReserveGB(totalGB: number): number {
  // [inclusive upper bound on total RAM, reserve] in ascending order.
  const tiers: ReadonlyArray<readonly [number, number]> = [
    [8, 3],
    [16, 3.5],
    [32, 4],
  ];
  for (const [maxTotalGB, reserveGB] of tiers) {
    if (totalGB <= maxTotalGB) {
      return reserveGB;
    }
  }
  return 5;
}

/**
 * Upper bound (GB) on memory the model may occupy given the GPU wiring limit.
 *
 * On unified-memory machines this heuristic assumes only a fraction of RAM
 * can be wired for the GPU by default (raising `iogpu.wired_limit_mb` would
 * lift it): 67% up to 16 GB, 75% up to 64 GB, 85% above that. Without unified
 * memory the CPU path is assumed and the whole of RAM is returned.
 *
 * @param totalGB Total system RAM in GB.
 * @param unified Whether the machine has unified memory.
 * @returns The cap in GB.
 */
function gpuWiredCapGB(totalGB: number, unified: boolean): number {
  if (!unified) {
    // CPU path: no wiring limit applies.
    return totalGB;
  }
  const wiredFraction = totalGB <= 16 ? 0.67 : totalGB <= 64 ? 0.75 : 0.85;
  return totalGB * wiredFraction;
}

/**
 * Estimates whether `quant` of `model` can run on the machine described by `cfg`.
 *
 * Required memory is the GGUF file size plus a heuristic KV-cache estimate plus
 * a fixed 0.8 GB runtime overhead. Usable memory is the smaller of
 * (RAM - OS reserve) and the GPU wired-memory cap, never below zero.
 * All figures are rough heuristics.
 *
 * @param cfg   Machine configuration; reads `ram` (GB), `macbook.unifiedMemory`
 *              and `macbook.bandwidthGBs`.
 * @param model Model metadata; `params_b` is used when present, otherwise the
 *              parameter count is inferred from the file size and quant label.
 * @param quant Quant file to evaluate (`size` in bytes).
 * @param ctx   Context length in tokens used to size the KV cache (default 4096).
 * @returns The verdict and the numbers behind it. `estTokensPerSec` is `null`
 *          when the model won't fit or no finite estimate can be computed.
 */
export function checkCompat(
  cfg: MacBookConfig,
  model: ModelEntry,
  quant: Quant,
  ctx: number = 4096
): CompatResult {
  const fileGB = quant.size / GB;
  const params = model.params_b ?? estimateParamsFromSize(fileGB, quant.quant);
  // With no usable parameter count (0 / NaN) fall back to a flat 1 GB KV cache.
  const kv = params ? estimateKVCacheGB(params, ctx) : 1.0;
  const overhead = 0.8; // compute buffers, activations, runtime
  const requiredGB = fileGB + kv + overhead;

  const total = cfg.ram;
  const reserve = osReserveGB(total);
  const wiredCap = gpuWiredCapGB(total, cfg.macbook.unifiedMemory);
  // Clamp at zero: if RAM is at or below the OS reserve the budget would go
  // negative, utilization would go negative, and the verdict would wrongly
  // come out as 'fits'. With a zero budget utilization is Infinity -> 'wont_fit'.
  const usableGB = Math.max(0, Math.min(total - reserve, wiredCap));
  const utilization = requiredGB / usableGB;

  let verdict: Verdict;
  if (utilization > 1) verdict = 'wont_fit';
  else if (utilization > 0.85) verdict = 'tight';
  else verdict = 'fits';

  // Throughput estimate: bandwidth-bound.
  // tok/s ≈ bandwidth_GBs / file_GB * efficiency
  // Apple Silicon Metal hits ~70% of peak BW for inference; CPU path ~25%.
  let estTokensPerSec: number | null = null;
  if (verdict !== 'wont_fit') {
    const eff = cfg.macbook.unifiedMemory ? 0.7 : 0.25;
    const rawTokensPerSec = (cfg.macbook.bandwidthGBs / fileGB) * eff;
    // A zero-byte file or missing bandwidth would give Infinity/NaN; report
    // "no estimate" rather than a meaningless number.
    estTokensPerSec = Number.isFinite(rawTokensPerSec)
      ? Math.round(rawTokensPerSec * 10) / 10
      : null;
  }

  let reason: string;
  if (verdict === 'wont_fit')
    reason = `Needs ~${requiredGB.toFixed(1)} GB but only ~${usableGB.toFixed(1)} GB usable.`;
  else if (verdict === 'tight')
    reason = `Tight: ~${(utilization * 100).toFixed(0)}% of usable RAM.`;
  else
    reason = `Comfortable: ~${(utilization * 100).toFixed(0)}% of usable RAM.`;

  return { verdict, requiredGB, usableGB, utilization, estTokensPerSec, reason };
}

// Approximate bits-per-weight for common GGUF quant labels (rough).
// Built once at module load rather than on every call.
const BITS_PER_WEIGHT: Readonly<Record<string, number>> = {
  F32: 32,
  BF16: 16,
  F16: 16,
  Q8_0: 8.5,
  Q6_K: 6.5,
  Q5_K_M: 5.5,
  Q5_K_S: 5.5,
  Q5_0: 5.5,
  Q4_K_M: 4.8,
  Q4_K_S: 4.6,
  Q4_0: 4.5,
  Q3_K_M: 3.9,
  Q3_K_S: 3.5,
  Q2_K: 3.0,
  IQ4_XS: 4.3,
  IQ3_M: 3.7,
  IQ2_M: 2.7,
  IQ2_XS: 2.4
};

/**
 * Infers a model's parameter count (in billions) from a quant file's size.
 *
 * Uses size_bytes ≈ params * bits_per_weight / 8. The bits-per-weight value
 * comes from the table above (case-insensitive match). For labels that are not
 * listed (e.g. `Q3_K_L`, `Q5_1`, `IQ4_NL`, `UD-Q2_K_XL`) the bit-width digit
 * after `Q` is used as `digit + 0.5`, so an unlisted 2-bit or 8-bit quant is
 * not treated like a 5-bit one. Labels with no recognizable bit-width fall
 * back to 5 bits per weight.
 *
 * @param fileGB File size in GB (bytes / 1024³).
 * @param quant  Quantization label.
 * @returns Estimated parameter count in billions.
 */
function estimateParamsFromSize(fileGB: number, quant: string): number {
  const label = quant.toUpperCase();
  let bits = BITS_PER_WEIGHT[label];
  if (bits === undefined) {
    // Single digit 1-8 directly after a 'Q' (covers both Qn_* and IQn_*).
    const widthMatch = /Q([1-8])(?!\d)/.exec(label);
    bits = widthMatch ? Number(widthMatch[1]) + 0.5 : 5;
  }
  // size_bytes ≈ params * bpw / 8
  return (fileGB * GB * 8) / (bits * 1e9);
}

/** A selected quant for a model together with the compatibility result that justified it (returned by `bestQuantForModel`). */
export interface BestPick {
  /** The model the quant belongs to (passed through unchanged). */
  model: ModelEntry;
  /** The chosen quant file. */
  quant: Quant;
  /** Output of `checkCompat` for this model/quant pair. */
  result: CompatResult;
}

/**
 * Picks the best quant of `model` for the given machine and context length.
 *
 * Quants are considered from largest file to smallest. The largest one whose
 * verdict is 'fits' wins; if none fits comfortably, the largest 'tight' one is
 * returned instead. Each quant is evaluated exactly once.
 *
 * @param cfg   Machine configuration forwarded to `checkCompat`.
 * @param model Model whose `quants` are ranked (the array is not mutated).
 * @param ctx   Context length in tokens forwarded to `checkCompat`.
 * @returns The chosen quant with its result, or `null` when every quant is
 *          'wont_fit' (or the model has no quants).
 */
export function bestQuantForModel(
  cfg: MacBookConfig,
  model: ModelEntry,
  ctx: number
): BestPick | null {
  const largestFirst = [...model.quants].sort((a, b) => b.size - a.size);
  // Largest 'tight' quant seen so far; only used if nothing 'fits'.
  let tightFallback: BestPick | null = null;
  for (const quant of largestFirst) {
    const result = checkCompat(cfg, model, quant, ctx);
    if (result.verdict === 'fits') return { model, quant, result };
    if (result.verdict === 'tight' && tightFallback === null) {
      tightFallback = { model, quant, result };
    }
  }
  return tightFallback;
}

/**
 * Builds a copy-pasteable `llama-server` command line for a model quant.
 *
 * Top-level files use the `-hf org/repo:QUANT` shorthand. Files inside a
 * subdirectory use `-hf org/repo --hf-file <path>` with the file's full
 * repo-relative path (the directory part is required to locate the file).
 * Arguments containing characters outside a conservative safe set are
 * single-quoted so the command survives being pasted into a POSIX shell.
 *
 * @param model   Model whose `id` is the repository reference.
 * @param quant   Quant file; `path` decides which form is used.
 * @param ctx     Context length passed as `-c`.
 * @param unified `true` offloads all layers (`-ngl 99`); `false` offloads none (`-ngl 0`).
 * @returns The command string.
 */
export function llamaServerSnippet(
  model: ModelEntry,
  quant: Quant,
  ctx: number,
  unified: boolean
): string {
  const isTopLevel = !quant.path.includes('/');
  const ngl = unified ? 99 : 0; // offload everything on Apple Silicon, none on Intel
  const ref = isTopLevel
    ? `-hf ${quoteShellArg(`${model.id}:${quant.quant}`)}`
    : `-hf ${quoteShellArg(model.id)} --hf-file ${quoteShellArg(quant.path)}`;
  return `llama-server ${ref} -c ${ctx} -ngl ${ngl}`;
}

// Returns `value` unchanged when it only contains shell-safe characters,
// otherwise wraps it in single quotes (embedded quotes become '\'').
function quoteShellArg(value: string): string {
  if (/^[A-Za-z0-9_.\/:@%+=,-]+$/.test(value)) return value;
  return `'${value.replace(/'/g, "'\\''")}'`;
}