// RAM-compatibility math for running a GGUF quant on a given MacBook config.
// Estimates are heuristic — the HF API doesn't expose n_layers / n_kv_heads
// per quant, so KV cache is approximated from param count and context length.

import type { MacBookConfig } from './macbooks';

/** One downloadable GGUF quantization of a model. */
export interface Quant {
  /** File path; a path containing `/` is treated as living in a subdirectory. */
  path: string;
  /** File size in bytes (divided by 1024^3 to obtain GB in `checkCompat`). */
  size: number;
  sizeGB: number;
  /** Quant label, e.g. `Q4_K_M`; looked up case-insensitively for bits-per-weight. */
  quant: string;
  sharded: boolean;
}

/** A model together with its available quants. */
export interface ModelEntry {
  /** Model reference as passed to `llama-server -hf`. */
  id: string;
  author: string;
  name: string;
  downloads: number;
  likes: number;
  pipeline_tag: string | null;
  /** Parameter count in billions; when null it is estimated from the file size. */
  params_b: number | null;
  arch: string | null;
  context_length: number | null;
  tags: string[];
  quants: Quant[];
}

export type Verdict = 'fits' | 'tight' | 'wont_fit' | 'unknown';

/** Outcome of a compatibility check for one (config, model, quant, ctx) tuple. */
export interface CompatResult {
  verdict: Verdict;
  /** Estimated memory needed: file + KV cache + fixed overhead. */
  requiredGB: number;
  /** Memory assumed to be available to the model (never negative). */
  usableGB: number;
  /** requiredGB / usableGB; 0-1+, `Infinity` when nothing is usable. */
  utilization: number;
  /** Rough decode speed; null when the model won't fit or the size is unknown. */
  estTokensPerSec: number | null;
  /** Human-readable explanation of the verdict. */
  reason: string;
}

const GB = 1024 ** 3;

// Heuristic KV cache size at FP16: ~0.13 GB per 1B params per 4096 ctx.
// Real numbers vary 2-3x by architecture (GQA reduces this).
function estimateKVCacheGB(params_b: number, ctx: number): number {
  return 0.13 * params_b * (ctx / 4096);
}

// macOS reserves a chunk of RAM for the system. Roughly:
//   - up to 8 GB:   ~3 GB reserved (heavy)
//   - up to 16 GB:  ~3.5 GB
//   - up to 32 GB:  ~4 GB
//   - above 32 GB:  ~5 GB
function osReserveGB(totalGB: number): number {
  if (totalGB <= 8) return 3;
  if (totalGB <= 16) return 3.5;
  if (totalGB <= 32) return 4;
  return 5;
}

// On Apple Silicon, llama.cpp can only wire a fraction of RAM as VRAM by
// default; `iogpu.wired_limit_mb` raises this. We assume the realistic default
// cap, which scales with RAM size: 67% up to 16 GB, 75% up to 64 GB, 85% above.
function gpuWiredCapGB(totalGB: number, unified: boolean): number {
  if (!unified) return totalGB; // CPU path; whole RAM accessible
  if (totalGB <= 16) return totalGB * 0.67;
  if (totalGB <= 64) return totalGB * 0.75;
  return totalGB * 0.85;
}

/**
 * Estimates whether `quant` of `model` fits in memory on `cfg`.
 *
 * Required memory is file size + heuristic KV cache + a fixed 0.8 GB overhead.
 * Usable memory is the smaller of (RAM - OS reserve) and the GPU wired cap.
 *
 * @param cfg   Machine config; reads `ram`, `macbook.unifiedMemory` and
 *              `macbook.bandwidthGBs`.
 * @param model Model metadata; `params_b` is used for the KV estimate when set.
 * @param quant Quant to check; `size` is in bytes.
 * @param ctx   Context length in tokens (default 4096).
 * @returns Verdict plus the numbers behind it. `'unknown'` is returned when the
 *          quant's file size is missing, zero or not finite.
 */
export function checkCompat(
  cfg: MacBookConfig,
  model: ModelEntry,
  quant: Quant,
  ctx: number = 4096
): CompatResult {
  const total = cfg.ram;
  const reserve = osReserveGB(total);
  const wiredCap = gpuWiredCapGB(total, cfg.macbook.unifiedMemory);
  // Clamp at 0: on machines with less RAM than the OS reserve the subtraction
  // goes negative, which would otherwise flip utilization negative ("fits").
  const usableGB = Math.max(0, Math.min(total - reserve, wiredCap));

  const fileGB = quant.size / GB;
  if (!Number.isFinite(fileGB) || fileGB <= 0) {
    // Without a real file size neither memory nor throughput can be estimated.
    return {
      verdict: 'unknown',
      requiredGB: 0,
      usableGB,
      utilization: 0,
      estTokensPerSec: null,
      reason: 'Quant file size is unknown.',
    };
  }

  const params = model.params_b ?? estimateParamsFromSize(fileGB, quant.quant);
  // A param count of 0 carries no information; fall back to a flat 1 GB.
  const kv = params ? estimateKVCacheGB(params, ctx) : 1.0;
  const overhead = 0.8; // compute buffers, activations, runtime
  const requiredGB = fileGB + kv + overhead;

  const utilization = usableGB > 0 ? requiredGB / usableGB : Infinity;

  let verdict: Verdict;
  if (utilization > 1) verdict = 'wont_fit';
  else if (utilization > 0.85) verdict = 'tight';
  else verdict = 'fits';

  // Throughput estimate: bandwidth-bound.
  //   tok/s ≈ bandwidth_GBs / file_GB * efficiency
  // Apple Silicon Metal hits ~70% of peak BW for inference; CPU path ~25%.
  let estTokensPerSec: number | null = null;
  if (verdict !== 'wont_fit') {
    const eff = cfg.macbook.unifiedMemory ? 0.7 : 0.25;
    const raw = (cfg.macbook.bandwidthGBs / fileGB) * eff;
    estTokensPerSec = Math.round(raw * 10) / 10; // one decimal place
  }

  let reason: string;
  if (verdict === 'wont_fit') {
    reason = `Needs ~${requiredGB.toFixed(1)} GB but only ~${usableGB.toFixed(1)} GB usable.`;
  } else if (verdict === 'tight') {
    reason = `Tight: ~${(utilization * 100).toFixed(0)}% of usable RAM.`;
  } else {
    reason = `Comfortable: ~${(utilization * 100).toFixed(0)}% of usable RAM.`;
  }

  return { verdict, requiredGB, usableGB, utilization, estTokensPerSec, reason };
}

// Bits-per-weight by quant family (rough).
const BITS_PER_WEIGHT: Record<string, number> = {
  F32: 32,
  BF16: 16,
  F16: 16,
  Q8_0: 8.5,
  Q6_K: 6.5,
  Q5_K_M: 5.5,
  Q5_K_S: 5.5,
  Q5_0: 5.5,
  Q4_K_M: 4.8,
  Q4_K_S: 4.6,
  Q4_0: 4.5,
  Q3_K_M: 3.9,
  Q3_K_S: 3.5,
  Q2_K: 3.0,
  IQ4_XS: 4.3,
  IQ3_M: 3.7,
  IQ2_M: 2.7,
  IQ2_XS: 2.4,
};

// Back-computes the parameter count (in billions) from a quant's file size.
// Unrecognized quant labels are assumed to use 5 bits per weight.
function estimateParamsFromSize(fileGB: number, quant: string): number {
  const b = BITS_PER_WEIGHT[quant.toUpperCase()] ?? 5;
  // size_bytes ≈ params * bpw / 8
  return (fileGB * GB * 8) / (b * 1e9);
}

/** A model/quant pair chosen by `bestQuantForModel`, with its compat result. */
export interface BestPick {
  model: ModelEntry;
  quant: Quant;
  result: CompatResult;
}

/**
 * Best (largest) quant of a given model that still fits comfortably.
 *
 * Quants are tried from largest to smallest file size. The first one with a
 * `'fits'` verdict wins; if none fits comfortably, the largest `'tight'` one is
 * returned instead. `model.quants` is not mutated.
 *
 * @returns The pick, or null when every quant is `'wont_fit'` / `'unknown'`.
 */
export function bestQuantForModel(
  cfg: MacBookConfig,
  model: ModelEntry,
  ctx: number
): BestPick | null {
  const bySizeDesc = [...model.quants].sort((a, b) => b.size - a.size);
  let largestTight: BestPick | null = null;
  for (const quant of bySizeDesc) {
    const result = checkCompat(cfg, model, quant, ctx);
    if (result.verdict === 'fits') return { model, quant, result };
    if (result.verdict === 'tight' && largestTight === null) {
      largestTight = { model, quant, result };
    }
  }
  return largestTight;
}

/**
 * Builds a `llama-server` command line for the given model and quant.
 *
 * @param model   Model whose `id` is passed to `-hf`.
 * @param quant   Quant to load.
 * @param ctx     Context length, passed as `-c`.
 * @param unified True on Apple Silicon (offload all layers), false on Intel.
 */
export function llamaServerSnippet(
  model: ModelEntry,
  quant: Quant,
  ctx: number,
  unified: boolean
): string {
  // llama-server -hf supports `org/repo:Q4_K_M` shorthand for top-level files.
  // For a file in a subdir the path (subdir included) goes to --hf-file,
  // because it is resolved relative to the repo root.
  const isTopLevel = !quant.path.includes('/');
  const ngl = unified ? 99 : 0; // offload everything on Apple Silicon, none on Intel
  const ref = isTopLevel
    ? `-hf ${model.id}:${quant.quant}`
    : `-hf ${model.id} --hf-file ${quant.path}`;
  return `llama-server ${ref} -c ${ctx} -ngl ${ngl}`;
}