macbook-llm-compat / src /lib /compat.ts
mishig's picture
mishig HF Staff
Upload folder using huggingface_hub
8dcb261 verified
// RAM-compatibility math for running a GGUF quant on a given MacBook config.
// Estimates are heuristic — the HF API doesn't expose n_layers / n_kv_heads
// per quant, so KV cache is approximated from param count and context length.
import type { MacBookConfig } from './macbooks';
/** A single GGUF file (one quantization) offered for a model. */
export interface Quant {
/** File path; a path containing '/' is treated as living in a subdirectory. */
path: string;
/** File size; checkCompat divides it by 1024^3 to obtain GB, so presumably bytes. */
size: number;
/** Size in GB. Not read in this file; presumably precomputed from `size` (TODO confirm). */
sizeGB: number;
/** Quantization label such as 'Q4_K_M'; upper-cased before the bits-per-weight lookup. */
quant: string;
/** Not read in this file; presumably true when the quant is split across several files (TODO confirm). */
sharded: boolean;
}
/**
 * Metadata for one model together with the GGUF quants available for it.
 *
 * Only `id`, `params_b` and `quants` are read in this file. The remaining
 * fields are presumably carried along for display or filtering elsewhere
 * (TODO confirm against the data producer).
 */
export interface ModelEntry {
/** Repo identifier handed to `llama-server -hf` (form `org/repo`). */
id: string;
author: string;
name: string;
downloads: number;
likes: number;
pipeline_tag: string | null;
/** Parameter count in billions; when null, checkCompat estimates it from the file size. */
params_b: number | null;
arch: string | null;
context_length: number | null;
tags: string[];
/** Quant files available for this model. */
quants: Quant[];
}
/**
 * Outcome of a compatibility check, driven by utilization
 * (required memory / usable memory):
 *   - 'fits'     : utilization of at most 85%
 *   - 'tight'    : utilization above 85%, up to 100%
 *   - 'wont_fit' : utilization above 100%
 *   - 'unknown'  : presumably for cases where no estimate can be made (TODO confirm)
 */
export type Verdict = 'fits' | 'tight' | 'wont_fit' | 'unknown';
/** Result of checking one quant against one MacBook configuration. */
export interface CompatResult {
/** Overall classification of the fit. */
verdict: Verdict;
/** Estimated memory needed: file size + KV cache estimate + fixed runtime overhead. */
requiredGB: number;
/** Memory budget: the smaller of (RAM - OS reserve) and the GPU wired cap. */
usableGB: number;
/** requiredGB / usableGB as a fraction; values above 1 mean the model does not fit. */
utilization: number; // 0-1+
/** Bandwidth-bound throughput estimate, rounded to one decimal; null when no estimate is produced (e.g. the model won't fit). */
estTokensPerSec: number | null;
/** Human-readable one-sentence explanation of the verdict. */
reason: string;
}
/** Bytes per gibibyte (1024^3); used to convert between byte counts and the "GB" figures in this file. */
const GB = 1024 ** 3;
/**
 * Rough KV-cache footprint in GB, assuming FP16 entries.
 *
 * Rule of thumb: about 0.13 GB per billion parameters for every 4096 tokens
 * of context, scaled linearly in both. Real architectures differ by 2-3x
 * (grouped-query attention lowers the figure), so treat the result as an
 * order-of-magnitude estimate only.
 *
 * @param params_b Parameter count in billions.
 * @param ctx Context length in tokens.
 * @returns Estimated KV-cache size in GB.
 */
function estimateKVCacheGB(params_b: number, ctx: number): number {
  const gbPerBillionParamsPer4k = 0.13;
  const contextWindows = ctx / 4096;
  return gbPerBillionParamsPer4k * params_b * contextWindows;
}
/**
 * Approximate amount of RAM (GB) that macOS keeps for itself, tiered by the
 * machine's installed memory:
 *   - up to 8 GB   -> 3 GB
 *   - up to 16 GB  -> 3.5 GB
 *   - up to 32 GB  -> 4 GB
 *   - above 32 GB  -> 5 GB
 *
 * @param totalGB Installed RAM in GB.
 * @returns Estimated system reserve in GB.
 */
function osReserveGB(totalGB: number): number {
  // [inclusive upper bound on installed RAM, reserve], in ascending order.
  const tiers: ReadonlyArray<readonly [number, number]> = [
    [8, 3],
    [16, 3.5],
    [32, 4],
  ];
  for (const [maxTotalGB, reservedGB] of tiers) {
    if (totalGB <= maxTotalGB) return reservedGB;
  }
  return 5;
}
/**
 * Upper bound (GB) on the memory the GPU can wire for the model.
 *
 * On Apple Silicon (unified memory) only a fraction of RAM can be wired by
 * default; `iogpu.wired_limit_mb` can raise it, but the default-like caps are
 * assumed here:
 *   - up to 16 GB  -> 67% of RAM
 *   - up to 64 GB  -> 75% of RAM
 *   - above 64 GB  -> 85% of RAM
 * Without unified memory the model runs on the CPU path, so all of RAM counts.
 *
 * @param totalGB Installed RAM in GB.
 * @param unified Whether the machine has unified memory.
 * @returns Wired-memory cap in GB.
 */
function gpuWiredCapGB(totalGB: number, unified: boolean): number {
  if (unified) {
    const wiredFraction = totalGB <= 16 ? 0.67 : totalGB <= 64 ? 0.75 : 0.85;
    return totalGB * wiredFraction;
  }
  // CPU path: nothing is wired to a GPU, the whole RAM is addressable.
  return totalGB;
}
/**
 * Estimates whether a GGUF quant can run on a given MacBook configuration.
 *
 * Required memory is the file size plus a heuristic KV-cache estimate plus a
 * fixed 0.8 GB for compute buffers, activations and the runtime. Usable memory
 * is the smaller of (RAM minus the OS reserve) and the GPU wired cap, floored
 * at zero. The verdict follows from their ratio:
 *   - above 100%  -> 'wont_fit'
 *   - above 85%   -> 'tight'
 *   - otherwise   -> 'fits'
 * When the quant's size is missing, zero, or the requirement is not a finite
 * number, the verdict is 'unknown' instead of a misleading 'fits'.
 *
 * @param cfg MacBook configuration; reads `ram`, `macbook.unifiedMemory` and
 *   `macbook.bandwidthGBs`.
 * @param model Model the quant belongs to; `params_b` feeds the KV-cache
 *   estimate and is derived from the file size when null.
 * @param quant Quant file to evaluate; `size` is divided by 1024^3 to get GB.
 * @param ctx Context length in tokens (default 4096).
 * @returns Verdict, memory figures, a bandwidth-bound tokens/sec estimate
 *   (null for 'wont_fit' and 'unknown') and a human-readable reason.
 */
export function checkCompat(
  cfg: MacBookConfig,
  model: ModelEntry,
  quant: Quant,
  ctx: number = 4096
): CompatResult {
  const total = cfg.ram;
  const reserve = osReserveGB(total);
  const wiredCap = gpuWiredCapGB(total, cfg.macbook.unifiedMemory);
  // Floor the budget at zero (this also maps NaN to 0). A negative budget
  // would produce a negative utilization, which the thresholds below would
  // classify as 'fits'.
  const budgetGB = Math.min(total - reserve, wiredCap);
  const usableGB = budgetGB > 0 ? budgetGB : 0;

  const fileGB = quant.size / GB;
  const params = model.params_b ?? estimateParamsFromSize(fileGB, quant.quant);
  // Without a parameter count fall back to a flat 1 GB KV-cache guess.
  const kv = params ? estimateKVCacheGB(params, ctx) : 1.0;
  const overhead = 0.8; // compute buffers, activations, runtime
  const requiredGB = fileGB + kv + overhead;

  // A missing/zero/NaN size (or a non-finite requirement) cannot be judged;
  // it would also make the throughput estimate divide by zero.
  if (!(fileGB > 0) || !Number.isFinite(requiredGB)) {
    return {
      verdict: 'unknown',
      requiredGB: 0,
      usableGB,
      utilization: 0,
      estTokensPerSec: null,
      reason: 'Unknown: file size is missing or invalid.'
    };
  }

  // With no usable memory at all nothing fits, whatever the requirement.
  const utilization = usableGB > 0 ? requiredGB / usableGB : Infinity;

  let verdict: Verdict;
  if (utilization > 1) verdict = 'wont_fit';
  else if (utilization > 0.85) verdict = 'tight';
  else verdict = 'fits';

  // Throughput estimate: bandwidth-bound.
  //   tok/s ≈ bandwidth_GBs / file_GB * efficiency
  // Apple Silicon Metal hits ~70% of peak BW for inference; CPU path ~25%.
  let estTokensPerSec: number | null = null;
  if (verdict !== 'wont_fit') {
    const eff = cfg.macbook.unifiedMemory ? 0.7 : 0.25;
    const raw = (cfg.macbook.bandwidthGBs / fileGB) * eff;
    estTokensPerSec = Math.round(raw * 10) / 10; // one decimal place
  }

  let reason: string;
  if (verdict === 'wont_fit') {
    reason = `Needs ~${requiredGB.toFixed(1)} GB but only ~${usableGB.toFixed(1)} GB usable.`;
  } else if (verdict === 'tight') {
    reason = `Tight: ~${(utilization * 100).toFixed(0)}% of usable RAM.`;
  } else {
    reason = `Comfortable: ~${(utilization * 100).toFixed(0)}% of usable RAM.`;
  }

  return { verdict, requiredGB, usableGB, utilization, estTokensPerSec, reason };
}
/**
 * Back-computes a model's parameter count (in billions) from the size of a
 * quantized file, using a rough bits-per-weight figure for the quant type.
 *
 * The quant label is upper-cased before lookup; labels that are not in the
 * table are assumed to use 5 bits per weight.
 *
 * @param fileGB File size in GB (as produced by dividing bytes by `GB`).
 * @param quant Quantization label, e.g. 'Q4_K_M'.
 * @returns Estimated parameter count in billions.
 */
function estimateParamsFromSize(fileGB: number, quant: string): number {
  // Approximate bits per weight for each quant family.
  const bitsPerWeightByQuant = new Map<string, number>([
    ['F32', 32],
    ['BF16', 16],
    ['F16', 16],
    ['Q8_0', 8.5],
    ['Q6_K', 6.5],
    ['Q5_K_M', 5.5],
    ['Q5_K_S', 5.5],
    ['Q5_0', 5.5],
    ['Q4_K_M', 4.8],
    ['Q4_K_S', 4.6],
    ['Q4_0', 4.5],
    ['Q3_K_M', 3.9],
    ['Q3_K_S', 3.5],
    ['Q2_K', 3.0],
    ['IQ4_XS', 4.3],
    ['IQ3_M', 3.7],
    ['IQ2_M', 2.7],
    ['IQ2_XS', 2.4],
  ]);
  const bitsPerWeight = bitsPerWeightByQuant.get(quant.toUpperCase()) ?? 5;
  // size_bytes ≈ params * bpw / 8  =>  params ≈ size_bits / bpw
  const totalBits = fileGB * GB * 8;
  return totalBits / (bitsPerWeight * 1e9);
}
/** The quant chosen for a model, bundled with the compatibility result that justified it. */
export interface BestPick {
/** Model the pick belongs to. */
model: ModelEntry;
/** The selected quant file. */
quant: Quant;
/** Compatibility result computed for `quant`. */
result: CompatResult;
}
/**
 * Picks the best quant of a model for a given MacBook configuration.
 *
 * Quants are tried from largest file to smallest. The largest one whose
 * verdict is 'fits' wins; if none fits comfortably, the largest 'tight' one is
 * returned instead. Each quant is evaluated once.
 *
 * @param cfg MacBook configuration to check against.
 * @param model Model whose quants are considered; `model.quants` is not mutated.
 * @param ctx Context length in tokens used for the memory estimate.
 * @returns The chosen quant with its compatibility result, or null when no
 *   quant is 'fits' or 'tight'.
 */
export function bestQuantForModel(
  cfg: MacBookConfig,
  model: ModelEntry,
  ctx: number
): BestPick | null {
  // Copy before sorting so the caller's array keeps its order.
  const largestFirst = [...model.quants].sort((a, b) => b.size - a.size);
  let largestTight: BestPick | null = null;
  for (const quant of largestFirst) {
    const result = checkCompat(cfg, model, quant, ctx);
    if (result.verdict === 'fits') return { model, quant, result };
    // Remember only the first (largest) tight candidate as the fallback.
    if (result.verdict === 'tight' && largestTight === null) {
      largestTight = { model, quant, result };
    }
  }
  return largestTight;
}
/**
 * Builds a `llama-server` command line that downloads and serves a quant
 * straight from its Hugging Face repo.
 *
 * Top-level files use the `-hf org/repo:QUANT` shorthand. Files inside a
 * subdirectory are referenced with `--hf-file`, which takes the file's path
 * relative to the repo root, so the subdirectory is kept in the argument.
 *
 * @param model Model whose `id` is the Hugging Face repo.
 * @param quant Quant to serve; `path` locates the file, `quant` is the label.
 * @param ctx Context length passed as `-c`.
 * @param unified True on unified-memory (Apple Silicon) machines, where all
 *   layers are offloaded to the GPU; false disables offloading.
 * @returns The shell command as a single string.
 */
export function llamaServerSnippet(
  model: ModelEntry,
  quant: Quant,
  ctx: number,
  unified: boolean
): string {
  const isTopLevel = !quant.path.includes('/');
  const ngl = unified ? 99 : 0; // offload everything on Apple Silicon, none on Intel
  const ref = isTopLevel
    ? `-hf ${model.id}:${quant.quant}`
    : `-hf ${model.id} --hf-file ${quant.path}`;
  return `llama-server ${ref} -c ${ctx} -ngl ${ngl}`;
}