Spaces:

mishig
/

macbook-llm-compat

Running

App Files Files Community

macbook-llm-compat / src /lib /compat.ts

mishig HF Staff

Upload folder using huggingface_hub

8dcb261 verified about 1 month ago

raw

history blame contribute delete

5.41 kB

	// RAM-compatibility math for running a GGUF quant on a given MacBook config.
	// Estimates are heuristic — the HF API doesn't expose n_layers / n_kv_heads
	// per quant, so KV cache is approximated from param count and context length.

	import type { MacBookConfig } from './macbooks';

	/** A single quantized GGUF file offered by a model repository. */
	export interface Quant {
		/** File path inside the repo; contains `/` when the file lives in a subdirectory. */
		path: string;
		/** File size in bytes (divided by 1024^3 to obtain GB in the compatibility check). */
		size: number;
		/** File size expressed in GB — presumably precomputed from `size`; confirm against the producer. */
		sizeGB: number;
		/** Quantization label, e.g. `Q4_K_M`. */
		quant: string;
		/** Presumably true when the weights are split across several files — TODO confirm. */
		sharded: boolean;
	}

	/** Metadata for one model repository together with its available quant files. */
	export interface ModelEntry {
		/** Repository id in `org/repo` form (used verbatim in the `-hf` argument). */
		id: string;
		author: string;
		name: string;
		downloads: number;
		likes: number;
		pipeline_tag: string | null;
		/** Parameter count in billions; `null` when unknown (then estimated from file size). */
		params_b: number | null;
		arch: string | null;
		/** Presumably the model's maximum context length in tokens — TODO confirm; `null` when unknown. */
		context_length: number | null;
		tags: string[];
		/** Quantized files available for this model. */
		quants: Quant[];
	}

	/** Outcome of a compatibility check. */
	export type Verdict = 'fits' | 'tight' | 'wont_fit' | 'unknown';

	/** Result of checking one quant file against one MacBook configuration. */
	export interface CompatResult {
		verdict: Verdict;
		/** Estimated memory needed: file size + KV cache + runtime overhead, in GB. */
		requiredGB: number;
		/** Memory available to the model after the OS reserve and GPU wired cap, in GB. */
		usableGB: number;
		/** `requiredGB / usableGB`; values above 1 mean the model does not fit. */
		utilization: number; // 0-1+
		/** Bandwidth-bound throughput estimate in tokens/second; `null` when not computed. */
		estTokensPerSec: number | null;
		/** Human-readable explanation of the verdict. */
		reason: string;
	}

	/** Bytes per GB as used throughout this module (binary: 1024^3 = 2^30). */
	const GB = 2 ** 30;

	/**
	 * Rough FP16 KV-cache footprint in GB, derived from parameter count alone.
	 *
	 * Calibrated at ~0.13 GB per billion parameters for a 4096-token context and
	 * scaled linearly with the requested context length. Real numbers vary 2-3x
	 * by architecture (GQA reduces this).
	 *
	 * @param params_b Parameter count in billions.
	 * @param ctx      Context length in tokens.
	 * @returns Estimated KV cache size in GB.
	 */
	function estimateKVCacheGB(params_b: number, ctx: number): number {
		const gbPerBillionAtBaseCtx = 0.13;
		const baseCtx = 4096;
		const ctxScale = ctx / baseCtx;
		return gbPerBillionAtBaseCtx * params_b * ctxScale;
	}

	/**
	 * RAM (in GB) assumed to be held back by macOS and therefore unavailable to
	 * the model. The reserve grows in steps with installed memory:
	 * - up to 8 GB:  3 GB
	 * - up to 16 GB: 3.5 GB
	 * - up to 32 GB: 4 GB
	 * - above 32 GB: 5 GB
	 *
	 * @param totalGB Installed RAM in GB.
	 */
	function osReserveGB(totalGB: number): number {
		// [inclusive upper bound on installed RAM, reserve] in ascending order.
		const tiers: Array<[number, number]> = [
			[8, 3],
			[16, 3.5],
			[32, 4]
		];
		for (const [upTo, reserve] of tiers) {
			if (totalGB <= upTo) return reserve;
		}
		return 5;
	}

	/**
	 * Upper bound (in GB) on memory the GPU can wire for the model.
	 *
	 * On Apple Silicon, llama.cpp can wire only a fraction of RAM as VRAM by
	 * default (`iogpu.wired_limit_mb` raises this); the realistic default cap is
	 * assumed here, stepped by installed memory: 67% up to 16 GB, 75% up to
	 * 64 GB, 85% above that. Without unified memory the CPU path is used and
	 * the whole RAM counts as accessible.
	 *
	 * @param totalGB Installed RAM in GB.
	 * @param unified Whether the machine has unified memory.
	 */
	function gpuWiredCapGB(totalGB: number, unified: boolean): number {
		if (unified) {
			const fraction = totalGB <= 16 ? 0.67 : totalGB <= 64 ? 0.75 : 0.85;
			return totalGB * fraction;
		}
		// CPU path; whole RAM accessible.
		return totalGB;
	}

	/**
	 * Decide whether a quantized model file fits in memory on a MacBook config.
	 *
	 * Required memory is the file size plus a heuristic KV-cache estimate plus a
	 * fixed 0.8 GB runtime overhead. Usable memory is the smaller of
	 * (RAM - OS reserve) and the GPU wired cap, never below zero.
	 *
	 * @param cfg   MacBook configuration; reads `ram`, `macbook.unifiedMemory`
	 *              and `macbook.bandwidthGBs`.
	 * @param model Model metadata; `params_b` is used when present, otherwise the
	 *              parameter count is estimated from the file size and quant label.
	 * @param quant Quant file to evaluate (`size` in bytes).
	 * @param ctx   Context length in tokens for the KV-cache estimate.
	 * @returns Verdict with the memory figures behind it. `verdict` is
	 *          `'unknown'` (and `utilization` is NaN) when the inputs do not yield
	 *          finite numbers; `estTokensPerSec` is `null` unless the model fits
	 *          and a finite throughput can be computed.
	 */
	export function checkCompat(
		cfg: MacBookConfig,
		model: ModelEntry,
		quant: Quant,
		ctx: number = 4096
	): CompatResult {
		const fileGB = quant.size / GB;
		const params = model.params_b ?? estimateParamsFromSize(fileGB, quant.quant);
		// Unknown/zero parameter count: fall back to a flat 1 GB KV cache.
		const kv = params ? estimateKVCacheGB(params, ctx) : 1.0;
		const overhead = 0.8; // compute buffers, activations, runtime
		const requiredGB = fileGB + kv + overhead;

		const total = cfg.ram;
		const reserve = osReserveGB(total);
		const wiredCap = gpuWiredCapGB(total, cfg.macbook.unifiedMemory);
		// Clamp at zero: when RAM is below the OS reserve the budget would go
		// negative, making utilization negative and the verdict a false 'fits'.
		const usableGB = Math.max(0, Math.min(total - reserve, wiredCap));

		// NaN/Infinity inputs would otherwise fall through every comparison
		// below and be reported as 'fits'.
		const estimable = Number.isFinite(requiredGB) && Number.isFinite(usableGB);
		let utilization: number;
		if (!estimable) utilization = NaN;
		else if (usableGB > 0) utilization = requiredGB / usableGB;
		else utilization = Infinity;

		let verdict: Verdict;
		if (!estimable) verdict = 'unknown';
		else if (utilization > 1) verdict = 'wont_fit';
		else if (utilization > 0.85) verdict = 'tight';
		else verdict = 'fits';

		// Throughput estimate: bandwidth-bound.
		// tok/s ≈ bandwidth_GBs / file_GB * efficiency
		// Apple Silicon Metal hits ~70% of peak BW for inference; CPU path ~25%.
		let estTokensPerSec: number | null = null;
		if ((verdict === 'fits' || verdict === 'tight') && fileGB > 0) {
			const eff = cfg.macbook.unifiedMemory ? 0.7 : 0.25;
			const raw = (cfg.macbook.bandwidthGBs / fileGB) * eff;
			// Leave the estimate null rather than reporting Infinity/NaN tok/s.
			if (Number.isFinite(raw)) estTokensPerSec = Math.round(raw * 10) / 10;
		}

		let reason: string;
		switch (verdict) {
			case 'wont_fit':
				reason = `Needs ~${requiredGB.toFixed(1)} GB but only ~${usableGB.toFixed(1)} GB usable.`;
				break;
			case 'tight':
				reason = `Tight: ~${(utilization * 100).toFixed(0)}% of usable RAM.`;
				break;
			case 'fits':
				reason = `Comfortable: ~${(utilization * 100).toFixed(0)}% of usable RAM.`;
				break;
			default:
				reason = 'Cannot estimate: model size or RAM data is missing or invalid.';
		}

		return { verdict, requiredGB, usableGB, utilization, estTokensPerSec, reason };
	}

	/**
	 * Back out an approximate parameter count (in billions) from a quant file's
	 * size, using rough bits-per-weight figures per quant family. Labels are
	 * matched case-insensitively; unrecognised labels are treated as 5 bits per
	 * weight.
	 *
	 * @param fileGB File size in GB (1024^3 bytes).
	 * @param quant  Quantization label, e.g. `Q4_K_M`.
	 * @returns Estimated parameter count in billions.
	 */
	function estimateParamsFromSize(fileGB: number, quant: string): number {
		// Bits-per-weight by quant family (rough).
		const bitsPerWeight = new Map<string, number>([
			['F32', 32],
			['BF16', 16],
			['F16', 16],
			['Q8_0', 8.5],
			['Q6_K', 6.5],
			['Q5_K_M', 5.5],
			['Q5_K_S', 5.5],
			['Q5_0', 5.5],
			['Q4_K_M', 4.8],
			['Q4_K_S', 4.6],
			['Q4_0', 4.5],
			['Q3_K_M', 3.9],
			['Q3_K_S', 3.5],
			['Q2_K', 3.0],
			['IQ4_XS', 4.3],
			['IQ3_M', 3.7],
			['IQ2_M', 2.7],
			['IQ2_XS', 2.4]
		]);
		const bits = bitsPerWeight.get(quant.toUpperCase()) ?? 5;
		// size_bytes ≈ params * bpw / 8  =>  params ≈ size_bits / bpw
		const sizeBytes = fileGB * GB;
		const sizeBits = sizeBytes * 8;
		return sizeBits / (bits * 1e9);
	}

	/** A recommended quant for a model, paired with the compatibility result that justified it. */
	export interface BestPick {
		/** The model the quant belongs to. */
		model: ModelEntry;
		/** The selected quant file. */
		quant: Quant;
		/** Compatibility result computed for `quant`. */
		result: CompatResult;
	}

	/**
	 * Pick the best (largest) quant of a model that still fits comfortably.
	 *
	 * Quants are tried from largest file to smallest. The first one whose verdict
	 * is `'fits'` wins; if none fits comfortably, the largest `'tight'` quant is
	 * returned instead.
	 *
	 * @param cfg   MacBook configuration to check against.
	 * @param model Model whose `quants` are considered (the array is not mutated).
	 * @param ctx   Context length in tokens passed to the compatibility check.
	 * @returns The chosen quant with its result, or `null` when nothing fits.
	 */
	export function bestQuantForModel(
		cfg: MacBookConfig,
		model: ModelEntry,
		ctx: number
	): BestPick | null {
		const sorted = [...model.quants].sort((a, b) => b.size - a.size);
		// Single pass: remember the largest 'tight' quant as a fallback so each
		// quant is evaluated only once.
		let tightFallback: BestPick | null = null;
		for (const q of sorted) {
			const r = checkCompat(cfg, model, q, ctx);
			if (r.verdict === 'fits') return { model, quant: q, result: r };
			if (r.verdict === 'tight' && tightFallback === null) {
				tightFallback = { model, quant: q, result: r };
			}
		}
		return tightFallback;
	}

	/**
	 * Build a copy-pasteable `llama-server` command line for a model/quant pair.
	 *
	 * @param model   Model whose `id` (`org/repo`) is passed to `-hf`.
	 * @param quant   Quant file to load.
	 * @param ctx     Context length, emitted as `-c`.
	 * @param unified Whether the machine has unified memory; selects `-ngl`.
	 * @returns The command line as a single string.
	 */
	export function llamaServerSnippet(
		model: ModelEntry,
		quant: Quant,
		ctx: number,
		unified: boolean
	): string {
		// llama-server -hf supports `org/repo:Q4_K_M` shorthand for top-level files.
		// For a file in a subdirectory, --hf-file must carry the full repo-relative
		// path: llama.cpp resolves it against the repo root, so the bare file name
		// would not be found.
		const isTopLevel = !quant.path.includes('/');
		const ngl = unified ? 99 : 0; // offload everything on Apple Silicon, none on Intel
		const ref = isTopLevel
			? `-hf ${model.id}:${quant.quant}`
			: `-hf ${model.id} --hf-file ${quant.path}`;
		return `llama-server ${ref} -c ${ctx} -ngl ${ngl}`;
	}