Spaces:
Running
Running
// RAM-compatibility math for running a GGUF quant on a given MacBook config.
// Estimates are heuristic — the HF API doesn't expose n_layers / n_kv_heads
// per quant, so KV cache is approximated from param count and context length.
| import type { MacBookConfig } from './macbooks'; | |
/** A single GGUF quantization file offered by a model repository. */
export interface Quant {
  /**
   * File path of the quant. A path containing `/` is treated as living in a
   * sub-directory by `llamaServerSnippet`; otherwise it is a top-level file.
   */
  path: string;
  /** File size in bytes (`checkCompat` divides it by 1024^3 to get GB). */
  size: number;
  /** Presumably `size` pre-converted to GB for display — TODO confirm with the producer. */
  sizeGB: number;
  /** Quantization label such as `Q4_K_M`; used for bits-per-weight lookup and `-hf` refs. */
  quant: string;
  /** Presumably true when the quant is split across several files — TODO confirm. */
  sharded: boolean;
}
/** Catalogue record for one model repository together with its available quants. */
export interface ModelEntry {
  /** Repository reference passed to `llama-server -hf` (e.g. `org/repo`). */
  id: string;
  author: string;
  name: string;
  downloads: number;
  likes: number;
  pipeline_tag: string | null;
  /**
   * Parameter count in billions, or null when not known; `checkCompat` then
   * estimates it from the quant's file size.
   */
  params_b: number | null;
  arch: string | null;
  context_length: number | null;
  tags: string[];
  /** Quantization files available for this model. */
  quants: Quant[];
}
/**
 * Outcome of a compatibility check:
 * - `fits`     — the model runs with headroom to spare
 * - `tight`    — the model runs but uses most of the usable memory
 * - `wont_fit` — the model needs more memory than is usable
 * - `unknown`  — compatibility could not be determined
 */
export type Verdict = 'fits' | 'tight' | 'wont_fit' | 'unknown';

/** Numbers and explanation produced by `checkCompat` for one model/quant/machine. */
export interface CompatResult {
  verdict: Verdict;
  /** Estimated memory needed to run the quant, in GB. */
  requiredGB: number;
  /** Memory estimated to be available to the model on the machine, in GB. */
  usableGB: number;
  /** `requiredGB / usableGB`; values above 1 mean the model does not fit. */
  utilization: number; // 0-1+
  /** Rough decode speed in tokens/second, or null when no estimate is produced. */
  estTokensPerSec: number | null;
  /** Human-readable one-line explanation of the verdict. */
  reason: string;
}
/** Number of bytes in one binary gigabyte (2^30). */
const GB = 2 ** 30;
/**
 * Rough KV-cache footprint in GB at FP16.
 *
 * Rule of thumb: about 0.13 GB per billion parameters for a 4096-token
 * context, scaled linearly with the requested context length. Real figures
 * differ 2-3x between architectures (GQA lowers them).
 *
 * @param params_b Model size in billions of parameters.
 * @param ctx Context length in tokens.
 * @returns Estimated KV-cache size in GB.
 */
function estimateKVCacheGB(params_b: number, ctx: number): number {
  const gbPerBillionParams = 0.13;
  const referenceCtx = 4096;
  const ctxScale = ctx / referenceCtx;
  return gbPerBillionParams * params_b * ctxScale;
}
/**
 * Memory (GB) assumed to be held back by macOS and therefore unavailable to
 * the model. Tiers, by total installed RAM:
 *   - up to 8 GB:  3 GB
 *   - up to 16 GB: 3.5 GB
 *   - up to 32 GB: 4 GB
 *   - above 32 GB: 5 GB
 *
 * @param totalGB Total installed RAM in GB.
 * @returns Reserved amount in GB.
 */
function osReserveGB(totalGB: number): number {
  // [inclusive upper bound on total RAM, reserve] pairs in ascending order.
  const tiers: ReadonlyArray<readonly [number, number]> = [
    [8, 3],
    [16, 3.5],
    [32, 4]
  ];
  for (const [ceiling, reserve] of tiers) {
    if (totalGB <= ceiling) return reserve;
  }
  return 5;
}
/**
 * Upper bound (GB) on memory the model may occupy given the GPU wired-memory
 * limit. On Apple Silicon only a fraction of RAM can be wired as VRAM by
 * default (`iogpu.wired_limit_mb` raises this); the default cap is assumed:
 *   - up to 16 GB total: 67 %
 *   - up to 64 GB total: 75 %
 *   - above 64 GB:       85 %
 * Machines without unified memory take the CPU path, where all RAM counts.
 *
 * @param totalGB Total installed RAM in GB.
 * @param unified Whether the machine has unified memory.
 * @returns Cap in GB.
 */
function gpuWiredCapGB(totalGB: number, unified: boolean): number {
  if (!unified) {
    // CPU path; whole RAM accessible
    return totalGB;
  }
  const wiredFraction = totalGB <= 16 ? 0.67 : totalGB <= 64 ? 0.75 : 0.85;
  return totalGB * wiredFraction;
}
/**
 * Estimate whether `quant` of `model` can run on the machine described by `cfg`.
 *
 * Required memory = quant file size + estimated KV cache + a fixed 0.8 GB of
 * runtime overhead. Usable memory = the smaller of (RAM minus OS reserve) and
 * the GPU wired-memory cap. The verdict is derived from their ratio:
 * above 1 is `wont_fit`, above 0.85 is `tight`, otherwise `fits`.
 *
 * @param cfg Machine configuration; reads `ram`, `macbook.unifiedMemory` and
 *   `macbook.bandwidthGBs`.
 * @param model Model metadata; `params_b` is used when present, otherwise the
 *   parameter count is estimated from the file size.
 * @param quant Quant file to evaluate; `size` is in bytes.
 * @param ctx Context length in tokens (default 4096).
 * @returns Verdict plus the numbers behind it. `verdict` is `unknown` when the
 *   file size, RAM or context length is missing or not a usable number.
 */
export function checkCompat(
  cfg: MacBookConfig,
  model: ModelEntry,
  quant: Quant,
  ctx: number = 4096
): CompatResult {
  const total = cfg.ram;
  const fileGB = quant.size / GB;

  // NaN compares false against every threshold below, so bad inputs would
  // otherwise fall through to 'fits'. Report them as 'unknown' instead.
  const inputsValid =
    Number.isFinite(fileGB) &&
    fileGB > 0 &&
    Number.isFinite(total) &&
    total > 0 &&
    Number.isFinite(ctx) &&
    ctx >= 0;
  if (!inputsValid) {
    return {
      verdict: 'unknown',
      requiredGB: 0,
      usableGB: 0,
      utilization: 0,
      estTokensPerSec: null,
      reason: 'Missing or invalid file size, RAM, or context length.'
    };
  }

  const params = model.params_b ?? estimateParamsFromSize(fileGB, quant.quant);
  // Fall back to a flat 1 GB KV estimate when the parameter count is unusable.
  const kv = params > 0 ? estimateKVCacheGB(params, ctx) : 1.0;
  const overhead = 0.8; // compute buffers, activations, runtime
  const requiredGB = fileGB + kv + overhead;

  const reserve = osReserveGB(total);
  const wiredCap = gpuWiredCapGB(total, cfg.macbook.unifiedMemory);
  // Clamp at zero: RAM below the OS reserve must not produce a negative
  // utilization, which would be misread as a comfortable fit.
  const usableGB = Math.max(0, Math.min(total - reserve, wiredCap));
  const utilization = requiredGB / usableGB;

  let verdict: Verdict;
  if (utilization > 1) verdict = 'wont_fit';
  else if (utilization > 0.85) verdict = 'tight';
  else verdict = 'fits';

  // Throughput estimate: bandwidth-bound.
  //   tok/s ≈ bandwidth_GBs / file_GB * efficiency
  // Apple Silicon Metal hits ~70% of peak BW for inference; CPU path ~25%.
  let estTokensPerSec: number | null = null;
  const bandwidth = cfg.macbook.bandwidthGBs;
  if (verdict !== 'wont_fit' && Number.isFinite(bandwidth) && bandwidth > 0) {
    const eff = cfg.macbook.unifiedMemory ? 0.7 : 0.25;
    // Rounded to one decimal place.
    estTokensPerSec = Math.round((bandwidth / fileGB) * eff * 10) / 10;
  }

  let reason: string;
  if (verdict === 'wont_fit')
    reason = `Needs ~${requiredGB.toFixed(1)} GB but only ~${usableGB.toFixed(1)} GB usable.`;
  else if (verdict === 'tight')
    reason = `Tight: ~${(utilization * 100).toFixed(0)}% of usable RAM.`;
  else
    reason = `Comfortable: ~${(utilization * 100).toFixed(0)}% of usable RAM.`;

  return { verdict, requiredGB, usableGB, utilization, estTokensPerSec, reason };
}
/**
 * Approximate bits-per-weight by GGUF quant label (upper-case keys).
 * Built once at module load rather than on every call.
 */
const QUANT_BITS_PER_WEIGHT: Readonly<Record<string, number>> = {
  F32: 32,
  BF16: 16,
  F16: 16,
  Q8_0: 8.5,
  Q6_K: 6.5,
  Q5_K_M: 5.5,
  Q5_K_S: 5.5,
  Q5_1: 6.0,
  Q5_0: 5.5,
  Q4_K_M: 4.8,
  Q4_K_S: 4.6,
  Q4_1: 5.0,
  Q4_0: 4.5,
  Q3_K_L: 4.3,
  Q3_K_M: 3.9,
  Q3_K_S: 3.5,
  Q2_K: 3.0,
  IQ4_NL: 4.5,
  IQ4_XS: 4.3,
  IQ3_M: 3.7,
  IQ3_S: 3.44,
  IQ3_XS: 3.3,
  IQ3_XXS: 3.06,
  IQ2_M: 2.7,
  IQ2_S: 2.5,
  IQ2_XS: 2.4,
  IQ2_XXS: 2.06,
  IQ1_M: 1.75,
  IQ1_S: 1.56
};

/** Bits-per-weight assumed when the quant label gives no hint at all. */
const DEFAULT_BITS_PER_WEIGHT = 5;

/**
 * Estimate a model's parameter count (in billions) from the size of one of its
 * quantized files, using size_bytes ≈ params * bits_per_weight / 8.
 *
 * The quant label is matched case-insensitively against a table of known
 * formats. Labels not in the table but shaped like `Q<n>…` / `IQ<n>…` are
 * assumed to cost n + 0.5 bits per weight (n-bit values plus per-block
 * scales); anything else falls back to 5 bits.
 *
 * @param fileGB File size in GB (1024^3 bytes).
 * @param quant Quantization label, e.g. `Q4_K_M`.
 * @returns Estimated parameter count in billions.
 */
function estimateParamsFromSize(fileGB: number, quant: string): number {
  const label = quant.toUpperCase();
  let bits: number | undefined = QUANT_BITS_PER_WEIGHT[label];
  if (bits === undefined) {
    // Unlisted variant (e.g. Q4_K_L): infer the width from the digit in the label.
    const widthDigit = /^I?Q(\d)/.exec(label)?.[1];
    bits = widthDigit !== undefined ? Number(widthDigit) + 0.5 : DEFAULT_BITS_PER_WEIGHT;
  }
  // size_bytes ≈ params * bpw / 8
  return (fileGB * GB * 8) / (bits * 1e9);
}
/** The quant selected for a model together with the compatibility result that justified it. */
export interface BestPick {
  /** Model the pick belongs to. */
  model: ModelEntry;
  /** Chosen quant of that model. */
  quant: Quant;
  /** Compatibility result computed for `quant`. */
  result: CompatResult;
}
/**
 * Pick the best quant of `model` for the given machine.
 *
 * Quants are examined from largest file to smallest. The largest one whose
 * verdict is `fits` wins; if none fits comfortably, the largest `tight` one
 * is returned instead.
 *
 * @param cfg Machine configuration.
 * @param model Model whose quants are considered; the array is not mutated.
 * @param ctx Context length in tokens.
 * @returns The chosen quant with its compatibility result, or null when no
 *   quant is `fits` or `tight`.
 */
export function bestQuantForModel(
  cfg: MacBookConfig,
  model: ModelEntry,
  ctx: number
): BestPick | null {
  const largestFirst = [...model.quants].sort((a, b) => b.size - a.size);

  // Single pass: remember the first (largest) 'tight' candidate while looking
  // for a 'fits' one, so each quant is evaluated only once.
  let tightPick: BestPick | null = null;
  for (const quant of largestFirst) {
    const result = checkCompat(cfg, model, quant, ctx);
    if (result.verdict === 'fits') return { model, quant, result };
    if (result.verdict === 'tight' && tightPick === null) {
      tightPick = { model, quant, result };
    }
  }
  return tightPick;
}
/**
 * Build a copy-pasteable `llama-server` command line for a model quant.
 *
 * Top-level files use the `-hf org/repo:QUANT` shorthand. Files inside a
 * sub-directory use `-hf org/repo --hf-file <path>` with the full path within
 * the repository, since the directory part is needed to locate the file.
 *
 * Values are single-quoted for the shell only when they contain characters
 * outside a conservative safe set, so ordinary repo ids and file names are
 * emitted unchanged.
 *
 * @param model Model whose `id` is the repository reference.
 * @param quant Quant to run; reads `path` and `quant`.
 * @param ctx Context length passed as `-c`.
 * @param unified True to offload all layers (`-ngl 99`), false for none (`-ngl 0`).
 * @returns The command line as a single string.
 */
export function llamaServerSnippet(
  model: ModelEntry,
  quant: Quant,
  ctx: number,
  unified: boolean
): string {
  // File names come from user-uploaded repos; quote anything a shell could interpret.
  const shellArg = (value: string): string =>
    /^[A-Za-z0-9_@%+=:,./-]+$/.test(value)
      ? value
      : `'${value.replace(/'/g, "'\\''")}'`;

  const inSubdir = quant.path.includes('/');
  const ngl = unified ? 99 : 0; // offload everything on Apple Silicon, none on Intel
  const ref = inSubdir
    ? `-hf ${shellArg(model.id)} --hf-file ${shellArg(quant.path)}`
    : `-hf ${shellArg(`${model.id}:${quant.quant}`)}`;
  return `llama-server ${ref} -c ${ctx} -ngl ${ngl}`;
}