/**
 * Downloads the HF dataset snapshot to local cache during build.
 * This avoids hitting HF rate limits at runtime and during cache refresh.
 *
 * Phase 1: Clone the dataset snapshot once via Git
 * Phase 2: Copy index files (model-cards, eval-list, developers, benchmark-metadata, peer-ranks)
 * Phase 3: Copy detail directories (developers/*.json, evals/*.json, models/*.json)
 *
 * Run with: node scripts/cache-hf-data.mjs
 */
import { execFile } from "child_process"
import fs from "fs/promises"
import os from "os"
import path from "path"
import { fileURLToPath } from "url"
import { promisify } from "util"

// Repo root is two directory levels above this script (scripts/cache-hf-data.mjs).
// fileURLToPath handles Windows drive letters and percent-encoded characters,
// which `new URL(import.meta.url).pathname` does not.
const root = path.resolve(fileURLToPath(import.meta.url), "..", "..")
const cacheDir = path.join(root, ".cache", "hf-data")
const publicDir = path.join(root, "public")

// The v2/stage-j backend reads SNAPSHOT_URL at runtime, so the legacy HF cache
// is not needed; create the (empty) cache dir and bail out early.
const dataBackend = process.env.DATA_BACKEND?.trim().toLowerCase()
if (dataBackend === "v2" || dataBackend === "stage-j") {
  await fs.mkdir(cacheDir, { recursive: true })
  console.log("[cache-hf-data] DATA_BACKEND=v2: skipping legacy HF cache; runtime reads SNAPSHOT_URL")
  process.exit(0)
}

// `||` (not `??`) is intentional: an empty HF_DATASET_REPO falls back to the default.
const HF_DATASET_REPO = process.env.HF_DATASET_REPO?.trim() || "https://huggingface.co/datasets/evaleval/card_backend"
const HF_RESOLVE_BASE = `${HF_DATASET_REPO}/resolve/main`

const execFileAsync = promisify(execFile)

// Lean DuckDB mode: when DATA_BACKEND=duckdb, the runtime reads model/eval/
// developer/summary data exclusively from `duckdb/v1/*.parquet` (see
// lib/duckdb-data.ts:48,82). The legacy JSON-fallback artifacts
// (model-cards*, eval-list*, developers*, plus the per-slug detail
// directories developers/, evals/, models/) become dead weight — skipping
// them avoids OOMing the HF Space on a 2.8 GB JSON snapshot.
//
// Always keep:
// - manifest.json (lib/data-backend.ts:96 → /api/backend-manifest)
// - eval-hierarchy.json (lib/data-backend.ts:98 → /api/eval-hierarchy)
// - benchmark-metadata.json (lib/benchmark-metadata.ts:5 → /api/benchmark-metadata)
// - comparison-index.json (app/api/comparison-index/route.ts:7 → app/models/[id]/page.tsx:157)
// - corpus-aggregates.json (app/corpus/page.tsx via lib/hf-data:fetchCorpusAggregates)
// - duckdb/v1/*.parquet (lib/duckdb-data.ts read path)
// - public/peer-ranks.json (always written outside cacheDir; component fetches HF directly)
const isDuckDBLean = process.env.DATA_BACKEND?.trim().toLowerCase() === "duckdb"

const ESSENTIAL_CACHE_ROOT_FILES = [
  "manifest.json",
  "benchmark-metadata.json",
  "eval-hierarchy.json",
  "comparison-index.json",
  "corpus-aggregates.json",
]

const JSON_FALLBACK_CACHE_ROOT_FILES = [
  "model-cards.json",
  "model-cards-lite.json",
  "eval-list.json",
  "eval-list-lite.json",
  "developers.json",
]

const CACHE_ROOT_FILES = isDuckDBLean
  ? ESSENTIAL_CACHE_ROOT_FILES
  : [...ESSENTIAL_CACHE_ROOT_FILES, ...JSON_FALLBACK_CACHE_ROOT_FILES]

// Root files that may legitimately be missing from the snapshot (not published
// yet); an ENOENT for these is logged and skipped rather than fatal.
const OPTIONAL_CACHE_ROOT_FILES = new Set([
  "model-cards-lite.json",
  "eval-list-lite.json",
  "corpus-aggregates.json",
])

const ESSENTIAL_CACHE_DIRECTORIES = ["duckdb"]
const JSON_FALLBACK_CACHE_DIRECTORIES = ["developers", "evals", "models"]
const CACHE_DIRECTORIES = isDuckDBLean
  ? ESSENTIAL_CACHE_DIRECTORIES
  : [...JSON_FALLBACK_CACHE_DIRECTORIES, ...ESSENTIAL_CACHE_DIRECTORIES]

// Casing overrides for handle tokens that should not get plain Title Case.
const TOKEN_CASE_MAP = {
  ai: "AI",
  claude: "Claude",
  fc: "FC",
  gemini: "Gemini",
  gemma: "Gemma",
  gpt: "GPT",
  haiku: "Haiku",
  mini: "Mini",
  opus: "Opus",
  preview: "Preview",
  pro: "Pro",
  prompt: "Prompt",
  reasoning: "Reasoning",
  sonnet: "Sonnet",
  thinking: "Thinking",
}

/** Runs a git command in `cwd`, buffering up to 32 MiB of output. */
async function runGit(args, cwd) {
  await execFileAsync("git", args, {
    cwd,
    maxBuffer: 1024 * 1024 * 32,
  })
}

/** Shallow-clones (depth 1, single branch) the HF dataset repo into targetDir. */
async function cloneDatasetSnapshot(targetDir) {
  await runGit(["clone", "--depth", "1", "--single-branch", HF_DATASET_REPO, targetDir], root)
}

/** Removes dirPath (if present) and recreates it empty. */
async function ensureCleanDirectory(dirPath) {
  await fs.rm(dirPath, { recursive: true, force: true })
  await fs.mkdir(dirPath, { recursive: true })
}

/** Copies a file (creating parent directories) and returns its size in bytes. */
async function copyFile(sourcePath, destinationPath) {
  await fs.mkdir(path.dirname(destinationPath), { recursive: true })
  await fs.copyFile(sourcePath, destinationPath)
  const stat = await fs.stat(destinationPath)
  return stat.size
}

// A Git LFS pointer file begins with this exact first line (LFS spec v1).
const GIT_LFS_POINTER_PREFIX = "version https://git-lfs.github.com/spec/v1\n"

/** Returns true when `contents` starts with the Git LFS pointer signature. */
function isGitLfsPointer(contents) {
  return contents.startsWith(GIT_LFS_POINTER_PREFIX)
}

/**
 * Reads at most `byteLength` bytes from the start of a file and decodes them
 * as UTF-8. Used so LFS-pointer detection never buffers a multi-GB blob.
 */
async function readFilePrefix(filePath, byteLength) {
  const handle = await fs.open(filePath, "r")
  try {
    const buffer = Buffer.alloc(byteLength)
    const { bytesRead } = await handle.read(buffer, 0, byteLength, 0)
    return buffer.subarray(0, bytesRead).toString("utf8")
  } finally {
    await handle.close()
  }
}

/**
 * Downloads `relativePath` from the HF resolve endpoint into destinationPath,
 * sending a bearer token when HF_TOKEN is set. Returns the byte length written.
 * @throws {Error} when the HTTP response is not ok.
 */
async function writeRemoteFile(relativePath, destinationPath) {
  const headers = {}
  const hfToken = process.env.HF_TOKEN?.trim()
  if (hfToken) {
    headers.Authorization = `Bearer ${hfToken}`
  }
  const response = await fetch(`${HF_RESOLVE_BASE}/${relativePath}`, { headers })
  if (!response.ok) {
    throw new Error(`Failed to download ${relativePath}: ${response.status} ${response.statusText}`)
  }
  const buffer = Buffer.from(await response.arrayBuffer())
  await fs.mkdir(path.dirname(destinationPath), { recursive: true })
  await fs.writeFile(destinationPath, buffer)
  return buffer.length
}

/**
 * Copies one snapshot file to destinationPath. If the on-disk file is a Git
 * LFS pointer, the real payload is fetched from the HF resolve endpoint.
 * Returns { size, source } where source is "snapshot" or "remote".
 */
async function copySnapshotFile(snapshotRoot, relativePath, destinationPath) {
  const sourcePath = path.join(snapshotRoot, relativePath)
  // Only the first few bytes are needed for pointer detection; reading the
  // whole file as one utf8 string throws ERR_STRING_TOO_LONG on huge blobs.
  const prefix = await readFilePrefix(sourcePath, GIT_LFS_POINTER_PREFIX.length)
  if (isGitLfsPointer(prefix)) {
    const size = await writeRemoteFile(relativePath, destinationPath)
    return { size, source: "remote" }
  }
  const size = await copyFile(sourcePath, destinationPath)
  return { size, source: "snapshot" }
}

/**
 * Recursively copies a snapshot directory, resolving LFS pointers per file.
 * Returns { fileCount, remoteCount } totals for the whole subtree.
 */
async function copySnapshotDirectory(snapshotRoot, relativeDir, destinationDir) {
  const sourceDir = path.join(snapshotRoot, relativeDir)
  const entries = await fs.readdir(sourceDir, { withFileTypes: true })
  let fileCount = 0
  let remoteCount = 0
  for (const entry of entries) {
    // posix join: the relative path doubles as a URL path segment for LFS fetches.
    const nestedRelativePath = path.posix.join(relativeDir, entry.name)
    const destinationPath = path.join(destinationDir, entry.name)
    if (entry.isDirectory()) {
      await ensureCleanDirectory(destinationPath)
      const nested = await copySnapshotDirectory(snapshotRoot, nestedRelativePath, destinationPath)
      fileCount += nested.fileCount
      remoteCount += nested.remoteCount
      continue
    }
    const result = await copySnapshotFile(snapshotRoot, nestedRelativePath, destinationPath)
    fileCount += 1
    remoteCount += result.source === "remote" ? 1 : 0
  }
  return { fileCount, remoteCount }
}

/**
 * Normalizes a raw model handle: lowercases, converts separators to hyphens,
 * rejoins hyphen-split version digits with dots (e.g. "4-1" → "4.1"), and
 * trims leading/trailing/duplicate hyphens.
 */
function normalizeHandle(rawHandle) {
  return rawHandle
    .trim()
    .toLowerCase()
    .replace(/[_\s/]+/g, "-")
    .replace(/(\d)-(?=\d(?:-|$))/g, "$1.")
    .replace(/-+/g, "-")
    .replace(/^-|-$/g, "")
}

/** Formats a YYYYMMDD token as YYYY-MM-DD; anything else passes through. */
function formatVersionDate(dateToken) {
  if (!/^(?:19|20)\d{6}$/.test(dateToken)) {
    return dateToken
  }
  return `${dateToken.slice(0, 4)}-${dateToken.slice(4, 6)}-${dateToken.slice(6, 8)}`
}

/**
 * Title-cases a single handle token, honoring TOKEN_CASE_MAP overrides,
 * leaving numbers alone, and upper-casing size suffixes (e.g. "7b" → "7B").
 */
function titleCaseToken(token) {
  if (!token) {
    return token
  }
  if (TOKEN_CASE_MAP[token]) {
    return TOKEN_CASE_MAP[token]
  }
  if (/^\d+(\.\d+)?$/.test(token)) {
    return token
  }
  if (/^\d+(\.\d+)?[bkmt]$/i.test(token)) {
    return `${token.slice(0, -1)}${token.slice(-1).toUpperCase()}`
  }
  return token.charAt(0).toUpperCase() + token.slice(1)
}

/** Converts a hyphenated handle to a display name, formatting date tokens. */
function humanizeHandle(handle) {
  return handle
    .split("-")
    .filter(Boolean)
    .map((token) => (/^(?:19|20)\d{6}$/.test(token) ? formatVersionDate(token) : titleCaseToken(token)))
    .join(" ")
}

/**
 * Derives the canonical family id/name from a raw "namespace/handle" id by
 * stripping a trailing YYYYMMDD (and anything after it) from the handle.
 */
function getCanonicalFamilyInfo(modelFamilyId) {
  const [namespace = "unknown", rawHandle = ""] = String(modelFamilyId).split("/")
  const normalizedHandle = normalizeHandle(rawHandle)
  const match = normalizedHandle.match(/^(.*?)-((?:19|20)\d{6})(?:-(.+))?$/)
  const familySlug = match ? match[1] : normalizedHandle
  return {
    familyId: `${namespace}/${familySlug}`,
    familyName: humanizeHandle(familySlug),
  }
}

/** Lowercases and hyphenates a variant qualifier for alias comparison. */
function normalizeSetupAliasQualifier(value) {
  return String(value ?? "").trim().toLowerCase().replace(/[_\s]+/g, "-")
}

/**
 * True when a qualifier denotes a harness setup alias (prompt / function
 * calling / thinking) rather than a distinct model variant.
 */
function isSetupAliasQualifier(value) {
  const normalized = normalizeSetupAliasQualifier(value)
  return (
    normalized === "prompt" ||
    normalized === "fc" ||
    normalized === "function-calling" ||
    normalized.startsWith("thinking")
  )
}

/**
 * Computes the normalized { variantKey, variantLabel } for a variant:
 * - "default"/"base" collapse to the Default variant;
 * - a date + setup-alias qualifier collapses to the bare date (aliases merge);
 * - a date + other qualifier keeps the key, labeling as "date · qualifier";
 * - anything unmatched passes through unchanged.
 */
function getNormalizedVariantMeta(familyId, variantKey, variantLabel) {
  if (variantKey === "default" || variantKey === "base") {
    return {
      variantKey: "default",
      variantLabel: "Default",
    }
  }
  const familyHandle = familyId.split("/")[1] ?? ""
  const compositeHandle = normalizeHandle(`${familyHandle}-${variantKey}`)
  const match = compositeHandle.match(/^(.*?)-((?:19|20)\d{6})(?:-(.+))?$/)
  if (!match) {
    return {
      variantKey,
      variantLabel,
    }
  }
  const [, , dateToken, qualifier] = match
  if (qualifier && isSetupAliasQualifier(humanizeHandle(qualifier))) {
    const formattedDate = formatVersionDate(dateToken)
    return {
      variantKey: formattedDate,
      variantLabel: formattedDate,
    }
  }
  const formattedDate = formatVersionDate(dateToken)
  return {
    variantKey,
    variantLabel: qualifier ? `${formattedDate} · ${humanizeHandle(qualifier)}` : formattedDate,
  }
}

/**
 * Normalizes one model-card entry: canonicalizes the family id/name, merges
 * variants that normalize to the same key (summing evaluation counts, keeping
 * the newest last_updated, unioning raw ids), and sorts variants with Default
 * first, then newest-first, then by label.
 */
function normalizeModelCardEntry(entry) {
  const familyInfo = getCanonicalFamilyInfo(entry.model_family_id)
  const variantsByKey = new Map()
  for (const variant of entry.variants ?? []) {
    const meta = getNormalizedVariantMeta(familyInfo.familyId, variant.variant_key, variant.variant_label)
    const existing = variantsByKey.get(meta.variantKey)
    if (existing) {
      // Nullish guards: a missing evaluation_count must not poison the sum with NaN.
      existing.evaluation_count = (existing.evaluation_count ?? 0) + (variant.evaluation_count ?? 0)
      existing.last_updated =
        !existing.last_updated ||
        (variant.last_updated &&
          new Date(variant.last_updated).getTime() > new Date(existing.last_updated).getTime())
          ? variant.last_updated
          : existing.last_updated
      existing.raw_model_ids = Array.from(
        new Set([...(existing.raw_model_ids ?? []), ...(variant.raw_model_ids ?? [])])
      ).sort((a, b) => a.localeCompare(b))
      continue
    }
    variantsByKey.set(meta.variantKey, {
      ...variant,
      variant_key: meta.variantKey,
      variant_label: meta.variantLabel,
      raw_model_ids: [...(variant.raw_model_ids ?? [])].sort((a, b) => a.localeCompare(b)),
    })
  }
  const variants = Array.from(variantsByKey.values()).sort((a, b) => {
    const aIsDefault = a.variant_key === "default"
    const bIsDefault = b.variant_key === "default"
    if (aIsDefault !== bIsDefault) {
      return aIsDefault ? -1 : 1
    }
    const aTime = a.last_updated ? new Date(a.last_updated).getTime() : Number.NEGATIVE_INFINITY
    const bTime = b.last_updated ? new Date(b.last_updated).getTime() : Number.NEGATIVE_INFINITY
    if (aTime !== bTime) {
      return bTime - aTime
    }
    return a.variant_label.localeCompare(b.variant_label)
  })
  return {
    ...entry,
    model_family_id: familyInfo.familyId,
    model_route_id: familyInfo.familyId.replace(/\//g, "__"),
    model_family_name: familyInfo.familyName,
    total_evaluations:
      variants.length > 0
        ? variants.reduce((sum, variant) => sum + (variant.evaluation_count ?? 0), 0)
        : entry.total_evaluations,
    variants,
  }
}

/**
 * Rewrites a cached model-card JSON file in normalized form. Best-effort:
 * parse/IO failures are logged as warnings, never fatal (the raw cache file
 * is still usable).
 */
async function normalizeCachedModelCardFile(filePath) {
  try {
    const text = await fs.readFile(filePath, "utf8")
    const data = JSON.parse(text)
    if (!Array.isArray(data)) {
      return
    }
    const normalized = data.map(normalizeModelCardEntry)
    await fs.writeFile(filePath, `${JSON.stringify(normalized, null, 2)}\n`)
  } catch (error) {
    console.warn(` ○ failed to normalize ${path.basename(filePath)}: ${error instanceof Error ? error.message : String(error)}`)
  }
}

// NOTE(review): currently unused within this script — kept for parity with
// earlier revisions; confirm before removing.
/** Recursively counts regular files under dirPath. */
async function countFiles(dirPath) {
  const entries = await fs.readdir(dirPath, { withFileTypes: true })
  let count = 0
  for (const entry of entries) {
    const fullPath = path.join(dirPath, entry.name)
    if (entry.isDirectory()) {
      count += await countFiles(fullPath)
      continue
    }
    count += 1
  }
  return count
}

/**
 * Entry point: clones the snapshot to a temp dir, copies index files and
 * detail directories into the cache (and peer-ranks.json into public/),
 * normalizes model-card artifacts unless in lean DuckDB mode, and always
 * cleans up the temp clone.
 */
async function main() {
  console.log("Caching HF dataset snapshot for build...\n")
  if (isDuckDBLean) {
    console.log("Lean DuckDB cache mode: skipping JSON-fallback artifacts (model-cards*, eval-list*, developers*, developers/, evals/, models/)")
    console.log("")
  }
  await fs.mkdir(cacheDir, { recursive: true })
  await fs.mkdir(publicDir, { recursive: true })
  const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "card-backend-"))
  try {
    // ── Phase 1: Clone snapshot ───────────────────────────────────────────
    console.log("Phase 1: Clone dataset snapshot")
    await cloneDatasetSnapshot(tempDir)
    console.log(` ✓ cloned ${HF_DATASET_REPO}`)
    // ── Phase 2: Root files ───────────────────────────────────────────────
    console.log("\nPhase 2: Copy index files")
    for (const fileName of CACHE_ROOT_FILES) {
      const destinationPath = path.join(cacheDir, fileName)
      try {
        const result = await copySnapshotFile(tempDir, fileName, destinationPath)
        const suffix = result.source === "remote" ? ", resolved from LFS" : ""
        console.log(` ✓ ${fileName} (${(result.size / 1024).toFixed(0)} KB${suffix})`)
      } catch (error) {
        // Optional artifacts may not be published yet — only ENOENT is tolerated.
        if (
          OPTIONAL_CACHE_ROOT_FILES.has(fileName) &&
          typeof error === "object" &&
          error != null &&
          "code" in error &&
          error.code === "ENOENT"
        ) {
          console.log(` ○ ${fileName} not published yet; skipping`)
          continue
        }
        throw error
      }
    }
    // peer-ranks.json goes to public/ (served directly), not the cache dir.
    const peerRanksResult = await copySnapshotFile(
      tempDir,
      "peer-ranks.json",
      path.join(publicDir, "peer-ranks.json")
    )
    const peerRanksSuffix = peerRanksResult.source === "remote" ? ", resolved from LFS" : ""
    console.log(` ✓ peer-ranks.json (${(peerRanksResult.size / 1024).toFixed(0)} KB${peerRanksSuffix})`)
    if (!isDuckDBLean) {
      await normalizeCachedModelCardFile(path.join(cacheDir, "model-cards.json"))
      await normalizeCachedModelCardFile(path.join(cacheDir, "model-cards-lite.json"))
      console.log(" ✓ normalized model card artifacts")
    }
    // ── Phase 3: Detail directories ─────────────────────────────────────
    console.log("\nPhase 3: Copy detail directories")
    for (const directoryName of CACHE_DIRECTORIES) {
      const destinationPath = path.join(cacheDir, directoryName)
      await ensureCleanDirectory(destinationPath)
      const result = await copySnapshotDirectory(tempDir, directoryName, destinationPath)
      const remoteSuffix = result.remoteCount > 0 ? `, ${result.remoteCount} resolved from LFS` : ""
      console.log(` ✓ ${directoryName}/ (${result.fileCount} files${remoteSuffix})`)
    }
    console.log("\nDone.")
  } finally {
    await fs.rm(tempDir, { recursive: true, force: true })
  }
}

main().catch((err) => {
  console.error(err)
  process.exit(1)
})