Spaces:
Running
Running
| /** | |
| * Downloads the HF dataset snapshot to local cache during build. | |
| * This avoids hitting HF rate limits at runtime and during cache refresh. | |
| * | |
| * Phase 1: Clone the dataset snapshot once via Git | |
| * Phase 2: Copy index files (model-cards, eval-list, developers, benchmark-metadata, peer-ranks) | |
| * Phase 3: Copy detail directories (developers/*.json, evals/*.json, models/*.json) | |
| * | |
| * Run with: node scripts/cache-hf-data.mjs | |
| */ | |
import { execFile } from "child_process"
import fs from "fs/promises"
import os from "os"
import path from "path"
import { fileURLToPath } from "url"
import { promisify } from "util"
// Repo root = two levels up from scripts/cache-hf-data.mjs.
// fileURLToPath handles Windows drive letters and percent-encoded characters
// that `new URL(import.meta.url).pathname` would mangle.
const root = path.resolve(fileURLToPath(import.meta.url), "..", "..")
const cacheDir = path.join(root, ".cache", "hf-data")
const publicDir = path.join(root, "public")
const dataBackend = process.env.DATA_BACKEND?.trim().toLowerCase()
// The v2/stage-j backends read SNAPSHOT_URL at runtime, so the legacy HF
// cache is unnecessary: just make sure the cache directory exists and bail.
if (dataBackend === "v2" || dataBackend === "stage-j") {
  await fs.mkdir(cacheDir, { recursive: true })
  console.log("[cache-hf-data] DATA_BACKEND=v2: skipping legacy HF cache; runtime reads SNAPSHOT_URL")
  process.exit(0)
}
// Dataset repo is overridable (forks/mirrors); defaults to the canonical one.
const HF_DATASET_REPO = process.env.HF_DATASET_REPO?.trim()
  || "https://huggingface.co/datasets/evaleval/card_backend"
const HF_RESOLVE_BASE = `${HF_DATASET_REPO}/resolve/main`
const execFileAsync = promisify(execFile)
// Lean DuckDB mode: when DATA_BACKEND=duckdb, the runtime reads model/eval/
// developer/summary data exclusively from `duckdb/v1/*.parquet` (see
// lib/duckdb-data.ts:48,82). The legacy JSON-fallback artifacts
// (model-cards*, eval-list*, developers*, plus the per-slug detail
// directories developers/, evals/, models/) become dead weight — skipping
// them avoids OOMing the HF Space on a 2.8 GB JSON snapshot.
//
// Always keep:
// - manifest.json (lib/data-backend.ts:96 → /api/backend-manifest)
// - eval-hierarchy.json (lib/data-backend.ts:98 → /api/eval-hierarchy)
// - benchmark-metadata.json (lib/benchmark-metadata.ts:5 → /api/benchmark-metadata)
// - comparison-index.json (app/api/comparison-index/route.ts:7 → app/models/[id]/page.tsx:157)
// - corpus-aggregates.json (app/corpus/page.tsx via lib/hf-data:fetchCorpusAggregates)
// - duckdb/v1/*.parquet (lib/duckdb-data.ts read path)
// - public/peer-ranks.json (always written outside cacheDir; component fetches HF directly)
// Reuse the already-normalized `dataBackend` value instead of re-parsing the env var.
const isDuckDBLean = dataBackend === "duckdb"
// Root artifacts every backend needs at runtime.
const ESSENTIAL_CACHE_ROOT_FILES = [
  "manifest.json",
  "benchmark-metadata.json",
  "eval-hierarchy.json",
  "comparison-index.json",
  "corpus-aggregates.json",
]
// Root artifacts only the legacy JSON read path consumes.
const JSON_FALLBACK_CACHE_ROOT_FILES = [
  "model-cards.json",
  "model-cards-lite.json",
  "eval-list.json",
  "eval-list-lite.json",
  "developers.json",
]
const CACHE_ROOT_FILES = isDuckDBLean
  ? ESSENTIAL_CACHE_ROOT_FILES
  : [...ESSENTIAL_CACHE_ROOT_FILES, ...JSON_FALLBACK_CACHE_ROOT_FILES]
// Files allowed to be missing upstream: ENOENT on these is logged and skipped
// instead of failing the build (see the Phase 2 loop in main()).
const OPTIONAL_CACHE_ROOT_FILES = new Set([
  "model-cards-lite.json",
  "eval-list-lite.json",
  "corpus-aggregates.json",
])
const ESSENTIAL_CACHE_DIRECTORIES = ["duckdb"]
const JSON_FALLBACK_CACHE_DIRECTORIES = ["developers", "evals", "models"]
const CACHE_DIRECTORIES = isDuckDBLean
  ? ESSENTIAL_CACHE_DIRECTORIES
  : [...JSON_FALLBACK_CACHE_DIRECTORIES, ...ESSENTIAL_CACHE_DIRECTORIES]
// Canonical display casing for known handle tokens; consulted by
// titleCaseToken() before its generic capitalize-first-letter fallback.
// Frozen because it is a shared module-level constant that must never mutate.
const TOKEN_CASE_MAP = Object.freeze({
  ai: "AI",
  claude: "Claude",
  fc: "FC",
  gemini: "Gemini",
  gemma: "Gemma",
  gpt: "GPT",
  haiku: "Haiku",
  mini: "Mini",
  opus: "Opus",
  preview: "Preview",
  pro: "Pro",
  prompt: "Prompt",
  reasoning: "Reasoning",
  sonnet: "Sonnet",
  thinking: "Thinking",
})
/**
 * Runs a git command, rejecting on a non-zero exit status.
 * @param {string[]} args - Arguments passed to the `git` binary.
 * @param {string} cwd - Working directory for the command.
 */
async function runGit(args, cwd) {
  // 32 MiB buffer tolerates chatty git output on large clones.
  const options = { cwd, maxBuffer: 1024 * 1024 * 32 }
  await execFileAsync("git", args, options)
}
/**
 * Shallow-clones the HF dataset repo into `targetDir` (depth 1, single
 * branch) so no large history is ever fetched.
 */
async function cloneDatasetSnapshot(targetDir) {
  const cloneArgs = ["clone", "--depth", "1", "--single-branch", HF_DATASET_REPO, targetDir]
  await runGit(cloneArgs, root)
}
/** Recreates `dirPath` as an empty directory, discarding any previous contents. */
async function ensureCleanDirectory(dirPath) {
  await fs.rm(dirPath, { force: true, recursive: true })
  await fs.mkdir(dirPath, { recursive: true })
}
/**
 * Copies a file, creating the destination directory as needed.
 * @returns {Promise<number>} Size of the copied file in bytes.
 */
async function copyFile(sourcePath, destinationPath) {
  await fs.mkdir(path.dirname(destinationPath), { recursive: true })
  await fs.copyFile(sourcePath, destinationPath)
  const { size } = await fs.stat(destinationPath)
  return size
}
/** True when `contents` is a Git LFS pointer file rather than the real data. */
function isGitLfsPointer(contents) {
  const pointerPrefix = "version https://git-lfs.github.com/spec/v1\n"
  return contents.startsWith(pointerPrefix)
}
/**
 * Downloads `relativePath` from the HF resolve endpoint and writes it to
 * `destinationPath`, attaching a Bearer token when HF_TOKEN is set.
 * @returns {Promise<number>} Number of bytes written.
 * @throws {Error} When the HTTP response status is not OK.
 */
async function writeRemoteFile(relativePath, destinationPath) {
  const hfToken = process.env.HF_TOKEN?.trim()
  const headers = hfToken ? { Authorization: `Bearer ${hfToken}` } : {}
  const response = await fetch(`${HF_RESOLVE_BASE}/${relativePath}`, { headers })
  if (!response.ok) {
    throw new Error(`Failed to download ${relativePath}: ${response.status} ${response.statusText}`)
  }
  const payload = Buffer.from(await response.arrayBuffer())
  await fs.mkdir(path.dirname(destinationPath), { recursive: true })
  await fs.writeFile(destinationPath, payload)
  return payload.length
}
/**
 * Copies one file out of the cloned snapshot. When the on-disk content is a
 * Git LFS pointer, the real bytes are fetched from the HF resolve endpoint.
 * @returns {Promise<{size: number, source: "remote"|"snapshot"}>}
 */
async function copySnapshotFile(snapshotRoot, relativePath, destinationPath) {
  const sourcePath = path.join(snapshotRoot, relativePath)
  const contents = await fs.readFile(sourcePath, "utf8")
  if (!isGitLfsPointer(contents)) {
    return { size: await copyFile(sourcePath, destinationPath), source: "snapshot" }
  }
  return { size: await writeRemoteFile(relativePath, destinationPath), source: "remote" }
}
/**
 * Recursively mirrors `relativeDir` from the snapshot into `destinationDir`,
 * resolving LFS pointers along the way.
 * @returns {Promise<{fileCount: number, remoteCount: number}>} Totals of files
 *   copied and of files that had to be fetched remotely (LFS).
 */
async function copySnapshotDirectory(snapshotRoot, relativeDir, destinationDir) {
  const sourceDir = path.join(snapshotRoot, relativeDir)
  const totals = { fileCount: 0, remoteCount: 0 }
  for (const entry of await fs.readdir(sourceDir, { withFileTypes: true })) {
    // Relative paths stay POSIX-style because they double as HF URL segments.
    const nestedRelativePath = path.posix.join(relativeDir, entry.name)
    const destinationPath = path.join(destinationDir, entry.name)
    if (entry.isDirectory()) {
      await ensureCleanDirectory(destinationPath)
      const nested = await copySnapshotDirectory(snapshotRoot, nestedRelativePath, destinationPath)
      totals.fileCount += nested.fileCount
      totals.remoteCount += nested.remoteCount
    } else {
      const result = await copySnapshotFile(snapshotRoot, nestedRelativePath, destinationPath)
      totals.fileCount += 1
      if (result.source === "remote") {
        totals.remoteCount += 1
      }
    }
  }
  return totals
}
/**
 * Canonicalizes a raw model handle: trimmed, lowercased, dash-separated,
 * with version-style digit runs joined by dots (e.g. "3-5" -> "3.5"),
 * repeated dashes collapsed, and edge dashes stripped.
 */
function normalizeHandle(rawHandle) {
  const lowered = rawHandle.trim().toLowerCase()
  const dashed = lowered.replace(/[_\s/]+/g, "-")
  const dotted = dashed.replace(/(\d)-(?=\d(?:-|$))/g, "$1.")
  const collapsed = dotted.replace(/-+/g, "-")
  return collapsed.replace(/^-|-$/g, "")
}
/**
 * Formats an 8-digit YYYYMMDD token (years 19xx/20xx only) as YYYY-MM-DD.
 * Any other token is returned unchanged.
 */
function formatVersionDate(dateToken) {
  const looksLikeDate = /^(?:19|20)\d{6}$/.test(dateToken)
  if (!looksLikeDate) {
    return dateToken
  }
  const year = dateToken.slice(0, 4)
  const month = dateToken.slice(4, 6)
  const day = dateToken.slice(6, 8)
  return `${year}-${month}-${day}`
}
/**
 * Title-cases a single handle token for display.
 * Known tokens use TOKEN_CASE_MAP; bare numbers pass through; size suffixes
 * like "7b" become "7B"; everything else gets its first letter capitalized.
 */
function titleCaseToken(token) {
  if (!token) {
    return token
  }
  // Object.hasOwn guards against inherited Object.prototype keys (e.g. a
  // token of "constructor") registering as a truthy map hit and returning
  // a function instead of a string.
  if (Object.hasOwn(TOKEN_CASE_MAP, token)) {
    return TOKEN_CASE_MAP[token]
  }
  if (/^\d+(\.\d+)?$/.test(token)) {
    return token
  }
  if (/^\d+(\.\d+)?[bkmt]$/i.test(token)) {
    return `${token.slice(0, -1)}${token.slice(-1).toUpperCase()}`
  }
  return token.charAt(0).toUpperCase() + token.slice(1)
}
/** Turns a normalized dash-separated handle into a human-readable display name. */
function humanizeHandle(handle) {
  const tokens = handle.split("-").filter(Boolean)
  const rendered = tokens.map((token) => {
    const isDateToken = /^(?:19|20)\d{6}$/.test(token)
    return isDateToken ? formatVersionDate(token) : titleCaseToken(token)
  })
  return rendered.join(" ")
}
/**
 * Derives the canonical family id and display name for a model id of the
 * form "namespace/handle", stripping any trailing YYYYMMDD version (and
 * optional qualifier) from the normalized handle.
 * @returns {{familyId: string, familyName: string}}
 */
function getCanonicalFamilyInfo(modelFamilyId) {
  const [namespace = "unknown", rawHandle = ""] = String(modelFamilyId).split("/")
  const normalizedHandle = normalizeHandle(rawHandle)
  const versionMatch = normalizedHandle.match(/^(.*?)-((?:19|20)\d{6})(?:-(.+))?$/)
  const familySlug = versionMatch ? versionMatch[1] : normalizedHandle
  return {
    familyId: `${namespace}/${familySlug}`,
    familyName: humanizeHandle(familySlug),
  }
}
/** Lowercases and dash-joins a qualifier so it can be compared as an alias. */
function normalizeSetupAliasQualifier(value) {
  const text = String(value ?? "")
  return text.trim().toLowerCase().replace(/[_\s]+/g, "-")
}
/**
 * True when `value` names an evaluation-setup alias (prompt, fc /
 * function-calling, or any "thinking…" variant) rather than a real
 * model qualifier.
 */
function isSetupAliasQualifier(value) {
  const normalized = normalizeSetupAliasQualifier(value)
  if (normalized.startsWith("thinking")) {
    return true
  }
  return ["prompt", "fc", "function-calling"].includes(normalized)
}
/**
 * Normalizes a variant's key/label for display and merging.
 * - "default"/"base" collapse to the canonical Default variant.
 * - When the composite handle ends in a YYYYMMDD date, the formatted date
 *   becomes the label, with any remaining qualifier appended after "·".
 * - Setup-alias qualifiers (prompt / fc / thinking…) are dropped entirely so
 *   those variants merge under the bare date key.
 * @returns {{variantKey: string, variantLabel: string}}
 */
function getNormalizedVariantMeta(familyId, variantKey, variantLabel) {
  if (variantKey === "default" || variantKey === "base") {
    return { variantKey: "default", variantLabel: "Default" }
  }
  const familyHandle = familyId.split("/")[1] ?? ""
  const compositeHandle = normalizeHandle(`${familyHandle}-${variantKey}`)
  const match = compositeHandle.match(/^(.*?)-((?:19|20)\d{6})(?:-(.+))?$/)
  if (!match) {
    // No recognizable version date: keep the caller-provided key/label as-is.
    return { variantKey, variantLabel }
  }
  const [, , dateToken, qualifier] = match
  const formattedDate = formatVersionDate(dateToken)
  if (qualifier && isSetupAliasQualifier(humanizeHandle(qualifier))) {
    return { variantKey: formattedDate, variantLabel: formattedDate }
  }
  // Separator restored to a middle dot; the source had mojibake ("Β·" is
  // UTF-8 "·" mis-decoded as ISO-8859-7).
  return {
    variantKey,
    variantLabel: qualifier ? `${formattedDate} · ${humanizeHandle(qualifier)}` : formattedDate,
  }
}
/**
 * Canonicalizes one model-card entry: rewrites the family id/name, merges
 * variants whose normalized keys collide (summing evaluation counts, keeping
 * the newest last_updated, unioning raw model ids), and re-sorts variants
 * with "default" first, then newest-first, then alphabetically by label.
 */
function normalizeModelCardEntry(entry) {
  const familyInfo = getCanonicalFamilyInfo(entry.model_family_id)
  const variantsByKey = new Map()
  for (const variant of entry.variants ?? []) {
    const meta = getNormalizedVariantMeta(familyInfo.familyId, variant.variant_key, variant.variant_label)
    const existing = variantsByKey.get(meta.variantKey)
    if (existing) {
      // `?? 0` keeps a missing evaluation_count from poisoning the sum with
      // NaN (consistent with the `?? 0` guard on total_evaluations below).
      existing.evaluation_count = (existing.evaluation_count ?? 0) + (variant.evaluation_count ?? 0)
      // Keep whichever last_updated is newer; a missing existing timestamp
      // always yields to the incoming variant's value.
      existing.last_updated =
        !existing.last_updated || (variant.last_updated && new Date(variant.last_updated).getTime() > new Date(existing.last_updated).getTime())
          ? variant.last_updated
          : existing.last_updated
      // Union of raw ids, deduplicated and kept in stable sorted order.
      existing.raw_model_ids = Array.from(
        new Set([...(existing.raw_model_ids ?? []), ...(variant.raw_model_ids ?? [])])
      ).sort((a, b) => a.localeCompare(b))
      continue
    }
    variantsByKey.set(meta.variantKey, {
      ...variant,
      variant_key: meta.variantKey,
      variant_label: meta.variantLabel,
      raw_model_ids: [...(variant.raw_model_ids ?? [])].sort((a, b) => a.localeCompare(b)),
    })
  }
  const variants = Array.from(variantsByKey.values()).sort((a, b) => {
    // "default" always sorts first.
    const aIsDefault = a.variant_key === "default"
    const bIsDefault = b.variant_key === "default"
    if (aIsDefault !== bIsDefault) {
      return aIsDefault ? -1 : 1
    }
    // Then newest first; variants without a timestamp sink to the bottom.
    const aTime = a.last_updated ? new Date(a.last_updated).getTime() : Number.NEGATIVE_INFINITY
    const bTime = b.last_updated ? new Date(b.last_updated).getTime() : Number.NEGATIVE_INFINITY
    if (aTime !== bTime) {
      return bTime - aTime
    }
    return a.variant_label.localeCompare(b.variant_label)
  })
  return {
    ...entry,
    model_family_id: familyInfo.familyId,
    model_route_id: familyInfo.familyId.replace(/\//g, "__"),
    model_family_name: familyInfo.familyName,
    // Recompute the total from merged variants; fall back to the original
    // figure when the entry carried no variants at all.
    total_evaluations:
      variants.length > 0
        ? variants.reduce((sum, variant) => sum + (variant.evaluation_count ?? 0), 0)
        : entry.total_evaluations,
    variants,
  }
}
/**
 * Rewrites a cached model-card JSON array with normalized entries.
 * Best-effort: parse/IO failures are logged as warnings rather than thrown,
 * so one odd artifact cannot fail the whole build. Non-array payloads are
 * left untouched.
 */
async function normalizeCachedModelCardFile(filePath) {
  try {
    const data = JSON.parse(await fs.readFile(filePath, "utf8"))
    if (!Array.isArray(data)) {
      return
    }
    const normalized = data.map(normalizeModelCardEntry)
    const serialized = `${JSON.stringify(normalized, null, 2)}\n`
    await fs.writeFile(filePath, serialized)
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error)
    console.warn(` β failed to normalize ${path.basename(filePath)}: ${reason}`)
  }
}
/** Recursively counts non-directory entries (files) under `dirPath`. */
async function countFiles(dirPath) {
  const entries = await fs.readdir(dirPath, { withFileTypes: true })
  let total = 0
  for (const entry of entries) {
    if (entry.isDirectory()) {
      total += await countFiles(path.join(dirPath, entry.name))
    } else {
      total += 1
    }
  }
  return total
}
/**
 * Orchestrates the three cache phases: clone the dataset snapshot into a
 * temp directory, copy index files into .cache/hf-data (plus peer-ranks.json
 * into public/), then mirror the detail directories. The temp clone is
 * always removed, even on failure.
 * NOTE(review): the " β " markers in the log strings look like mojibake
 * (probably "✓"/"⚠" originally) — confirm intended glyphs before changing.
 */
async function main() {
  console.log("Caching HF dataset snapshot for build...\n")
  if (isDuckDBLean) {
    console.log("Lean DuckDB cache mode: skipping JSON-fallback artifacts (model-cards*, eval-list*, developers*, developers/, evals/, models/)")
    console.log("")
  }
  await fs.mkdir(cacheDir, { recursive: true })
  await fs.mkdir(publicDir, { recursive: true })
  const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "card-backend-"))
  try {
    // ── Phase 1: Clone snapshot ───────────────────────────────────────────
    console.log("Phase 1: Clone dataset snapshot")
    await cloneDatasetSnapshot(tempDir)
    console.log(` β cloned ${HF_DATASET_REPO}`)
    // ── Phase 2: Root files ───────────────────────────────────────────────
    console.log("\nPhase 2: Copy index files")
    for (const fileName of CACHE_ROOT_FILES) {
      const destinationPath = path.join(cacheDir, fileName)
      try {
        const result = await copySnapshotFile(tempDir, fileName, destinationPath)
        const suffix = result.source === "remote" ? ", resolved from LFS" : ""
        console.log(` β ${fileName} (${(result.size / 1024).toFixed(0)} KB${suffix})`)
      } catch (error) {
        // Optional artifacts may not be published upstream yet; swallow
        // ENOENT only for those and rethrow everything else.
        if (
          OPTIONAL_CACHE_ROOT_FILES.has(fileName) &&
          typeof error === "object" &&
          error != null &&
          "code" in error &&
          error.code === "ENOENT"
        ) {
          console.log(` β ${fileName} not published yet; skipping`)
          continue
        }
        throw error
      }
    }
    // peer-ranks.json is written to public/ (served statically), not cacheDir.
    const peerRanksResult = await copySnapshotFile(
      tempDir,
      "peer-ranks.json",
      path.join(publicDir, "peer-ranks.json")
    )
    const peerRanksSuffix = peerRanksResult.source === "remote" ? ", resolved from LFS" : ""
    console.log(` β peer-ranks.json (${(peerRanksResult.size / 1024).toFixed(0)} KB${peerRanksSuffix})`)
    if (!isDuckDBLean) {
      // Legacy JSON read path only: rewrite model-card artifacts with
      // canonical family ids and merged variants.
      await normalizeCachedModelCardFile(path.join(cacheDir, "model-cards.json"))
      await normalizeCachedModelCardFile(path.join(cacheDir, "model-cards-lite.json"))
      console.log(" β normalized model card artifacts")
    }
    // ── Phase 3: Detail directories ─────────────────────────────────────
    console.log("\nPhase 3: Copy detail directories")
    for (const directoryName of CACHE_DIRECTORIES) {
      const destinationPath = path.join(cacheDir, directoryName)
      await ensureCleanDirectory(destinationPath)
      const result = await copySnapshotDirectory(tempDir, directoryName, destinationPath)
      const remoteSuffix = result.remoteCount > 0 ? `, ${result.remoteCount} resolved from LFS` : ""
      console.log(` β ${directoryName}/ (${result.fileCount} files${remoteSuffix})`)
    }
    console.log("\nDone.")
  } finally {
    // Always clean up the temp clone, even when a phase throws.
    await fs.rm(tempDir, { recursive: true, force: true })
  }
}
// Entry point: report any unhandled failure and exit non-zero so the build fails.
main().catch((error) => {
  console.error(error)
  process.exit(1)
})