// general-eval-card / scripts/cache-hf-data.mjs
// (snapshot viewer header: evijit [HF Staff] β€” "Swap backend data (#3)", commit fe99ffa)
/**
* Downloads the HF dataset snapshot to local cache during build.
* This avoids hitting HF rate limits at runtime and during cache refresh.
*
* Phase 1: Clone the dataset snapshot once via Git
* Phase 2: Copy index files (model-cards, eval-list, developers, benchmark-metadata, peer-ranks)
* Phase 3: Copy detail directories (developers/*.json, evals/*.json, models/*.json)
*
* Run with: node scripts/cache-hf-data.mjs
*/
import { execFile } from "child_process"
import fs from "fs/promises"
import os from "os"
import path from "path"
import { fileURLToPath } from "url"
import { promisify } from "util"
// Repo root: this script lives in scripts/, so go one directory up.
// fileURLToPath handles Windows drive letters and percent-encoded characters
// correctly, unlike reading URL#pathname directly.
const root = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "..")
const cacheDir = path.join(root, ".cache", "hf-data")
const publicDir = path.join(root, "public")
// Normalized backend selector; reused below (e.g. for lean-mode detection).
const dataBackend = process.env.DATA_BACKEND?.trim().toLowerCase()
if (dataBackend === "v2" || dataBackend === "stage-j") {
  // v2/stage-j read a snapshot at runtime; only ensure the cache dir exists.
  await fs.mkdir(cacheDir, { recursive: true })
  console.log("[cache-hf-data] DATA_BACKEND=v2: skipping legacy HF cache; runtime reads SNAPSHOT_URL")
  process.exit(0)
}
const HF_DATASET_REPO = process.env.HF_DATASET_REPO?.trim()
  || "https://huggingface.co/datasets/evaleval/card_backend"
const HF_RESOLVE_BASE = `${HF_DATASET_REPO}/resolve/main`
const execFileAsync = promisify(execFile)
// Lean DuckDB mode: when DATA_BACKEND=duckdb, the runtime reads model/eval/
// developer/summary data exclusively from `duckdb/v1/*.parquet` (see
// lib/duckdb-data.ts:48,82). The legacy JSON-fallback artifacts
// (model-cards*, eval-list*, developers*, plus the per-slug detail
// directories developers/, evals/, models/) become dead weight β€” skipping
// them avoids OOMing the HF Space on a 2.8 GB JSON snapshot.
//
// Always keep:
// - manifest.json (lib/data-backend.ts:96 β†’ /api/backend-manifest)
// - eval-hierarchy.json (lib/data-backend.ts:98 β†’ /api/eval-hierarchy)
// - benchmark-metadata.json (lib/benchmark-metadata.ts:5 β†’ /api/benchmark-metadata)
// - comparison-index.json (app/api/comparison-index/route.ts:7 β†’ app/models/[id]/page.tsx:157)
// - corpus-aggregates.json (app/corpus/page.tsx via lib/hf-data:fetchCorpusAggregates)
// - duckdb/v1/*.parquet (lib/duckdb-data.ts read path)
// - public/peer-ranks.json (always written outside cacheDir; component fetches HF directly)
// Reuse the already-normalized DATA_BACKEND value instead of re-reading and
// re-normalizing process.env a second time (keeps the two checks in sync).
const isDuckDBLean = dataBackend === "duckdb"
// Root artifacts required by every backend mode.
const ESSENTIAL_CACHE_ROOT_FILES = [
  "manifest.json",
  "benchmark-metadata.json",
  "eval-hierarchy.json",
  "comparison-index.json",
  "corpus-aggregates.json",
]
// Legacy JSON artifacts, only needed for the JSON-fallback read path.
const JSON_FALLBACK_CACHE_ROOT_FILES = [
  "model-cards.json",
  "model-cards-lite.json",
  "eval-list.json",
  "eval-list-lite.json",
  "developers.json",
]
const CACHE_ROOT_FILES = isDuckDBLean
  ? ESSENTIAL_CACHE_ROOT_FILES
  : [...ESSENTIAL_CACHE_ROOT_FILES, ...JSON_FALLBACK_CACHE_ROOT_FILES]
// Files that may legitimately be absent from the snapshot; a missing one is
// logged and skipped rather than failing the build.
const OPTIONAL_CACHE_ROOT_FILES = new Set([
  "model-cards-lite.json",
  "eval-list-lite.json",
  "corpus-aggregates.json",
])
const ESSENTIAL_CACHE_DIRECTORIES = ["duckdb"]
const JSON_FALLBACK_CACHE_DIRECTORIES = ["developers", "evals", "models"]
const CACHE_DIRECTORIES = isDuckDBLean
  ? ESSENTIAL_CACHE_DIRECTORIES
  : [...JSON_FALLBACK_CACHE_DIRECTORIES, ...ESSENTIAL_CACHE_DIRECTORIES]
// Canonical casing for well-known handle tokens, consumed by titleCaseToken.
// Frozen: this is a read-only lookup table and should never be mutated.
const TOKEN_CASE_MAP = Object.freeze({
  ai: "AI",
  claude: "Claude",
  fc: "FC",
  gemini: "Gemini",
  gemma: "Gemma",
  gpt: "GPT",
  haiku: "Haiku",
  mini: "Mini",
  opus: "Opus",
  preview: "Preview",
  pro: "Pro",
  prompt: "Prompt",
  reasoning: "Reasoning",
  sonnet: "Sonnet",
  thinking: "Thinking",
})
/**
 * Runs a git command and resolves once it exits successfully.
 * @param {string[]} args - Arguments passed to the git binary.
 * @param {string} cwd - Working directory for the command.
 */
async function runGit(args, cwd) {
  // 32 MB buffer: clone output can be sizable for large snapshots.
  const options = { cwd, maxBuffer: 32 * 1024 * 1024 }
  await execFileAsync("git", args, options)
}
/** Shallow-clones the dataset repo (single branch, depth 1) into targetDir. */
async function cloneDatasetSnapshot(targetDir) {
  const cloneArgs = ["clone", "--depth", "1", "--single-branch", HF_DATASET_REPO, targetDir]
  await runGit(cloneArgs, root)
}
/** Recreates dirPath as an empty directory, removing any prior contents. */
async function ensureCleanDirectory(dirPath) {
  await fs.rm(dirPath, { force: true, recursive: true })
  await fs.mkdir(dirPath, { recursive: true })
}
/**
 * Copies a file, creating any missing parent directories first.
 * @returns {Promise<number>} Size in bytes of the copied file.
 */
async function copyFile(sourcePath, destinationPath) {
  await fs.mkdir(path.dirname(destinationPath), { recursive: true })
  await fs.copyFile(sourcePath, destinationPath)
  const { size } = await fs.stat(destinationPath)
  return size
}
/** True when the contents are a Git LFS pointer stub rather than real data. */
function isGitLfsPointer(contents) {
  const pointerHeader = "version https://git-lfs.github.com/spec/v1\n"
  return contents.startsWith(pointerHeader)
}
/**
 * Downloads relativePath from the HF resolve endpoint and writes it to disk,
 * sending a bearer token when HF_TOKEN is set (private datasets).
 * @returns {Promise<number>} Number of bytes written.
 * @throws {Error} When the HTTP response is not OK.
 */
async function writeRemoteFile(relativePath, destinationPath) {
  const hfToken = process.env.HF_TOKEN?.trim()
  const headers = hfToken ? { Authorization: `Bearer ${hfToken}` } : {}
  const response = await fetch(`${HF_RESOLVE_BASE}/${relativePath}`, { headers })
  if (!response.ok) {
    throw new Error(`Failed to download ${relativePath}: ${response.status} ${response.statusText}`)
  }
  const payload = Buffer.from(await response.arrayBuffer())
  await fs.mkdir(path.dirname(destinationPath), { recursive: true })
  await fs.writeFile(destinationPath, payload)
  return payload.length
}
/**
 * Copies one file out of the cloned snapshot into the cache. If the working
 * tree holds only a Git LFS pointer stub, the real payload is fetched from
 * the HF resolve endpoint instead.
 * @returns {Promise<{size: number, source: "remote" | "snapshot"}>}
 */
async function copySnapshotFile(snapshotRoot, relativePath, destinationPath) {
  const sourcePath = path.join(snapshotRoot, relativePath)
  // Only the first few bytes are needed to recognize an LFS pointer, so read
  // a small prefix instead of decoding a potentially multi-GB artifact into
  // one in-memory string (the snapshot can contain very large files).
  const handle = await fs.open(sourcePath, "r")
  let prefix
  try {
    const probe = Buffer.alloc(256)
    const { bytesRead } = await handle.read(probe, 0, probe.length, 0)
    prefix = probe.subarray(0, bytesRead).toString("utf8")
  } finally {
    await handle.close()
  }
  if (isGitLfsPointer(prefix)) {
    const size = await writeRemoteFile(relativePath, destinationPath)
    return { size, source: "remote" }
  }
  const size = await copyFile(sourcePath, destinationPath)
  return { size, source: "snapshot" }
}
/**
 * Recursively copies a snapshot directory into the cache, resolving LFS
 * pointer files remotely as needed.
 * @returns {Promise<{fileCount: number, remoteCount: number}>} Total files
 * copied and how many of them were fetched from the remote.
 */
async function copySnapshotDirectory(snapshotRoot, relativeDir, destinationDir) {
  const entries = await fs.readdir(path.join(snapshotRoot, relativeDir), { withFileTypes: true })
  const totals = { fileCount: 0, remoteCount: 0 }
  for (const entry of entries) {
    // Snapshot-relative paths always use forward slashes (posix join).
    const childRelativePath = path.posix.join(relativeDir, entry.name)
    const childDestination = path.join(destinationDir, entry.name)
    if (entry.isDirectory()) {
      await ensureCleanDirectory(childDestination)
      const nested = await copySnapshotDirectory(snapshotRoot, childRelativePath, childDestination)
      totals.fileCount += nested.fileCount
      totals.remoteCount += nested.remoteCount
    } else {
      const copied = await copySnapshotFile(snapshotRoot, childRelativePath, childDestination)
      totals.fileCount += 1
      if (copied.source === "remote") {
        totals.remoteCount += 1
      }
    }
  }
  return totals
}
/**
 * Canonicalizes a raw model handle: lowercase, dash-separated, with adjacent
 * single-digit version tokens joined by a dot (e.g. "4-5" -> "4.5").
 */
function normalizeHandle(rawHandle) {
  let handle = rawHandle.trim().toLowerCase()
  handle = handle.replace(/[_\s/]+/g, "-")
  // Join dash-separated version digits: "gpt-4-5" -> "gpt-4.5"
  handle = handle.replace(/(\d)-(?=\d(?:-|$))/g, "$1.")
  handle = handle.replace(/-+/g, "-")
  return handle.replace(/^-|-$/g, "")
}
/**
 * Formats a YYYYMMDD token (years 19xx/20xx only) as YYYY-MM-DD; any other
 * token is returned unchanged.
 */
function formatVersionDate(dateToken) {
  const match = dateToken.match(/^((?:19|20)\d{2})(\d{2})(\d{2})$/)
  if (!match) {
    return dateToken
  }
  const [, year, month, day] = match
  return `${year}-${month}-${day}`
}
/**
 * Title-cases a single handle token: known tokens use the canonical case map,
 * bare numbers pass through, size suffixes are upper-cased ("7b" -> "7B"),
 * anything else gets its first letter capitalized.
 */
function titleCaseToken(token) {
  if (!token) {
    return token
  }
  // Own-property check: plain bracket lookup would also match inherited
  // Object.prototype members (e.g. a token named "constructor").
  if (Object.hasOwn(TOKEN_CASE_MAP, token)) {
    return TOKEN_CASE_MAP[token]
  }
  if (/^\d+(\.\d+)?$/.test(token)) {
    return token
  }
  // Size/scale suffixes like "7b", "32k", "1.5m", "2t".
  if (/^\d+(\.\d+)?[bkmt]$/i.test(token)) {
    return `${token.slice(0, -1)}${token.slice(-1).toUpperCase()}`
  }
  return token.charAt(0).toUpperCase() + token.slice(1)
}
/**
 * Turns a normalized dashed handle into a display name: YYYYMMDD tokens are
 * formatted as dates, all other tokens are title-cased, joined by spaces.
 */
function humanizeHandle(handle) {
  const words = []
  for (const token of handle.split("-")) {
    if (!token) {
      continue
    }
    words.push(/^(?:19|20)\d{6}$/.test(token) ? formatVersionDate(token) : titleCaseToken(token))
  }
  return words.join(" ")
}
/**
 * Derives the canonical family id and display name from a raw
 * `namespace/handle` id, stripping a trailing YYYYMMDD version token (and
 * any suffix after it) from the handle.
 */
function getCanonicalFamilyInfo(modelFamilyId) {
  const [namespace = "unknown", rawHandle = ""] = String(modelFamilyId).split("/")
  const normalizedHandle = normalizeHandle(rawHandle)
  const versionMatch = normalizedHandle.match(/^(.*?)-((?:19|20)\d{6})(?:-(.+))?$/)
  const familySlug = versionMatch ? versionMatch[1] : normalizedHandle
  const familyName = humanizeHandle(familySlug)
  return { familyId: `${namespace}/${familySlug}`, familyName }
}
/** Lowercases a qualifier and collapses whitespace/underscores to dashes. */
function normalizeSetupAliasQualifier(value) {
  const text = value == null ? "" : String(value)
  return text.trim().toLowerCase().replace(/[_\s]+/g, "-")
}
/**
 * True when a variant qualifier merely describes an evaluation setup
 * (prompting / function-calling / thinking budget) rather than a distinct
 * model variant.
 */
function isSetupAliasQualifier(value) {
  const normalized = normalizeSetupAliasQualifier(value)
  if (normalized.startsWith("thinking")) {
    return true
  }
  return ["prompt", "fc", "function-calling"].includes(normalized)
}
/**
 * Normalizes a variant key/label pair. "default"/"base" collapse to the
 * Default variant; when the family+variant composite handle ends in a
 * YYYYMMDD token, the date is formatted, and setup-alias qualifiers
 * (prompt / fc / function-calling / thinking*) collapse onto the plain date.
 * Handles without a recognizable date pass through unchanged.
 */
function getNormalizedVariantMeta(familyId, variantKey, variantLabel) {
  if (variantKey === "default" || variantKey === "base") {
    return { variantKey: "default", variantLabel: "Default" }
  }
  const familyHandle = familyId.split("/")[1] ?? ""
  const compositeHandle = normalizeHandle(`${familyHandle}-${variantKey}`)
  const match = compositeHandle.match(/^(.*?)-((?:19|20)\d{6})(?:-(.+))?$/)
  if (!match) {
    return { variantKey, variantLabel }
  }
  const [, , dateToken, qualifier] = match
  const formattedDate = formatVersionDate(dateToken)
  if (qualifier && isSetupAliasQualifier(humanizeHandle(qualifier))) {
    // Setup aliases describe how the eval was run, not a distinct variant.
    return { variantKey: formattedDate, variantLabel: formattedDate }
  }
  const labelSuffix = qualifier ? ` Β· ${humanizeHandle(qualifier)}` : ""
  return { variantKey, variantLabel: `${formattedDate}${labelSuffix}` }
}
/**
 * Rewrites one model-card entry onto its canonical family id and merges
 * variants whose normalized keys collide (e.g. setup-alias qualifiers).
 * Merged variants sum evaluation counts, keep the newest last_updated, and
 * union their raw model ids. Variants sort: "default" first, then newest
 * last_updated, then label. total_evaluations is recomputed from the merged
 * variants when any exist.
 */
function normalizeModelCardEntry(entry) {
  const familyInfo = getCanonicalFamilyInfo(entry.model_family_id)
  const variantsByKey = new Map()
  for (const variant of entry.variants ?? []) {
    const meta = getNormalizedVariantMeta(familyInfo.familyId, variant.variant_key, variant.variant_label)
    const existing = variantsByKey.get(meta.variantKey)
    if (existing) {
      // ?? 0 keeps a missing count from turning the running sum into NaN
      // (mirrors the ?? 0 guard used in the total_evaluations reduce below).
      existing.evaluation_count = (existing.evaluation_count ?? 0) + (variant.evaluation_count ?? 0)
      // Keep whichever timestamp is more recent; missing timestamps lose.
      existing.last_updated =
        !existing.last_updated || (variant.last_updated && new Date(variant.last_updated).getTime() > new Date(existing.last_updated).getTime())
          ? variant.last_updated
          : existing.last_updated
      existing.raw_model_ids = Array.from(
        new Set([...(existing.raw_model_ids ?? []), ...(variant.raw_model_ids ?? [])])
      ).sort((a, b) => a.localeCompare(b))
      continue
    }
    variantsByKey.set(meta.variantKey, {
      ...variant,
      variant_key: meta.variantKey,
      variant_label: meta.variantLabel,
      raw_model_ids: [...(variant.raw_model_ids ?? [])].sort((a, b) => a.localeCompare(b)),
    })
  }
  const variants = Array.from(variantsByKey.values()).sort((a, b) => {
    const aIsDefault = a.variant_key === "default"
    const bIsDefault = b.variant_key === "default"
    if (aIsDefault !== bIsDefault) {
      return aIsDefault ? -1 : 1
    }
    const aTime = a.last_updated ? new Date(a.last_updated).getTime() : Number.NEGATIVE_INFINITY
    const bTime = b.last_updated ? new Date(b.last_updated).getTime() : Number.NEGATIVE_INFINITY
    if (aTime !== bTime) {
      return bTime - aTime
    }
    return a.variant_label.localeCompare(b.variant_label)
  })
  return {
    ...entry,
    model_family_id: familyInfo.familyId,
    model_route_id: familyInfo.familyId.replace(/\//g, "__"),
    model_family_name: familyInfo.familyName,
    total_evaluations:
      variants.length > 0
        ? variants.reduce((sum, variant) => sum + (variant.evaluation_count ?? 0), 0)
        : entry.total_evaluations,
    variants,
  }
}
/**
 * Rewrites a cached model-card JSON file in place with normalized entries.
 * Best-effort: any read/parse/write failure is logged as a warning so the
 * cache build keeps going. Non-array payloads are left untouched.
 */
async function normalizeCachedModelCardFile(filePath) {
  try {
    const raw = await fs.readFile(filePath, "utf8")
    const parsed = JSON.parse(raw)
    if (!Array.isArray(parsed)) {
      return
    }
    const normalized = parsed.map(normalizeModelCardEntry)
    await fs.writeFile(filePath, `${JSON.stringify(normalized, null, 2)}\n`)
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error)
    console.warn(` β—‹ failed to normalize ${path.basename(filePath)}: ${reason}`)
  }
}
/** Recursively counts regular (non-directory) entries under dirPath. */
async function countFiles(dirPath) {
  const entries = await fs.readdir(dirPath, { withFileTypes: true })
  let total = 0
  for (const entry of entries) {
    if (entry.isDirectory()) {
      total += await countFiles(path.join(dirPath, entry.name))
    } else {
      total += 1
    }
  }
  return total
}
/**
 * Entry point: builds the local HF data cache in three phases.
 *
 * Phase 1 shallow-clones the dataset snapshot into a temp directory.
 * Phase 2 copies the root index files into the cache dir (files listed as
 * optional are skipped when absent) plus peer-ranks.json into public/, then
 * normalizes the model-card JSON unless in lean DuckDB mode.
 * Phase 3 copies the detail directories into the cache dir.
 *
 * The temp clone is removed in the finally block, even on failure.
 */
async function main() {
  console.log("Caching HF dataset snapshot for build...\n")
  if (isDuckDBLean) {
    console.log("Lean DuckDB cache mode: skipping JSON-fallback artifacts (model-cards*, eval-list*, developers*, developers/, evals/, models/)")
    console.log("")
  }
  await fs.mkdir(cacheDir, { recursive: true })
  await fs.mkdir(publicDir, { recursive: true })
  const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "card-backend-"))
  try {
    // ── Phase 1: Clone snapshot ───────────────────────────────────────────
    console.log("Phase 1: Clone dataset snapshot")
    await cloneDatasetSnapshot(tempDir)
    console.log(` βœ“ cloned ${HF_DATASET_REPO}`)
    // ── Phase 2: Root files ───────────────────────────────────────────────
    console.log("\nPhase 2: Copy index files")
    for (const fileName of CACHE_ROOT_FILES) {
      const destinationPath = path.join(cacheDir, fileName)
      try {
        const result = await copySnapshotFile(tempDir, fileName, destinationPath)
        const suffix = result.source === "remote" ? ", resolved from LFS" : ""
        console.log(` βœ“ ${fileName} (${(result.size / 1024).toFixed(0)} KB${suffix})`)
      } catch (error) {
        // Optional artifacts may not be published yet: swallow ENOENT for
        // those only β€” any other failure should abort the build.
        if (
          OPTIONAL_CACHE_ROOT_FILES.has(fileName) &&
          typeof error === "object" &&
          error != null &&
          "code" in error &&
          error.code === "ENOENT"
        ) {
          console.log(` β—‹ ${fileName} not published yet; skipping`)
          continue
        }
        throw error
      }
    }
    // peer-ranks.json is written to public/ (served as a static asset),
    // not into the cache dir like the other root files.
    const peerRanksResult = await copySnapshotFile(
      tempDir,
      "peer-ranks.json",
      path.join(publicDir, "peer-ranks.json")
    )
    const peerRanksSuffix = peerRanksResult.source === "remote" ? ", resolved from LFS" : ""
    console.log(` βœ“ peer-ranks.json (${(peerRanksResult.size / 1024).toFixed(0)} KB${peerRanksSuffix})`)
    if (!isDuckDBLean) {
      // Canonicalize family ids / merge aliased variants in the cached JSON.
      await normalizeCachedModelCardFile(path.join(cacheDir, "model-cards.json"))
      await normalizeCachedModelCardFile(path.join(cacheDir, "model-cards-lite.json"))
      console.log(" βœ“ normalized model card artifacts")
    }
    // ── Phase 3: Detail directories ─────────────────────────────────────
    console.log("\nPhase 3: Copy detail directories")
    for (const directoryName of CACHE_DIRECTORIES) {
      const destinationPath = path.join(cacheDir, directoryName)
      await ensureCleanDirectory(destinationPath)
      const result = await copySnapshotDirectory(tempDir, directoryName, destinationPath)
      const remoteSuffix = result.remoteCount > 0 ? `, ${result.remoteCount} resolved from LFS` : ""
      console.log(` βœ“ ${directoryName}/ (${result.fileCount} files${remoteSuffix})`)
    }
    console.log("\nDone.")
  } finally {
    // Always remove the temp clone, even when a phase threw.
    await fs.rm(tempDir, { recursive: true, force: true })
  }
}
// Run the script; surface any failure and exit non-zero so CI fails loudly.
main().catch((error) => {
  console.error(error)
  process.exit(1)
})