general-eval-card / lib /hf-data.ts
j-chim's picture
WIP: v2 cleanup checkpoint before merging origin/main
d249d5b
import "server-only"
import { promises as fs } from "fs"
import path from "path"
import type {
BackendManifest,
BackendManifestStatus,
ComparisonIndex,
CorpusAggregates,
EvalHierarchy,
EvalcardsAnnotations,
HierarchyBenchmark,
HierarchyComposite,
HierarchyFamily,
HierarchyMetric,
HierarchySlice,
HierarchyTags,
PeerRanksMap,
RowAnnotations,
SignalSummaries,
} from "@/lib/backend-artifacts"
import type {
BenchmarkCard,
BenchmarkEvaluation,
EvalTag,
EvaluationResult,
MetricConfig,
ModelInfo,
SampleResult,
SourceData,
SourceMetadata,
} from "@/lib/benchmark-schema"
import { getCanonicalModelIdentity, getModelFamilyRouteId } from "@/lib/model-family"
// ---------------------------------------------------------------------------
// HuggingFace dataset location
// ---------------------------------------------------------------------------
// Repository id of the dataset that remote artifact paths are resolved against.
const HF_DATASET = "evaleval/card_backend"
// Root URL for artifact downloads; a dataset-relative path is appended to it.
const HF_BASE = ["https://huggingface.co/datasets", HF_DATASET, "resolve", "main"].join("/")
// ---------------------------------------------------------------------------
// Local disk cache (populated by scripts/cache-hf-data.mjs during build)
// ---------------------------------------------------------------------------
// Setting HF_DATA_LOCAL_DIR redirects the JSON read path to another directory
// (for example a sibling repo's pipeline output, for parity testing against the
// DuckDB backend). When it is unset or blank, the cache written by
// scripts/cache-hf-data.mjs under <cwd>/.cache/hf-data is used.
const LOCAL_CACHE_DIR = (() => {
  const override = process.env.HF_DATA_LOCAL_DIR?.trim()
  if (override) {
    return path.resolve(override)
  }
  return path.join(process.cwd(), ".cache", "hf-data")
})()
/**
 * Read and JSON-parse one artifact from the local disk cache.
 *
 * @param relativePath Artifact path relative to LOCAL_CACHE_DIR (e.g. "manifest.json").
 * @returns The parsed JSON, cast to `T` without validation, or `null` when the
 *   file is missing, cannot be read or parsed, or the path would resolve
 *   outside LOCAL_CACHE_DIR. Never throws.
 */
async function readLocalCache<T>(relativePath: string): Promise<T | null> {
  const filePath = path.join(LOCAL_CACHE_DIR, relativePath)

  // Paths are assembled from caller-supplied slugs (see fetchModelDetail /
  // fetchEvalDetail), so refuse anything whose ".." segments climb out of the
  // cache directory instead of reading arbitrary files.
  const offset = path.relative(LOCAL_CACHE_DIR, filePath)
  const escapesCacheDir =
    offset === "" ||
    offset === ".." ||
    offset.startsWith(`..${path.sep}`) ||
    path.isAbsolute(offset)
  if (escapesCacheDir) {
    console.warn(`[hf-data] Refusing to read local cache path outside cache dir: ${relativePath}`)
    return null
  }

  try {
    const text = await fs.readFile(filePath, "utf8")
    return JSON.parse(text) as T
  } catch (err) {
    // A missing file is the normal "not cached" case and stays silent; anything
    // else (permissions, corrupt JSON, ...) is still treated as a miss but is
    // surfaced so a broken cache does not go unnoticed.
    const code = (err as { code?: unknown } | null)?.code
    if (code !== "ENOENT" && code !== "ENOTDIR") {
      console.warn(`[hf-data] Failed to read local cache file ${filePath}:`, err)
    }
    return null
  }
}
// ---------------------------------------------------------------------------
// In-memory cache (always active to avoid HF rate limits)
// ---------------------------------------------------------------------------
/** One cached artifact; entries are keyed in `cache` by dataset-relative path. */
interface CacheEntry {
  /** Parsed JSON payload; readers cast it to the type they expect. */
  data: unknown
  /** Signature of the manifest the payload was stored under, when one was known. */
  manifestSignature?: string
  /** `Date.now()` at the moment the entry was stored. */
  ts: number
}
const cache: Map<string, CacheEntry> = new Map()
/**
 * Parse a millisecond TTL taken from an environment variable.
 *
 * @param raw Raw environment value, or `undefined` when the variable is unset.
 * @param fallbackMs Value used when `raw` is unset or is not a parseable integer.
 * @returns The parsed integer, or `fallbackMs`. Never NaN: a NaN TTL would make
 *   every `elapsed >= ttl` comparison false, which silently disables refreshes.
 */
function parseTtlMs(raw: string | undefined, fallbackMs: number): number {
  if (raw == null) {
    return fallbackMs
  }
  const parsed = Number.parseInt(raw, 10)
  return Number.isNaN(parsed) ? fallbackMs : parsed
}
// TTL for in-memory artifact entries. HF_DATA_CACHE_TTL_MS overrides the
// default of 60s in production and 0 elsewhere.
const CACHE_TTL_MS = parseTtlMs(
  process.env.HF_DATA_CACHE_TTL_MS,
  process.env.NODE_ENV === "production" ? 60 * 1000 : 0
)
// Minimum age of the manifest snapshot before another refresh is queued.
// HF_MANIFEST_CACHE_TTL_MS overrides the default of 30s in production and 0
// elsewhere; 0 queues a refresh on every read.
const MANIFEST_TTL_MS = parseTtlMs(
  process.env.HF_MANIFEST_CACHE_TTL_MS,
  process.env.NODE_ENV === "production" ? 30 * 1000 : 0
)
/** Remote and on-disk manifests side by side, plus when the pair was last refreshed. */
type ManifestSnapshot = {
  remote: BackendManifest | null
  local: BackendManifest | null
  /** `Date.now()` of the last refresh; 0 until a refresh has completed. */
  ts: number
}
// Most recent manifest snapshot; null until the local manifest has been read.
let manifestSnapshotCache: ManifestSnapshot | null = null
// Memoised read of the on-disk manifest.json.
let localManifestPromise: Promise<BackendManifest | null> | null = null
// In-flight manifest snapshot refresh, if any; cleared when it settles.
let manifestRefreshPromise: Promise<void> | null = null
// Manifest signature the current batch of artifact refreshes is moving towards.
let refreshTargetManifestSignature: string | null = null
// Set when any refresh in the current batch fails.
let refreshTargetFailed = false
// Signature and manifest currently treated as active.
let activeManifestSignature: string | null = null
let activeManifest: BackendManifest | null = null
// In-flight background artifact refreshes, keyed by dataset-relative path.
const backgroundRefreshes: Map<string, Promise<void>> = new Map()
// Every artifact path that has been requested through fetchHFJson.
const observedPaths: Set<string> = new Set()
/**
 * Cheap structural check that a cached payload has the shape expected for its path.
 *
 * Anything that is not a non-null object is rejected. Path-specific rules:
 * - `models/…`: `hierarchy_by_category` must be present.
 * - `evals/…`: `metrics` must be an array.
 * - `eval-list.json`: `evals` is missing/non-array/empty, or its first entry
 *   has a string `benchmark_family_key`.
 * - `comparison-index.json`: both `evals` and `by_model` must be present.
 * Every other path is accepted.
 */
function isCanonicalCacheShape(relativePath: string, data: unknown) {
  if (typeof data !== "object" || data === null) {
    return false
  }
  const fields = data as Record<string, unknown>

  if (relativePath.startsWith("models/")) {
    return fields.hierarchy_by_category != null
  }
  if (relativePath.startsWith("evals/")) {
    return Array.isArray(fields.metrics)
  }

  switch (relativePath) {
    case "eval-list.json": {
      const entries = fields.evals
      if (!Array.isArray(entries) || entries.length === 0) {
        return true
      }
      return typeof entries[0]?.benchmark_family_key === "string"
    }
    case "comparison-index.json":
      return fields.evals != null && fields.by_model != null
    default:
      return true
  }
}
/**
 * Build a comparable identity string for a manifest.
 *
 * The signature is the JSON encoding of `generated_at`, `config_version` and
 * the sorted `skipped_configs`, so two manifests compare equal regardless of
 * the order in which skipped configs are listed.
 *
 * @param manifest Manifest to fingerprint; may be null/undefined.
 * @returns The signature string, or `null` when no manifest is given.
 */
function getManifestSignature(manifest: BackendManifest | null | undefined) {
  if (!manifest) {
    return null
  }
  // Manifests come from JSON.parse without validation, so `skipped_configs`
  // may be absent or malformed at runtime. Spreading a non-array would throw
  // inside the background refresh; treat it as "nothing skipped" instead.
  const skippedConfigs = Array.isArray(manifest.skipped_configs)
    ? [...manifest.skipped_configs].sort()
    : []
  return JSON.stringify({
    generated_at: manifest.generated_at,
    config_version: manifest.config_version,
    skipped_configs: skippedConfigs,
  })
}
// HF_DATA_OFFLINE=1 turns off every network fetch so that reads are served
// entirely from LOCAL_CACHE_DIR. The DuckDB parity setup relies on this: two
// servers reading the same on-disk artifacts cannot drift apart mid-test
// because of a background refresh. It is also handy for offline development.
const OFFLINE = process.env.HF_DATA_OFFLINE === "1"
// Backend selector from DATA_BACKEND, trimmed and lower-cased (undefined when unset).
const DATA_BACKEND_VERSION = process.env.DATA_BACKEND?.trim().toLowerCase()
/** True when DATA_BACKEND selects the view-layer backend ("v2" or "stage-j"). */
function useViewLayerBackend() {
  switch (DATA_BACKEND_VERSION) {
    case "v2":
    case "stage-j":
      return true
    default:
      return false
  }
}
/** Dynamically import the sidecar reader module; only the view-layer branches call this. */
async function fetchSnapshotSidecars() {
  const sidecars = await import("@/lib/sidecars")
  return sidecars
}
/**
 * Download and parse one JSON artifact from the HuggingFace dataset.
 *
 * Only an HTTP 429 response is retried (up to 3 attempts, waiting 1s and then
 * 2s between them). Every other failure is thrown immediately: a non-2xx
 * status, a network error, or a body that is not valid JSON.
 *
 * @param relativePath Artifact path relative to HF_BASE.
 * @returns The parsed body, cast to `T` without validation.
 * @throws Error when HF_DATA_OFFLINE=1, when the response is not OK, or when
 *   the request is still rate limited after the last attempt.
 */
async function fetchRemoteJson<T>(relativePath: string): Promise<T> {
  if (OFFLINE) {
    throw new Error(`HF_DATA_OFFLINE=1: refusing remote fetch for ${relativePath}`)
  }
  const url = `${HF_BASE}/${relativePath}`
  const maxAttempts = 3
  let lastError: Error | null = null
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    if (attempt > 0) {
      // Linear backoff between rate-limited attempts.
      await new Promise((resolve) => setTimeout(resolve, 1000 * attempt))
    }
    const res = await fetch(url, { cache: "no-store" })
    // The retry decision is made on the status code alone. Matching "429"
    // against the error text would also retry unrelated failures whose URL or
    // message happens to contain those digits (e.g. a slug ending in "0429").
    if (res.status === 429) {
      lastError = new Error(`HF rate limited (429) for ${url}`)
      continue
    }
    if (!res.ok) {
      throw new Error(`HF fetch failed: ${res.status} ${res.statusText} for ${url}`)
    }
    return (await res.json()) as T
  }
  throw lastError ?? new Error(`HF fetch failed for ${url}`)
}
/**
 * Return the on-disk manifest (or null), reading it at most once.
 *
 * Side effects: seeds `manifestSnapshotCache` when it is still empty, fills in
 * its `local` slot when that slot is empty, and adopts the local manifest as
 * the active one while no active signature has been chosen yet.
 */
async function getLocalManifest() {
  if (localManifestPromise === null) {
    localManifestPromise = readLocalCache<BackendManifest>("manifest.json")
  }
  const localManifest = await localManifestPromise

  if (manifestSnapshotCache === null) {
    manifestSnapshotCache = { remote: null, local: localManifest, ts: 0 }
  } else if (localManifest != null && manifestSnapshotCache.local == null) {
    manifestSnapshotCache.local = localManifest
  }

  if (!activeManifestSignature) {
    const localSignature = getManifestSignature(localManifest)
    activeManifestSignature = localSignature
    activeManifest = localManifest
  }
  return localManifest
}
/**
 * Start a background re-download of one artifact for a newer manifest.
 *
 * At most one refresh per path is in flight. When the last in-flight refresh
 * of a batch settles, no refresh in the batch failed, and the batch target is
 * still `manifestSignature`, that signature and `remoteManifest` become the
 * active manifest.
 *
 * @param relativePath Dataset-relative artifact path to re-download.
 * @param manifestSignature Signature of the manifest the refresh is for.
 * @param remoteManifest Manifest to promote to active once the batch succeeds.
 */
function queueArtifactRefresh(
relativePath: string,
manifestSignature: string,
remoteManifest: BackendManifest | null
) {
// A refresh for this path is already running; do not start a second one.
if (backgroundRefreshes.has(relativePath)) {
return
}
// A new batch begins when nothing is in flight or the target manifest changed:
// reset the target and clear the failure flag left by the previous batch.
if (backgroundRefreshes.size === 0 || refreshTargetManifestSignature !== manifestSignature) {
refreshTargetManifestSignature = manifestSignature
refreshTargetFailed = false
}
const refreshPromise = (async () => {
try {
const data = await fetchRemoteJson<unknown>(relativePath)
// Only payloads that pass the shape check replace the in-memory entry.
// NOTE(review): a payload that fails the check is dropped without setting
// refreshTargetFailed, so the manifest can still be promoted below while
// this path keeps its older entry — confirm that is intended.
if (isCanonicalCacheShape(relativePath, data)) {
cache.set(relativePath, {
data,
ts: Date.now(),
manifestSignature,
})
}
} catch (err) {
// Any failed download marks the whole batch as failed, which blocks promotion.
refreshTargetFailed = true
console.warn(`[hf-data] Background refresh failed for ${relativePath}:`, err)
} finally {
backgroundRefreshes.delete(relativePath)
// Promote the manifest only when this was the last in-flight refresh, the
// batch had no failure, and no newer target superseded this one meanwhile.
if (
backgroundRefreshes.size === 0 &&
!refreshTargetFailed &&
refreshTargetManifestSignature === manifestSignature
) {
activeManifestSignature = manifestSignature
activeManifest = remoteManifest
}
}
})()
// The async body has already run up to its first await; register it so
// concurrent callers see the refresh as in flight.
backgroundRefreshes.set(relativePath, refreshPromise)
}
/**
 * Queue a background refresh for every previously requested artifact when the
 * remote manifest differs from the active one. Does nothing when there is no
 * remote manifest or its signature is already active.
 */
function queueObservedPathRefreshes(snapshot: {
  remote: BackendManifest | null
  local: BackendManifest | null
}) {
  const targetSignature = getManifestSignature(snapshot.remote)
  const alreadyActive = targetSignature === activeManifestSignature
  if (!targetSignature || alreadyActive) {
    return
  }
  observedPaths.forEach((observedPath) => {
    // The manifest itself is refreshed through the snapshot path, not here.
    if (observedPath === "manifest.json") {
      return
    }
    queueArtifactRefresh(observedPath, targetSignature, snapshot.remote)
  })
}
/**
 * Refresh the manifest snapshot in the background, sharing one in-flight run.
 *
 * The run re-reads the local manifest, tries to download the remote one (a
 * failed download is logged and recorded as `remote: null`), replaces
 * `manifestSnapshotCache`, adopts the remote manifest as active when nothing
 * is active yet, and finally queues refreshes for observed artifact paths.
 *
 * @returns The promise of the in-flight (or newly started) refresh.
 */
function queueManifestSnapshotRefresh() {
  if (manifestRefreshPromise !== null) {
    return manifestRefreshPromise
  }

  const refreshSnapshot = async (): Promise<void> => {
    const localManifest = await getLocalManifest()

    let remoteManifest: BackendManifest | null = null
    try {
      remoteManifest = await fetchRemoteJson<BackendManifest>("manifest.json")
    } catch (err) {
      console.warn("[hf-data] Failed to fetch remote manifest:", err)
    }

    const refreshed = { remote: remoteManifest, local: localManifest, ts: Date.now() }
    manifestSnapshotCache = refreshed

    const remoteSignature = getManifestSignature(remoteManifest)
    if (!activeManifestSignature && remoteSignature) {
      activeManifestSignature = remoteSignature
      activeManifest = remoteManifest
    }
    queueObservedPathRefreshes(refreshed)
  }

  const inFlight = refreshSnapshot().finally(() => {
    // Allow the next caller to start a fresh run once this one has settled.
    manifestRefreshPromise = null
  })
  manifestRefreshPromise = inFlight
  return inFlight
}
/**
 * Return the current manifest snapshot without waiting for the network.
 *
 * When the snapshot is older than MANIFEST_TTL_MS (or the TTL is 0), a
 * background refresh is queued; the caller still receives the snapshot as it
 * is right now, which may have `remote: null` until that refresh completes.
 *
 * @returns The cached `{ remote, local, ts }` snapshot.
 */
async function getManifestSnapshot() {
  const local = await getLocalManifest()
  if (!manifestSnapshotCache) {
    manifestSnapshotCache = {
      remote: null,
      local,
      ts: 0,
    }
  }
  const snapshot = manifestSnapshotCache
  if (MANIFEST_TTL_MS === 0 || Date.now() - snapshot.ts >= MANIFEST_TTL_MS) {
    // Fire-and-forget: attach a handler so that a failure inside the refresh
    // is logged here instead of surfacing as an unhandled promise rejection.
    void queueManifestSnapshotRefresh().catch((err) => {
      console.warn("[hf-data] Manifest snapshot refresh failed:", err)
    })
  }
  return snapshot
}
/**
 * Pick the manifest that matches the data currently being served.
 *
 * The remote manifest wins only when its signature is the active one;
 * otherwise the active manifest is used, then the local one, then the remote.
 */
function getCurrentManifestFromSnapshot(snapshot: {
  remote: BackendManifest | null
  local: BackendManifest | null
}) {
  const { remote, local } = snapshot
  const remoteSignature = getManifestSignature(remote)
  const remoteIsActive = Boolean(remoteSignature) && remoteSignature === activeManifestSignature
  if (remoteIsActive) {
    return remote
  }
  return activeManifest ?? local ?? remote
}
/**
 * Resolve one dataset artifact, preferring cached copies over the network.
 *
 * `manifest.json` is served from the manifest snapshot (remote first, then
 * local). Every other path is resolved in this order:
 *   1. a valid in-memory entry (a background refresh is queued when it was
 *      stored under a different manifest than the current remote one);
 *   2. a valid local-disk copy (same background refresh rule);
 *   3. a live download, only when no usable cached copy exists.
 *
 * @param relativePath Dataset-relative artifact path.
 * @returns The artifact, cast to `T` without full validation.
 * @throws Error when no manifest is available for `manifest.json`, or when the
 *   live download in step 3 fails.
 */
async function fetchHFJson<T>(relativePath: string): Promise<T> {
  if (relativePath === "manifest.json") {
    let snapshot = await getManifestSnapshot()
    // Cold start without a local manifest: the remote fetch has only just been
    // queued, so wait for it rather than failing before it had a chance to
    // finish. Only an already in-flight refresh is awaited, so the manifest
    // TTL still limits how often the remote is contacted.
    if (!snapshot.remote && !snapshot.local && manifestRefreshPromise) {
      try {
        await manifestRefreshPromise
      } catch (err) {
        console.warn("[hf-data] Manifest refresh failed:", err)
      }
      // The refresh replaces the snapshot object, so read it again.
      snapshot = manifestSnapshotCache ?? snapshot
    }
    if (snapshot.remote) {
      return snapshot.remote as T
    }
    if (snapshot.local) {
      return snapshot.local as T
    }
    throw new Error("HF manifest fetch failed and no local manifest cache is available")
  }

  const manifestSnapshot = await getManifestSnapshot()
  const remoteManifestSignature = getManifestSignature(manifestSnapshot.remote)
  const localManifestSignature = getManifestSignature(manifestSnapshot.local)
  // Remember the path so it is re-downloaded when a newer manifest shows up.
  observedPaths.add(relativePath)

  // 1. In-memory cache (hot)
  const hit = cache.get(relativePath)
  const validHotCache = hit ? isCanonicalCacheShape(relativePath, hit.data) : false
  if (hit && !validHotCache) {
    // Drop entries that do not have the expected shape.
    cache.delete(relativePath)
  }
  // Fresh entry that matches the current remote manifest (or no remote
  // manifest is known yet).
  // NOTE(review): the branch right below returns the same entry in this case
  // as well, so CACHE_TTL_MS currently does not change what is returned.
  if (
    hit &&
    validHotCache &&
    CACHE_TTL_MS > 0 &&
    Date.now() - hit.ts < CACHE_TTL_MS &&
    (!remoteManifestSignature || hit.manifestSignature === remoteManifestSignature)
  ) {
    return hit.data as T
  }
  // Any other valid entry is served as-is; when it belongs to a different
  // manifest than the remote one, a newer copy is fetched in the background.
  if (hit && validHotCache) {
    if (remoteManifestSignature && hit.manifestSignature !== remoteManifestSignature) {
      queueArtifactRefresh(relativePath, remoteManifestSignature, manifestSnapshot.remote)
    }
    return hit.data as T
  }

  const local = await readLocalCache<T>(relativePath)
  const validLocalCache = local !== null && isCanonicalCacheShape(relativePath, local)

  // 2. If the local cache was built from the same manifest, keep using it.
  if (
    validLocalCache &&
    remoteManifestSignature &&
    localManifestSignature &&
    remoteManifestSignature === localManifestSignature
  ) {
    cache.set(relativePath, {
      data: local,
      ts: Date.now(),
      manifestSignature: remoteManifestSignature,
    })
    return local
  }

  // 3. Serve the local cache immediately and refresh in the background when the
  //    manifest indicates newer data exists.
  if (validLocalCache) {
    cache.set(relativePath, {
      data: local,
      ts: Date.now(),
      manifestSignature: localManifestSignature ?? undefined,
    })
    if (remoteManifestSignature && remoteManifestSignature !== localManifestSignature) {
      queueArtifactRefresh(relativePath, remoteManifestSignature, manifestSnapshot.remote)
    }
    return local
  }

  // 4. Fall back to a live fetch only when there is no usable cached copy.
  //    A valid in-memory entry always returns in step 1, so no stale entry is
  //    left to fall back on here; a failed download propagates to the caller.
  const data = await fetchRemoteJson<T>(relativePath)
  cache.set(relativePath, {
    data,
    ts: Date.now(),
    manifestSignature: remoteManifestSignature ?? undefined,
  })
  return data
}
/**
 * Like fetchHFJson, but resolves to `null` instead of throwing.
 *
 * A missing artifact (HTTP 404 reported by fetchRemoteJson) is an expected
 * outcome and is not logged; every other failure is logged as a warning.
 *
 * @param relativePath Dataset-relative artifact path.
 * @returns The artifact, or `null` when it could not be obtained.
 */
async function fetchHFJsonSafe<T>(relativePath: string): Promise<T | null> {
  try {
    return await fetchHFJson<T>(relativePath)
  } catch (err) {
    // Match the status position of fetchRemoteJson's message
    // ("HF fetch failed: <status> <statusText> for <url>"). Searching the whole
    // text for "404" would also silence other failures whose URL or message
    // merely contains those digits.
    const isNotFound = err instanceof Error && err.message.startsWith("HF fetch failed: 404 ")
    if (!isNotFound) {
      console.warn(`[hf-data] Failed to fetch ${relativePath}:`, err)
    }
    return null
  }
}
// ---------------------------------------------------------------------------
// HF dataset types (shapes of JSON files in the HF repo)
// ---------------------------------------------------------------------------
/**
 * One model's result row for a metric. Used as the element type of
 * `model_results` in both HFEvalMetric and HFModelHierarchyMetric.
 */
export interface HFEvalModelResult {
model_id: string
model_route_id: string
model_name: string
developer: string
raw_model_id?: string
score: number
evaluation_id?: string
retrieved_timestamp?: string
source_record_url?: string
// The pipeline copies the parent record's provenance onto every hierarchy
// model_result row. Required.
source_metadata: SourceMetadata
// Either a structured SourceData object or a plain list of strings.
source_data?: SourceData | string[]
detailed_evaluation_results?: string | null
// The fields typed `unknown` below have no shape declared in this file.
detailed_evaluation_results_meta?: unknown
instance_level_data?: unknown
passthrough_top_level_fields?: unknown
// Optional per-row annotations.
evalcards?: { annotations?: RowAnnotations }
}
/**
 * One metric of an eval together with the per-model results for it.
 * Element type of `HFEvalDetail.metrics`.
 */
export interface HFEvalMetric {
metric_summary_id: string
legacy_eval_summary_id?: string
evaluation_name?: string
metric_name: string
metric_key: string
display_name?: string
canonical_display_name?: string
// Either a typed MetricConfig or an untyped key/value record.
metric_config?: MetricConfig | Record<string, unknown>
lower_is_better: boolean
model_results: HFEvalModelResult[]
}
/**
 * Shape of an `evals/<slug>.json` file, as returned by fetchEvalDetail.
 * Also carries the fields of SignalSummaries.
 */
export interface HFEvalDetail extends SignalSummaries {
eval_summary_id: string
benchmark: string
canonical_display_name?: string
benchmark_family_key: string
benchmark_leaf_key: string
benchmark_leaf_name: string
benchmark_parent_key?: string
benchmark_parent_name?: string
category: string
source_data: SourceData
// null when no benchmark card is attached.
benchmark_card: BenchmarkCard | null
// Cache validation for `evals/` paths requires this to be an array.
metrics: HFEvalMetric[]
// Element shape is not declared in this file.
subtasks: unknown[]
evalcards?: { annotations?: EvalcardsAnnotations }
}
/**
 * Shape of a `models/<slug>.json` file, as returned by fetchModelDetail.
 * Also carries the fields of SignalSummaries.
 */
export interface HFModelDetail extends SignalSummaries {
// ModelInfo extended with optional family/variant/route identifiers.
model_info: ModelInfo & {
family_id?: string
family_slug?: string
variant_key?: string
variant_label?: string
model_route_id?: string
}
model_family_id: string
model_route_id: string
model_family_name?: string
raw_model_ids?: string[]
last_updated?: string
// Hierarchy nodes grouped under string keys (presumably category names — confirm).
// Cache validation for `models/` paths requires this field to be present.
hierarchy_by_category: Record<string, HFModelHierarchyNode[]>
evaluations_by_category?: Record<string, HFModelEvaluation[]>
total_evaluations: number
categories_covered: string[]
variants: Array<{
variant_key: string
variant_label: string
evaluation_count: number
raw_model_ids: string[]
}>
}
/**
 * Evaluation entry inside an HF model detail file.
 * Element type of `HFModelDetail.evaluations_by_category`.
 */
export interface HFModelEvaluation {
schema_version?: string
evaluation_id: string
retrieved_timestamp: string
benchmark?: string
// Either a structured SourceData object or a plain list of strings.
source_data?: SourceData | string[]
source_metadata?: SourceMetadata
eval_library?: { name: string; version?: string; additional_details?: Record<string, unknown> }
model_info?: ModelInfo
// Reuses the generation_config type declared on BenchmarkEvaluation.
generation_config?: BenchmarkEvaluation["generation_config"]
evaluation_results: EvaluationResult[]
source_record_url?: string
detailed_evaluation_results?: string | null
// The fields typed `unknown` below have no shape declared in this file.
detailed_evaluation_results_meta?: unknown
instance_level_data?: unknown
benchmark_card?: BenchmarkCard | null
passthrough_top_level_fields?: unknown
}
/**
 * One metric attached to a hierarchy node in a model detail file.
 * Element type of `HFModelHierarchyNode.metrics`.
 */
export interface HFModelHierarchyMetric {
metric_summary_id: string
legacy_eval_summary_id?: string
evaluation_name: string
display_name: string
canonical_display_name?: string
benchmark_leaf_key: string
benchmark_leaf_name: string
// Slice fields may be absent or explicitly null.
slice_key?: string | null
slice_name?: string | null
lower_is_better: boolean
metric_name: string
metric_id: string
metric_key: string
metric_source?: string
metric_config: MetricConfig
models_count: number
top_score?: number
model_results: HFEvalModelResult[]
}
/**
 * One node of the benchmark hierarchy in a model detail file. Nodes are
 * recursive: `subtasks` holds child nodes of the same type. Element type of
 * `HFModelDetail.hierarchy_by_category`.
 */
export interface HFModelHierarchyNode {
eval_summary_id: string
benchmark: string
canonical_display_name?: string
benchmark_family_key: string
benchmark_family_name: string
benchmark_parent_key: string
benchmark_parent_name: string
benchmark_leaf_key: string
benchmark_leaf_name: string
// Component fields may be absent or explicitly null.
benchmark_component_key?: string | null
benchmark_component_name?: string | null
evaluation_name: string
display_name: string
is_summary_score: boolean
category: string
source_data: SourceData
// null when no benchmark card is attached.
benchmark_card: BenchmarkCard | null
tags: {
domains: string[]
languages: string[]
tasks: string[]
}
subtasks_count: number
metrics_count: number
metric_names: string[]
primary_metric_name: string
metrics: HFModelHierarchyMetric[]
// Child nodes, same shape as this node.
subtasks: HFModelHierarchyNode[]
top_score?: number
constituent_evaluation_ids?: string[]
// Optional summary of instance-level data for this node.
instance_data?: {
available: boolean
url_count: number
sample_urls: string[]
models_with_loaded_instances: number
}
}
// ---------------------------------------------------------------------------
// Public data fetchers
// ---------------------------------------------------------------------------
/**
 * Load the benchmark metadata map of BenchmarkCard records keyed by string.
 * The view-layer backend reads it from `@/lib/view-data`; otherwise it comes
 * from `benchmark-metadata.json`.
 */
export async function fetchBenchmarkMetadataMap(): Promise<Record<string, BenchmarkCard>> {
  if (!useViewLayerBackend()) {
    return fetchHFJson<Record<string, BenchmarkCard>>("benchmark-metadata.json")
  }
  const viewData = await import("@/lib/view-data")
  return viewData.getBenchmarkMetadataMap()
}
/**
 * Load the backend manifest: from the snapshot sidecars under the view-layer
 * backend, otherwise from `manifest.json`.
 */
export async function fetchBackendManifest(): Promise<BackendManifest> {
  if (!useViewLayerBackend()) {
    return fetchHFJson<BackendManifest>("manifest.json")
  }
  const sidecars = await fetchSnapshotSidecars()
  return sidecars.fetchManifest()
}
/**
 * Load the eval hierarchy from the active backend and run it through
 * adaptEvalHierarchy before returning it.
 */
export async function fetchEvalHierarchy(): Promise<EvalHierarchy> {
  const raw = useViewLayerBackend()
    ? await (await fetchSnapshotSidecars()).fetchHierarchy()
    : await fetchHFJson<EvalHierarchy>("eval-hierarchy.json")
  return adaptEvalHierarchy(raw)
}
/**
 * Validate-and-passthrough for the v3 hierarchy shape. The producer's
 * `write_hierarchy()` emits family-rooted trees directly; the adapter
 * no longer synthesises legacy shapes.
 *
 * Behaviour:
 *   - Missing input (null/undefined): returns an empty `{ families: [] }`.
 *   - `schema_version` is a string starting with `"v3.hierarchy."`:
 *     pass through unchanged, silently.
 *   - `schema_version` missing (older snapshot), not a string, or some
 *     other version: log a warning and pass through. Consumers may
 *     render empty for missing fields but won't crash.
 *
 * The adapter no longer synthesises `families[].composites[]` from a
 * top-level `composites[]`; the producer does that grouping at write
 * time using `canonical_composites.family_id`.
 *
 * @param raw Hierarchy as loaded from the backend.
 * @returns `raw` itself (never a copy), or an empty hierarchy when `raw` is missing.
 */
export function adaptEvalHierarchy(raw: EvalHierarchy): EvalHierarchy {
  if (!raw) {
    return { families: [] }
  }
  // The payload comes from unvalidated JSON, so check the runtime type before
  // calling string methods on it; a numeric version must not crash the page.
  const schemaVersion: unknown = raw.schema_version
  const isV3 = typeof schemaVersion === "string" && schemaVersion.startsWith("v3.hierarchy.")
  if (!isV3) {
    console.warn(
      `adaptEvalHierarchy: unexpected schema_version=${JSON.stringify(schemaVersion)}; ` +
        `expected v3.hierarchy.*. Frontend may render incompletely.`,
    )
  }
  return raw
}
/**
 * Load the comparison index: from the snapshot sidecars under the view-layer
 * backend, otherwise from `comparison-index.json`.
 */
export async function fetchComparisonIndex(): Promise<ComparisonIndex> {
  if (!useViewLayerBackend()) {
    return fetchHFJson<ComparisonIndex>("comparison-index.json")
  }
  const sidecars = await fetchSnapshotSidecars()
  return sidecars.fetchComparisonIndex()
}
/**
 * Per-(eval, model) primary-metric peer ranks.
 *
 * Under the view-layer backend (v2) the wrapped sidecar from the pinned
 * snapshot is read. The legacy path reads the bare-map `peer-ranks.json`
 * historically published unversioned at the dataset root, and yields an
 * empty map when that file cannot be loaded.
 */
export async function fetchPeerRanks(): Promise<PeerRanksMap> {
  if (useViewLayerBackend()) {
    const sidecars = await fetchSnapshotSidecars()
    return sidecars.fetchPeerRanks()
  }
  const legacyRanks = await fetchHFJsonSafe<PeerRanksMap>("peer-ranks.json")
  return legacyRanks ?? {}
}
/**
 * Load corpus-wide aggregates. The view-layer backend serves them through the
 * sidecars' `fetchHeadline()`; the legacy path reads `corpus-aggregates.json`
 * and resolves to `null` when it cannot be loaded.
 */
export async function fetchCorpusAggregates(): Promise<CorpusAggregates | null> {
  if (!useViewLayerBackend()) {
    return fetchHFJsonSafe<CorpusAggregates>("corpus-aggregates.json")
  }
  const sidecars = await fetchSnapshotSidecars()
  return sidecars.fetchHeadline()
}
/**
 * Load the detail file for one model (`models/<slug>.json`).
 *
 * @param slug File name stem of the model detail artifact.
 * @returns The parsed detail, or `null` when it cannot be loaded.
 */
export async function fetchModelDetail(slug: string): Promise<HFModelDetail | null> {
  const artifactPath = `models/${slug}.json`
  return fetchHFJsonSafe<HFModelDetail>(artifactPath)
}
/**
 * Load the detail file for one eval (`evals/<slug>.json`).
 *
 * @param slug File name stem of the eval detail artifact.
 * @returns The parsed detail, or `null` when it cannot be loaded.
 */
export async function fetchEvalDetail(slug: string): Promise<HFEvalDetail | null> {
  const artifactPath = `evals/${slug}.json`
  return fetchHFJsonSafe<HFEvalDetail>(artifactPath)
}