This view is limited to 50 files because it contains too many changes. See the raw diff here.
Files changed (50) hide show
  1. .gitignore +11 -0
  2. Dockerfile +17 -17
  3. app/developers/[id]/page.tsx +0 -46
  4. app/evals/[id]/page.tsx +32 -124
  5. app/evals/page.tsx +263 -9
  6. app/globals.css +291 -30
  7. app/models/page.tsx +35 -103
  8. app/page.tsx +1 -1
  9. components/benchmark-detail.tsx +3 -3
  10. components/eval-detail.tsx +125 -411
  11. components/family-table.tsx +234 -100
  12. components/param-range-picker.tsx +244 -0
  13. components/signals/benchmark-signals-strip.tsx +626 -0
  14. components/signals/corpus-dashboard.tsx +61 -42
  15. components/signals/corpus-signals-strip.tsx +21 -11
  16. data/benchmarks.json +0 -90
  17. data/developers.json +0 -3150
  18. data/developers/0-hero.json +0 -47
  19. data/developers/01-ai.json +0 -417
  20. data/developers/1-800-llms.json +0 -33
  21. data/developers/1024m.json +0 -33
  22. data/developers/152334h.json +0 -19
  23. data/developers/1tuanpham.json +0 -33
  24. data/developers/3rd-degree-burn.json +0 -61
  25. data/developers/4season.json +0 -19
  26. data/developers/aaditya.json +0 -19
  27. data/developers/aalf.json +0 -61
  28. data/developers/aashraf995.json +0 -61
  29. data/developers/abacusai.json +0 -145
  30. data/developers/abacusresearch.json +0 -19
  31. data/developers/abhishek.json +0 -75
  32. data/developers/abideen.json +0 -19
  33. data/developers/adamo1139.json +0 -19
  34. data/developers/adriszmar.json +0 -19
  35. data/developers/aellm.json +0 -33
  36. data/developers/aevalone.json +0 -19
  37. data/developers/agentlans.json +0 -131
  38. data/developers/agi-0.json +0 -47
  39. data/developers/ahdoot.json +0 -33
  40. data/developers/ahjeong.json +0 -33
  41. data/developers/ahmeda335.json +0 -19
  42. data/developers/ai-mo.json +0 -33
  43. data/developers/ai-sweden-models.json +0 -33
  44. data/developers/ai2.json +0 -89
  45. data/developers/ai21.json +0 -364
  46. data/developers/ai21labs.json +0 -19
  47. data/developers/ai4bharat.json +0 -19
  48. data/developers/ai4free.json +0 -33
  49. data/developers/aicoressecurity.json +0 -61
  50. data/developers/aidc-ai.json +0 -19
.gitignore CHANGED
@@ -31,3 +31,14 @@ next-env.d.ts
31
  .cache/
32
  pipeline_revised.py
33
  .local-data/
 
 
 
 
 
 
 
 
 
 
 
 
31
  .cache/
32
  pipeline_revised.py
33
  .local-data/
34
+
35
+ # Local data dumps. The runtime reads from .cache/hf-data/duckdb parquets;
36
+ # anything under /data/ is leftover from offline scripts and should not
37
+ # ship in git. The survey schema (loaded statically by app/survey) is the
38
+ # one exception.
39
+ /data/*
40
+ !/data/survey/
41
+ /data/survey/*
42
+ !/data/survey/eval-schema-fields.json
43
+ mock_design/
44
+ shoot.mjs
Dockerfile CHANGED
@@ -9,20 +9,18 @@ ARG PNPM_VERSION=10.25.0
9
 
10
  # Build-time data-source configuration. HF Spaces "Variables" are NOT injected
11
  # into Docker RUN steps automatically — only into the final runtime — so we
12
- # bake the DuckDB-mode defaults here. `cache-hf-data.mjs` reads these to know
13
- # which dataset to clone and to apply lean cache mode (skip JSON-fallback
14
- # artifacts). Override at build time via `--build-arg HF_DATASET_REPO=...`.
15
- ARG DATA_BACKEND=duckdb
16
  ARG HF_DATASET_REPO=https://huggingface.co/datasets/evaleval/card_backend
17
- # Static prerender (`next build`) executes route handlers, which call
18
- # `getModelCards` etc. `lib/duckdb-data.ts`, which requires
19
- # `LOCAL_PIPELINE_OUTPUT`. The cache populated by `cache-hf-data.mjs`
20
- # lives at `/app/.cache/hf-data`. `HF_DATA_OFFLINE=1` keeps the metadata
21
- # fetchers (`lib/hf-data.ts`) from attempting `evaleval/card_backend`
22
- # network reads with `revalidate: 0` (which Next 15 treats as dynamic
23
- # and fails the static export of `/`).
24
  ENV DATA_BACKEND=${DATA_BACKEND} \
25
  HF_DATASET_REPO=${HF_DATASET_REPO} \
 
26
  LOCAL_PIPELINE_OUTPUT=/app/.cache/hf-data \
27
  HF_DATA_LOCAL_DIR=/app/.cache/hf-data \
28
  HF_DATA_OFFLINE=1
@@ -49,13 +47,15 @@ RUN pnpm run build
49
  FROM node:18-bullseye-slim AS runner
50
  WORKDIR /app
51
 
52
- # Runtime needs the same DuckDB-mode envs that the builder used. HF Space
53
- # Variables aren't set on this Space, and Docker multi-stage doesn't carry
54
- # ENVs across stages — without these, lib/duckdb-data.ts throws
55
- # "DATA_BACKEND=duckdb requires LOCAL_PIPELINE_OUTPUT" at request time and
56
- # every model/eval/developer endpoint returns empty.
 
57
  ENV NODE_ENV=production \
58
- DATA_BACKEND=duckdb \
 
59
  LOCAL_PIPELINE_OUTPUT=/app/.cache/hf-data \
60
  HF_DATA_LOCAL_DIR=/app/.cache/hf-data \
61
  HF_DATA_OFFLINE=1
 
9
 
10
  # Build-time data-source configuration. HF Spaces "Variables" are NOT injected
11
  # into Docker RUN steps automatically — only into the final runtime — so we
12
+ # bake the selected backend here. `DATA_BACKEND=v2` reads `SNAPSHOT_URL`
13
+ # directly; legacy DuckDB mode still clones `HF_DATASET_REPO` into the cache.
14
+ # Override at build time via `--build-arg ...`.
15
+ ARG DATA_BACKEND=v2
16
  ARG HF_DATASET_REPO=https://huggingface.co/datasets/evaleval/card_backend
17
+ ARG SNAPSHOT_URL=https://huggingface.co/datasets/j-chim/temp_evalcard_backend/resolve/main/warehouse/2026-05-03T21-46-50Z
18
+ # Static prerender (`next build`) executes route handlers. In legacy mode the
19
+ # cache populated by `cache-hf-data.mjs` lives at `/app/.cache/hf-data`; in v2
20
+ # the cache step is skipped and the app reads the pinned Stage J snapshot.
 
 
 
21
  ENV DATA_BACKEND=${DATA_BACKEND} \
22
  HF_DATASET_REPO=${HF_DATASET_REPO} \
23
+ SNAPSHOT_URL=${SNAPSHOT_URL} \
24
  LOCAL_PIPELINE_OUTPUT=/app/.cache/hf-data \
25
  HF_DATA_LOCAL_DIR=/app/.cache/hf-data \
26
  HF_DATA_OFFLINE=1
 
47
  FROM node:18-bullseye-slim AS runner
48
  WORKDIR /app
49
 
50
+ ARG DATA_BACKEND=v2
51
+ ARG SNAPSHOT_URL=https://huggingface.co/datasets/j-chim/temp_evalcard_backend/resolve/main/warehouse/2026-05-03T21-46-50Z
52
+
53
+ # Runtime needs the same data-source envs that the builder used. Docker
54
+ # multi-stage doesn't carry ENVs across stages, so keep backend selection and
55
+ # snapshot/cache pointers explicit here too.
56
  ENV NODE_ENV=production \
57
+ DATA_BACKEND=${DATA_BACKEND} \
58
+ SNAPSHOT_URL=${SNAPSHOT_URL} \
59
  LOCAL_PIPELINE_OUTPUT=/app/.cache/hf-data \
60
  HF_DATA_LOCAL_DIR=/app/.cache/hf-data \
61
  HF_DATA_OFFLINE=1
app/developers/[id]/page.tsx CHANGED
@@ -9,7 +9,6 @@ import { InfiniteScrollSentinel } from "@/components/infinite-scroll"
9
  import { ModelTable } from "@/components/model-table"
10
  import { Navigation } from "@/components/navigation"
11
  import type { BenchmarkCard } from "@/lib/benchmark-schema"
12
- import { lookupBenchmarkCard } from "@/lib/benchmark-metadata-utils"
13
  import { fetchDeveloperSummary, fetchBenchmarkMetadata } from "@/lib/dashboard-data-client"
14
 
15
  const PAGE_SIZE = 40
@@ -59,24 +58,6 @@ export default function DeveloperDetailPage() {
59
  [models]
60
  )
61
 
62
- // Collect unique domains from benchmarks this developer's models are evaluated on
63
- const domainCoverage = useMemo(() => {
64
- const domainMap = new Map<string, Set<string>>()
65
- for (const model of models) {
66
- for (const { benchmark } of model.top_scores) {
67
- const card = lookupBenchmarkCard(benchmarkCards, benchmark)
68
- for (const domain of card?.benchmark_details?.domains ?? []) {
69
- const existing = domainMap.get(domain) ?? new Set()
70
- existing.add(benchmark)
71
- domainMap.set(domain, existing)
72
- }
73
- }
74
- }
75
- return Array.from(domainMap.entries())
76
- .map(([domain, benchmarks]) => ({ domain, count: benchmarks.size }))
77
- .sort((a, b) => b.count - a.count)
78
- }, [models, benchmarkCards])
79
-
80
  const filteredModels = useMemo(() => {
81
  const query = searchQuery.trim().toLowerCase()
82
  const filtered = query
@@ -207,12 +188,6 @@ export default function DeveloperDetailPage() {
207
  · <span className="text-[color:var(--fg)] tabular-nums font-semibold mr-1">{totalResults.toLocaleString()}</span>
208
  results
209
  </span>
210
- {domainCoverage.length > 0 && (
211
- <span>
212
- · <span className="text-[color:var(--fg)] tabular-nums font-semibold mr-1">{domainCoverage.length}</span>
213
- domains
214
- </span>
215
- )}
216
  </div>
217
 
218
  <span className="hidden h-5 w-px bg-[color:var(--border-soft)] sm:block" />
@@ -239,27 +214,6 @@ export default function DeveloperDetailPage() {
239
  </select>
240
  </div>
241
 
242
- {/* DOMAIN COVERAGE — hairline tag row ---------------------- */}
243
- {domainCoverage.length > 0 && (
244
- <div className="mb-8">
245
- <div className="kicker mb-3">Benchmark domain coverage</div>
246
- <div className="flex flex-wrap gap-1.5">
247
- {domainCoverage.map(({ domain, count }) => (
248
- <span
249
- key={domain}
250
- className="ec-tag outline"
251
- style={{ textTransform: "none", letterSpacing: "normal", fontFamily: "var(--font-sans)" }}
252
- >
253
- <span className="text-[12px] font-medium text-[color:var(--fg)] capitalize">{domain}</span>
254
- <span className="font-mono text-[10px] tabular-nums text-[color:var(--fg-muted)]">
255
- {count}
256
- </span>
257
- </span>
258
- ))}
259
- </div>
260
- </div>
261
- )}
262
-
263
  {/* TABLE ---------------------------------------------------- */}
264
  {filteredModels.length === 0 ? (
265
  <div className="border border-dashed border-[color:var(--border-soft)] bg-[color:var(--bg-warm)] py-12 text-center font-mono text-[11px] uppercase tracking-[0.2em] text-[color:var(--fg-subtle)]">
 
9
  import { ModelTable } from "@/components/model-table"
10
  import { Navigation } from "@/components/navigation"
11
  import type { BenchmarkCard } from "@/lib/benchmark-schema"
 
12
  import { fetchDeveloperSummary, fetchBenchmarkMetadata } from "@/lib/dashboard-data-client"
13
 
14
  const PAGE_SIZE = 40
 
58
  [models]
59
  )
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  const filteredModels = useMemo(() => {
62
  const query = searchQuery.trim().toLowerCase()
63
  const filtered = query
 
188
  · <span className="text-[color:var(--fg)] tabular-nums font-semibold mr-1">{totalResults.toLocaleString()}</span>
189
  results
190
  </span>
 
 
 
 
 
 
191
  </div>
192
 
193
  <span className="hidden h-5 w-px bg-[color:var(--border-soft)] sm:block" />
 
214
  </select>
215
  </div>
216
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  {/* TABLE ---------------------------------------------------- */}
218
  {filteredModels.length === 0 ? (
219
  <div className="border border-dashed border-[color:var(--border-soft)] bg-[color:var(--bg-warm)] py-12 text-center font-mono text-[11px] uppercase tracking-[0.2em] text-[color:var(--fg-subtle)]">
app/evals/[id]/page.tsx CHANGED
@@ -6,27 +6,11 @@ import Link from "next/link"
6
  import { ArrowLeft, ArrowUpRight, BarChart3, Grid3X3, Search } from "lucide-react"
7
  import { Navigation } from "@/components/navigation"
8
  import { EvalDetail } from "@/components/eval-detail"
 
9
  import { useAudienceMode } from "@/components/audience-mode-provider"
10
  import type { BenchmarkEvalSummary } from "@/lib/eval-processing"
11
  import { fetchEvalSummary } from "@/lib/dashboard-data-client"
12
-
13
- const PARAM_RANGE_VALUES = [1, 2, 3, 4, 6, 8, 10, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 384, 500] as const
14
- const PARAM_RANGE_MARKERS = [
15
- { label: "< 1B", step: 0 },
16
- { label: "6B", step: PARAM_RANGE_VALUES.indexOf(6) },
17
- { label: "12B", step: PARAM_RANGE_VALUES.indexOf(12) },
18
- { label: "32B", step: PARAM_RANGE_VALUES.indexOf(32) },
19
- { label: "128B", step: PARAM_RANGE_VALUES.indexOf(128) },
20
- { label: "> 500B", step: PARAM_RANGE_VALUES.length - 1 },
21
- ] as const
22
-
23
- function formatParamBoundLabel(step: number, bound: "min" | "max") {
24
- const maxStepIndex = PARAM_RANGE_VALUES.length - 1
25
- if (bound === "min" && step <= 0) return "< 1B"
26
- if (bound === "max" && step >= maxStepIndex) return "> 500B"
27
- const value = PARAM_RANGE_VALUES[step]
28
- return value != null ? `${value}B` : "Not reported"
29
- }
30
 
31
  export default function EvalDetailPage() {
32
  const params = useParams()
@@ -192,10 +176,7 @@ function CompositeEvalView({
192
  <div className="space-y-10">
193
  {/* HERO ------------------------------------------------------------- */}
194
  <header className="motion-academic-enter">
195
- <div className="kicker kicker-accent mb-2">
196
- {isPolicy ? "Benchmark suite" : "Composite · §3.2"}
197
- </div>
198
- <h1 className="ec-page-h1" style={{ marginTop: 4 }}>{summary.evaluation_name}</h1>
199
  <div
200
  className="mb-5 flex flex-wrap items-center gap-3 font-mono text-[11px] uppercase tracking-[0.12em]"
201
  style={{ color: "var(--fg-muted)" }}
@@ -434,7 +415,7 @@ function MatrixLeaderboard({
434
  const [page, setPage] = useState(1)
435
  const [hiddenCols, setHiddenCols] = useState<Set<string>>(new Set())
436
  const [minParamStep, setMinParamStep] = useState(0)
437
- const [maxParamStep, setMaxParamStep] = useState(PARAM_RANGE_VALUES.length - 1)
438
  const PAGE_SIZE = 50
439
 
440
  const metricDirection = useMemo(() => {
@@ -470,9 +451,9 @@ function MatrixLeaderboard({
470
  const avg = validScores.length > 0
471
  ? validScores.reduce((a, b) => a + b, 0) / validScores.length
472
  : 0
473
- let sizeB: number | null = null
474
- const sizeMatch = (data.name + " " + id).match(/\b(\d+(?:\.\d+)?)\s*[bB]\b/)
475
- if (sizeMatch) sizeB = parseFloat(sizeMatch[1])
476
 
477
  return { id, name: data.name, developer: data.developer, avg, scores: data.scores, sizeB }
478
  })
@@ -496,9 +477,11 @@ function MatrixLeaderboard({
496
  })
497
  }, [models, sortCol, sortAsc])
498
 
499
- const maxStepIndex = PARAM_RANGE_VALUES.length - 1
500
- const numericMinParams = minParamStep <= 0 ? null : (PARAM_RANGE_VALUES[minParamStep] ?? null)
501
- const numericMaxParams = maxParamStep >= maxStepIndex ? null : (PARAM_RANGE_VALUES[maxParamStep] ?? null)
 
 
502
 
503
  const query = search.trim().toLowerCase()
504
  const filteredModels = sortedModels.filter((m) => {
@@ -507,8 +490,9 @@ function MatrixLeaderboard({
507
  m.developer.toLowerCase().includes(query) ||
508
  m.id.toLowerCase().includes(query)
509
  )) return false
510
- if (numericMinParams != null && (m.sizeB == null || m.sizeB < numericMinParams)) return false
511
- if (numericMaxParams != null && (m.sizeB == null || m.sizeB > numericMaxParams)) return false
 
512
  return true
513
  })
514
 
@@ -594,99 +578,23 @@ function MatrixLeaderboard({
594
  />
595
  </div>
596
 
597
- {/* Param slider */}
598
- <div
599
- className="flex items-center gap-3 px-4 py-2"
600
- style={{ border: "1px solid var(--border-soft)", background: "var(--bg-warm)" }}
601
- >
602
- <span
603
- className="shrink-0 font-mono uppercase tracking-[0.14em]"
604
- style={{ fontSize: 10, color: "var(--fg-subtle)" }}
605
- >
606
- Params
607
- </span>
608
- <div className="min-w-0 flex-1 w-[min(92vw,300px)]">
609
- <div className="relative mb-1 h-4 text-[10px]" style={{ color: "var(--fg-subtle)" }}>
610
- {PARAM_RANGE_MARKERS.map((marker) => (
611
- <span
612
- key={marker.label}
613
- className="absolute top-0 whitespace-nowrap font-mono"
614
- style={{
615
- left: `${(marker.step / maxStepIndex) * 100}%`,
616
- transform:
617
- marker.step === 0 ? "translateX(0)"
618
- : marker.step === maxStepIndex ? "translateX(-100%)"
619
- : "translateX(-50%)",
620
- }}
621
- >
622
- {marker.label}
623
- </span>
624
- ))}
625
- </div>
626
-
627
- <div className="relative h-4">
628
- <div
629
- className="absolute inset-x-1.5 top-1/2 h-[3px] -translate-y-1/2"
630
- style={{ background: "var(--border-strong)" }}
631
- />
632
- <div className="absolute inset-x-1.5 top-1/2 h-[3px] -translate-y-1/2">
633
- <div
634
- className="absolute inset-y-0"
635
- style={{
636
- background: "var(--fg)",
637
- left: `${(minParamStep / maxStepIndex) * 100}%`,
638
- right: `${Math.max(100 - (maxParamStep / maxStepIndex) * 100, 0)}%`,
639
- }}
640
- />
641
- </div>
642
-
643
- <div className="absolute inset-x-1.5 top-1/2 -translate-y-1/2">
644
- {PARAM_RANGE_VALUES.map((_, stepIndex) => (
645
- <span
646
- key={`param-tick-${stepIndex}`}
647
- className="absolute top-0 h-2 w-px -translate-x-1/2"
648
- style={{ left: `${(stepIndex / maxStepIndex) * 100}%`, background: "var(--border-soft)" }}
649
- aria-hidden="true"
650
- />
651
- ))}
652
- </div>
653
-
654
- <input
655
- type="range"
656
- min={0}
657
- max={maxStepIndex}
658
- step={1}
659
- value={minParamStep}
660
- onChange={(e) => {
661
- const v = Number(e.target.value)
662
- setMinParamStep(Math.min(v, maxParamStep))
663
- }}
664
- className="param-range-input"
665
- aria-label="Minimum parameter filter"
666
- />
667
- <input
668
- type="range"
669
- min={0}
670
- max={maxStepIndex}
671
- step={1}
672
- value={maxParamStep}
673
- onChange={(e) => {
674
- const v = Number(e.target.value)
675
- setMaxParamStep(Math.max(v, minParamStep))
676
- }}
677
- className="param-range-input"
678
- aria-label="Maximum parameter filter"
679
- />
680
- </div>
681
- </div>
682
-
683
- <span
684
- className="shrink-0 font-mono"
685
- style={{ fontSize: 10, color: "var(--fg-muted)" }}
686
- >
687
- {formatParamBoundLabel(minParamStep, "min")} – {formatParamBoundLabel(maxParamStep, "max")}
688
- </span>
689
- </div>
690
 
691
  <div
692
  className="font-mono uppercase tracking-[0.14em] whitespace-nowrap ml-auto"
 
6
  import { ArrowLeft, ArrowUpRight, BarChart3, Grid3X3, Search } from "lucide-react"
7
  import { Navigation } from "@/components/navigation"
8
  import { EvalDetail } from "@/components/eval-detail"
9
+ import { ParamRangePicker } from "@/components/param-range-picker"
10
  import { useAudienceMode } from "@/components/audience-mode-provider"
11
  import type { BenchmarkEvalSummary } from "@/lib/eval-processing"
12
  import { fetchEvalSummary } from "@/lib/dashboard-data-client"
13
+ import { PARAM_RANGE_MAX_INDEX, parseParamsBillionsFromModelName, paramStepToNumeric } from "@/lib/param-range"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  export default function EvalDetailPage() {
16
  const params = useParams()
 
176
  <div className="space-y-10">
177
  {/* HERO ------------------------------------------------------------- */}
178
  <header className="motion-academic-enter">
179
+ <h1 className="ec-page-h1">{summary.evaluation_name}</h1>
 
 
 
180
  <div
181
  className="mb-5 flex flex-wrap items-center gap-3 font-mono text-[11px] uppercase tracking-[0.12em]"
182
  style={{ color: "var(--fg-muted)" }}
 
415
  const [page, setPage] = useState(1)
416
  const [hiddenCols, setHiddenCols] = useState<Set<string>>(new Set())
417
  const [minParamStep, setMinParamStep] = useState(0)
418
+ const [maxParamStep, setMaxParamStep] = useState(PARAM_RANGE_MAX_INDEX)
419
  const PAGE_SIZE = 50
420
 
421
  const metricDirection = useMemo(() => {
 
451
  const avg = validScores.length > 0
452
  ? validScores.reduce((a, b) => a + b, 0) / validScores.length
453
  : 0
454
+ const sizeB =
455
+ parseParamsBillionsFromModelName(data.name) ??
456
+ parseParamsBillionsFromModelName(id)
457
 
458
  return { id, name: data.name, developer: data.developer, avg, scores: data.scores, sizeB }
459
  })
 
477
  })
478
  }, [models, sortCol, sortAsc])
479
 
480
+ const numericMinParams = paramStepToNumeric(minParamStep, "min")
481
+ const numericMaxParams = paramStepToNumeric(maxParamStep, "max")
482
+ const [showUnknownSize, setShowUnknownSize] = useState(true)
483
+
484
+ const hasParameterData = useMemo(() => models.some((m) => m.sizeB != null), [models])
485
 
486
  const query = search.trim().toLowerCase()
487
  const filteredModels = sortedModels.filter((m) => {
 
490
  m.developer.toLowerCase().includes(query) ||
491
  m.id.toLowerCase().includes(query)
492
  )) return false
493
+ if (m.sizeB == null) return showUnknownSize
494
+ if (numericMinParams != null && m.sizeB < numericMinParams) return false
495
+ if (numericMaxParams != null && m.sizeB > numericMaxParams) return false
496
  return true
497
  })
498
 
 
578
  />
579
  </div>
580
 
581
+ {hasParameterData && (
582
+ <ParamRangePicker
583
+ variant="inline"
584
+ headline="Params"
585
+ minStep={minParamStep}
586
+ maxStep={maxParamStep}
587
+ onMinChange={setMinParamStep}
588
+ onMaxChange={setMaxParamStep}
589
+ onReset={() => {
590
+ setMinParamStep(0)
591
+ setMaxParamStep(PARAM_RANGE_MAX_INDEX)
592
+ }}
593
+ showUnknownSize={showUnknownSize}
594
+ onShowUnknownSizeChange={setShowUnknownSize}
595
+ className="min-w-[260px] flex-1 sm:max-w-[420px]"
596
+ />
597
+ )}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
598
 
599
  <div
600
  className="font-mono uppercase tracking-[0.14em] whitespace-nowrap ml-auto"
app/evals/page.tsx CHANGED
@@ -1,13 +1,15 @@
1
  "use client"
2
 
3
  import { useCallback, useDeferredValue, useEffect, useMemo, useState } from "react"
4
- import { Search } from "lucide-react"
5
 
6
  import { FamilyTable } from "@/components/family-table"
7
  import { InfiniteScrollSentinel } from "@/components/infinite-scroll"
8
  import { Navigation } from "@/components/navigation"
9
  import type { EvalHierarchy, HierarchyFamily } from "@/lib/backend-artifacts"
10
- import { fetchEvalHierarchy, fetchEvalList } from "@/lib/dashboard-data-client"
 
 
11
 
12
  const PAGE_SIZE = 60
13
 
@@ -54,17 +56,26 @@ function familyBenchmarkCount(fam: HierarchyFamily): number {
54
  export default function EvalsPage() {
55
  const [hierarchy, setHierarchy] = useState<EvalHierarchy | null>(null)
56
  const [totalModels, setTotalModels] = useState<number>(0)
 
 
57
  const [loading, setLoading] = useState(true)
58
  const [searchQuery, setSearchQuery] = useState("")
59
  const [sortBy, setSortBy] = useState<FamilySort>("results")
60
  const [visibleCount, setVisibleCount] = useState(PAGE_SIZE)
 
 
 
61
  const deferredSearchQuery = useDeferredValue(searchQuery)
62
 
63
  useEffect(() => {
64
- Promise.all([fetchEvalHierarchy(), fetchEvalList()])
65
- .then(([h, list]) => {
66
  setHierarchy(h)
67
  setTotalModels(list.totalModels)
 
 
 
 
68
  })
69
  .catch(console.error)
70
  .finally(() => setLoading(false))
@@ -72,6 +83,86 @@ export default function EvalsPage() {
72
 
73
  const families = hierarchy?.families ?? []
74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  const filteredFamilies = useMemo(() => {
76
  const query = deferredSearchQuery.trim().toLowerCase()
77
  let list = families
@@ -85,6 +176,20 @@ export default function EvalsPage() {
85
  )
86
  }
87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  return list.slice().sort((a, b) => {
89
  switch (sortBy) {
90
  case "name":
@@ -98,11 +203,11 @@ export default function EvalsPage() {
98
  return familyEvalsCount(b) - familyEvalsCount(a)
99
  }
100
  })
101
- }, [families, deferredSearchQuery, sortBy])
102
 
103
  useEffect(() => {
104
  setVisibleCount(PAGE_SIZE)
105
- }, [deferredSearchQuery, sortBy])
106
 
107
  const visibleFamilies = useMemo(
108
  () => filteredFamilies.slice(0, visibleCount),
@@ -126,8 +231,7 @@ export default function EvalsPage() {
126
  <p className="ec-page-lede">
127
  Evaluations are grouped into <strong>families</strong>. A family holds one or more
128
  benchmarks; each benchmark has one or more slices; each slice reports one or more
129
- metrics. Metrics are not commensurable across rows — compare within a cell, not
130
- across cells.
131
  </p>
132
 
133
  {/* META ROW ------------------------------------------------- */}
@@ -180,6 +284,41 @@ export default function EvalsPage() {
180
 
181
  <div className="grow" />
182
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  <select
184
  className="ec-select"
185
  value={sortBy}
@@ -192,6 +331,115 @@ export default function EvalsPage() {
192
  </select>
193
  </div>
194
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
  {/* TABLE ---------------------------------------------------- */}
196
  {loading ? (
197
  <div className="py-24 text-center font-mono text-[11px] uppercase tracking-[0.2em] text-[color:var(--fg-subtle)]">
@@ -214,7 +462,13 @@ export default function EvalsPage() {
214
  </button>
215
  </div>
216
  ) : (
217
- <FamilyTable families={visibleFamilies} totalModels={totalModels} />
 
 
 
 
 
 
218
  )}
219
 
220
  <InfiniteScrollSentinel
 
1
  "use client"
2
 
3
  import { useCallback, useDeferredValue, useEffect, useMemo, useState } from "react"
4
+ import { ChevronDown, ChevronUp, Search, Tag } from "lucide-react"
5
 
6
  import { FamilyTable } from "@/components/family-table"
7
  import { InfiniteScrollSentinel } from "@/components/infinite-scroll"
8
  import { Navigation } from "@/components/navigation"
9
  import type { EvalHierarchy, HierarchyFamily } from "@/lib/backend-artifacts"
10
+ import { fetchBenchmarkMetadata, fetchEvalHierarchy, fetchEvalList } from "@/lib/dashboard-data-client"
11
+ import type { BenchmarkEvalListItem } from "@/lib/eval-processing"
12
+ import type { BenchmarkCard } from "@/lib/benchmark-schema"
13
 
14
  const PAGE_SIZE = 60
15
 
 
56
  export default function EvalsPage() {
57
  const [hierarchy, setHierarchy] = useState<EvalHierarchy | null>(null)
58
  const [totalModels, setTotalModels] = useState<number>(0)
59
+ const [evalItems, setEvalItems] = useState<Map<string, BenchmarkEvalListItem>>(new Map())
60
+ const [benchmarkCards, setBenchmarkCards] = useState<Record<string, BenchmarkCard>>({})
61
  const [loading, setLoading] = useState(true)
62
  const [searchQuery, setSearchQuery] = useState("")
63
  const [sortBy, setSortBy] = useState<FamilySort>("results")
64
  const [visibleCount, setVisibleCount] = useState(PAGE_SIZE)
65
+ const [domainPanelOpen, setDomainPanelOpen] = useState(false)
66
+ const [domainFilter, setDomainFilter] = useState<Set<string>>(new Set())
67
+ const [selectedCategories, setSelectedCategories] = useState<string[]>([])
68
  const deferredSearchQuery = useDeferredValue(searchQuery)
69
 
70
  useEffect(() => {
71
+ Promise.all([fetchEvalHierarchy(), fetchEvalList(), fetchBenchmarkMetadata()])
72
+ .then(([h, list, metadata]) => {
73
  setHierarchy(h)
74
  setTotalModels(list.totalModels)
75
+ const map = new Map<string, BenchmarkEvalListItem>()
76
+ for (const item of list.evals) map.set(item.evaluation_id, item)
77
+ setEvalItems(map)
78
+ setBenchmarkCards(metadata)
79
  })
80
  .catch(console.error)
81
  .finally(() => setLoading(false))
 
83
 
84
  const families = hierarchy?.families ?? []
85
 
86
+ // Build a family-key → domain-set map. The lite eval list doesn't carry
87
+ // benchmark cards, so we read domains from `benchmark-metadata.json`
88
+ // (looked up by family key, leaf key, and eval summary id) and from each
89
+ // leaf's own `tags.domains`, trimming and lower-casing every entry. The
90
+ // per-domain family counts are derived from this map in `domainCounts`.
91
+ const familyDomains = useMemo(() => {
92
+ const out = new Map<string, Set<string>>()
93
+ const lookupDomains = (key: string | null | undefined): string[] => {
94
+ if (!key) return []
95
+ const card = benchmarkCards[key]
96
+ const domains = card?.benchmark_details?.domains
97
+ return Array.isArray(domains) ? domains : []
98
+ }
99
+ for (const fam of families) {
100
+ const seen = new Set<string>()
101
+ for (const d of lookupDomains(fam.key)) seen.add(d.trim().toLowerCase())
102
+ for (const leaf of fam.leaves ?? []) {
103
+ for (const d of leaf.tags?.domains ?? []) seen.add(d.trim().toLowerCase())
104
+ for (const d of lookupDomains(leaf.key)) seen.add(d.trim().toLowerCase())
105
+ }
106
+ for (const id of fam.eval_summary_ids ?? []) {
107
+ for (const d of lookupDomains(id)) seen.add(d.trim().toLowerCase())
108
+ }
109
+ seen.delete("")
110
+ out.set(fam.key, seen)
111
+ }
112
+ return out
113
+ }, [families, benchmarkCards])
114
+
115
+ // Domain → display label (first non-empty occurrence; cards, then leaf tags) +
116
+ // count of families touching that domain. Sorted by count desc, ties by label.
117
+ const domainCounts = useMemo(() => {
118
+ const counts = new Map<string, number>()
119
+ const labels = new Map<string, string>()
120
+ const recordLabel = (raw: string) => {
121
+ const key = raw.trim().toLowerCase()
122
+ if (!key || labels.has(key)) return
123
+ labels.set(key, raw.trim())
124
+ }
125
+ for (const card of Object.values(benchmarkCards)) {
126
+ for (const d of card?.benchmark_details?.domains ?? []) recordLabel(d)
127
+ }
128
+ for (const fam of families) {
129
+ for (const leaf of fam.leaves ?? []) {
130
+ for (const d of leaf.tags?.domains ?? []) recordLabel(d)
131
+ }
132
+ }
133
+ for (const set of familyDomains.values()) {
134
+ for (const key of set) counts.set(key, (counts.get(key) ?? 0) + 1)
135
+ }
136
+ return Array.from(counts.entries())
137
+ .map(([key, count]) => ({ domain: labels.get(key) ?? key, count, key }))
138
+ .sort((a, b) => b.count - a.count || a.domain.localeCompare(b.domain))
139
+ }, [familyDomains, families, benchmarkCards])
140
+
141
+ const toggleDomain = useCallback((domain: string) => {
142
+ setDomainFilter((current) => {
143
+ const next = new Set(current)
144
+ if (next.has(domain)) next.delete(domain)
145
+ else next.add(domain)
146
+ return next
147
+ })
148
+ }, [])
149
+
150
+ const clearDomainFilter = useCallback(() => setDomainFilter(new Set()), [])
151
+
152
+ // Categories present on the family list — drives the pill selector
153
+ // below the toolbar. Sort them by descending family count so the most
154
+ // common ones surface first.
155
+ const availableCategories = useMemo(() => {
156
+ const counts = new Map<string, number>()
157
+ for (const fam of families) {
158
+ const cat = fam.category ?? "General"
159
+ counts.set(cat, (counts.get(cat) ?? 0) + 1)
160
+ }
161
+ return Array.from(counts.entries())
162
+ .sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0]))
163
+ .map(([category]) => category)
164
+ }, [families])
165
+
166
  const filteredFamilies = useMemo(() => {
167
  const query = deferredSearchQuery.trim().toLowerCase()
168
  let list = families
 
176
  )
177
  }
178
 
179
+ if (selectedCategories.length > 0) {
180
+ const set = new Set(selectedCategories)
181
+ list = list.filter((fam) => set.has(fam.category ?? "General"))
182
+ }
183
+
184
+ if (domainFilter.size > 0) {
185
+ list = list.filter((fam) => {
186
+ const set = familyDomains.get(fam.key)
187
+ if (!set) return false
188
+ for (const key of set) if (domainFilter.has(key)) return true
189
+ return false
190
+ })
191
+ }
192
+
193
  return list.slice().sort((a, b) => {
194
  switch (sortBy) {
195
  case "name":
 
203
  return familyEvalsCount(b) - familyEvalsCount(a)
204
  }
205
  })
206
+ }, [families, deferredSearchQuery, sortBy, domainFilter, selectedCategories, familyDomains])
207
 
208
  useEffect(() => {
209
  setVisibleCount(PAGE_SIZE)
210
+ }, [deferredSearchQuery, sortBy, domainFilter, selectedCategories])
211
 
212
  const visibleFamilies = useMemo(
213
  () => filteredFamilies.slice(0, visibleCount),
 
231
  <p className="ec-page-lede">
232
  Evaluations are grouped into <strong>families</strong>. A family holds one or more
233
  benchmarks; each benchmark has one or more slices; each slice reports one or more
234
+ metrics.
 
235
  </p>
236
 
237
  {/* META ROW ------------------------------------------------- */}
 
284
 
285
  <div className="grow" />
286
 
287
+ {domainCounts.length > 0 && (
288
+ <button
289
+ type="button"
290
+ onClick={() => setDomainPanelOpen((v) => !v)}
291
+ className="inline-flex items-center gap-2"
292
+ style={{
293
+ fontFamily: "var(--font-mono)",
294
+ fontSize: 10,
295
+ letterSpacing: "0.14em",
296
+ textTransform: "uppercase",
297
+ padding: "6px 12px",
298
+ border: "1px solid var(--border-strong)",
299
+ background:
300
+ domainPanelOpen || domainFilter.size > 0 ? "var(--fg)" : "var(--bg)",
301
+ color:
302
+ domainPanelOpen || domainFilter.size > 0 ? "var(--bg)" : "var(--fg)",
303
+ cursor: "pointer",
304
+ }}
305
+ aria-expanded={domainPanelOpen}
306
+ >
307
+ <Tag className="h-3 w-3" aria-hidden />
308
+ Filter by domain
309
+ {domainFilter.size > 0 && (
310
+ <span className="font-mono tabular-nums" style={{ marginLeft: 2 }}>
311
+ · {domainFilter.size}
312
+ </span>
313
+ )}
314
+ {domainPanelOpen ? (
315
+ <ChevronUp className="h-3 w-3" aria-hidden />
316
+ ) : (
317
+ <ChevronDown className="h-3 w-3" aria-hidden />
318
+ )}
319
+ </button>
320
+ )}
321
+
322
  <select
323
  className="ec-select"
324
  value={sortBy}
 
331
  </select>
332
  </div>
333
 
334
+ {/* DOMAIN FILTER PANEL — collapsed by default, opens when the user
335
+ wants to slice the family list by topical domain. Picks unfurl
336
+ every aggregator family in the table below so matching
337
+ benchmarks are immediately visible. */}
338
+ {domainPanelOpen && domainCounts.length > 0 && (
339
+ <div
340
+ className="mb-6"
341
+ style={{
342
+ border: "1px solid var(--border-soft)",
343
+ background: "var(--bg-warm)",
344
+ padding: "12px 16px",
345
+ }}
346
+ >
347
+ <div className="mb-3 flex items-center justify-between gap-3">
348
+ <div
349
+ className="font-mono uppercase"
350
+ style={{ fontSize: 10, letterSpacing: "0.14em", color: "var(--fg-subtle)" }}
351
+ >
352
+ {domainFilter.size === 0
353
+ ? `Pick one or more domains · ${domainCounts.length} available`
354
+ : `${domainFilter.size} selected · ${domainCounts.length - domainFilter.size} more`}
355
+ </div>
356
+ {domainFilter.size > 0 && (
357
+ <button
358
+ type="button"
359
+ onClick={clearDomainFilter}
360
+ className="font-mono uppercase"
361
+ style={{
362
+ fontSize: 10,
363
+ letterSpacing: "0.12em",
364
+ color: "var(--fg-subtle)",
365
+ background: "transparent",
366
+ border: 0,
367
+ cursor: "pointer",
368
+ }}
369
+ >
370
+ Clear
371
+ </button>
372
+ )}
373
+ </div>
374
+ <div className="flex flex-wrap gap-1.5">
375
+ {domainCounts.map(({ domain, count }) => {
376
+ const key = domain.trim().toLowerCase()
377
+ const selected = domainFilter.has(key)
378
+ return (
379
+ <button
380
+ key={key}
381
+ type="button"
382
+ onClick={() => toggleDomain(key)}
383
+ className="ec-tag outline inline-flex items-center gap-1.5"
384
+ style={{
385
+ cursor: "pointer",
386
+ background: selected ? "var(--fg)" : "var(--bg)",
387
+ color: selected ? "var(--bg)" : "var(--fg)",
388
+ borderColor: selected ? "var(--fg)" : "var(--border-strong)",
389
+ textTransform: "none",
390
+ letterSpacing: "normal",
391
+ fontFamily: "var(--font-sans)",
392
+ }}
393
+ aria-pressed={selected}
394
+ >
395
+ <span className="text-[12px] font-medium capitalize">{domain}</span>
396
+ <span
397
+ className="font-mono text-[10px] tabular-nums"
398
+ style={{ color: selected ? "var(--bg)" : "var(--fg-muted)" }}
399
+ >
400
+ {count}
401
+ </span>
402
+ </button>
403
+ )
404
+ })}
405
+ </div>
406
+ </div>
407
+ )}
408
+
409
+ {/* CATEGORY PILLS — quick toggle filter by category. Mirrors the
410
+ same pattern used on benchmark-detail's matrix browser. */}
411
+ {availableCategories.length > 0 && (
412
+ <div className="mb-5 flex flex-wrap items-center gap-2">
413
+ <span className="kicker mr-2">Category</span>
414
+ <button
415
+ type="button"
416
+ onClick={() => setSelectedCategories([])}
417
+ className={`ec-pill ${selectedCategories.length === 0 ? "on" : ""}`}
418
+ >
419
+ All
420
+ </button>
421
+ {availableCategories.map((category) => {
422
+ const isSelected = selectedCategories.includes(category)
423
+ return (
424
+ <button
425
+ key={category}
426
+ type="button"
427
+ onClick={() =>
428
+ setSelectedCategories((current) =>
429
+ current.includes(category)
430
+ ? current.filter((item) => item !== category)
431
+ : [...current, category],
432
+ )
433
+ }
434
+ className={`ec-pill ${isSelected ? "on" : ""}`}
435
+ >
436
+ {category}
437
+ </button>
438
+ )
439
+ })}
440
+ </div>
441
+ )}
442
+
443
  {/* TABLE ---------------------------------------------------- */}
444
  {loading ? (
445
  <div className="py-24 text-center font-mono text-[11px] uppercase tracking-[0.2em] text-[color:var(--fg-subtle)]">
 
462
  </button>
463
  </div>
464
  ) : (
465
+ <FamilyTable
466
+ families={visibleFamilies}
467
+ totalModels={totalModels}
468
+ evalItems={evalItems}
469
+ benchmarkCards={benchmarkCards}
470
+ domainFilter={domainFilter}
471
+ />
472
  )}
473
 
474
  <InfiniteScrollSentinel
app/globals.css CHANGED
@@ -1117,55 +1117,316 @@
1117
  any width/transform transitions so the fill bar tracks the cursor 1:1
1118
  instead of lagging 300ms behind every move (the prior cause of the
1119
  "jumpy" feel). */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1120
  .param-range-input {
1121
  appearance: none;
1122
  position: absolute;
1123
- left: 0.375rem;
1124
- right: 0.375rem;
1125
- top: 50%;
1126
- transform: translateY(-50%);
1127
- width: calc(100% - 0.75rem);
1128
- height: 18px;
1129
  background: transparent;
1130
  pointer-events: none;
1131
  margin: 0;
1132
  padding: 0;
1133
  }
1134
- .param-range-input::-webkit-slider-runnable-track { height: 18px; background: transparent; border: 0; }
 
 
 
 
1135
  .param-range-input::-webkit-slider-thumb {
1136
  appearance: none;
1137
  pointer-events: auto;
1138
- height: 18px;
1139
- width: 18px;
1140
- border-radius: 9999px;
1141
- border: 2px solid var(--color-foreground);
1142
- background: var(--color-foreground);
1143
  margin-top: 0;
1144
- cursor: grab;
1145
- box-shadow: 0 0 0 3px color-mix(in srgb, var(--color-background) 85%, transparent);
 
 
 
 
 
 
 
 
1146
  }
1147
- .param-range-input::-webkit-slider-thumb:active { cursor: grabbing; }
1148
- .param-range-input::-moz-range-track { height: 18px; border: 0; background: transparent; }
1149
  .param-range-input::-moz-range-thumb {
1150
  pointer-events: auto;
1151
- height: 18px;
1152
- width: 18px;
1153
- border-radius: 9999px;
1154
- border: 2px solid var(--color-foreground);
1155
- background: var(--color-foreground);
1156
- cursor: grab;
1157
- box-shadow: 0 0 0 3px color-mix(in srgb, var(--color-background) 85%, transparent);
 
 
 
 
 
 
1158
  }
1159
- .param-range-input::-moz-range-thumb:active { cursor: grabbing; }
1160
  .param-range-input:focus-visible::-webkit-slider-thumb {
1161
- box-shadow:
1162
- 0 0 0 3px color-mix(in srgb, var(--color-background) 85%, transparent),
1163
- 0 0 0 6px color-mix(in srgb, var(--color-ring) 35%, transparent);
1164
  }
1165
  .param-range-input:focus-visible::-moz-range-thumb {
1166
- box-shadow:
1167
- 0 0 0 3px color-mix(in srgb, var(--color-background) 85%, transparent),
1168
- 0 0 0 6px color-mix(in srgb, var(--color-ring) 35%, transparent);
1169
  }
1170
 
1171
  @media (prefers-reduced-motion: reduce) {
 
1117
  any width/transform transitions so the fill bar tracks the cursor 1:1
1118
  instead of lagging 300ms behind every move (the prior cause of the
1119
  "jumpy" feel). */
1120
+ /* ============================================================
1121
+ Param-range picker (themed)
1122
+ - Dual-handle dual-input slider, snap to fixed bucket ticks
1123
+ - Hairline rail with labelled bucket marks above the rail
1124
+ - Square outline thumbs (matches the editorial sharp-corner language)
1125
+ - Mono numerals in the read-out, kicker label on the left
1126
+ - Variants: default (A), inline (B), promo (C)
1127
+ ============================================================ */
1128
+
1129
+ .pr-slider {
1130
+ display: grid;
1131
+ grid-template-columns: 180px 1fr minmax(130px, auto);
1132
+ gap: 24px;
1133
+ align-items: center;
1134
+ }
1135
+ .pr-slider-track-only {
1136
+ grid-template-columns: 1fr minmax(130px, auto);
1137
+ gap: 18px;
1138
+ }
1139
+ .pr-slider.inline {
1140
+ grid-template-columns: max-content 1fr max-content;
1141
+ gap: 18px;
1142
+ }
1143
+ .pr-readout-cell {
1144
+ display: inline-flex;
1145
+ align-items: center;
1146
+ gap: 10px;
1147
+ white-space: nowrap;
1148
+ justify-self: end;
1149
+ }
1150
+ .pr-label {
1151
+ font-family: var(--font-mono);
1152
+ font-size: 10px;
1153
+ letter-spacing: 0.16em;
1154
+ text-transform: uppercase;
1155
+ color: var(--fg-subtle);
1156
+ line-height: 1.45;
1157
+ }
1158
+ .pr-label strong {
1159
+ display: block;
1160
+ color: var(--fg);
1161
+ font-weight: 600;
1162
+ margin-bottom: 2px;
1163
+ }
1164
+ .pr-label.inline-label {
1165
+ white-space: nowrap;
1166
+ }
1167
+ .pr-label.inline-label strong {
1168
+ display: inline;
1169
+ margin: 0;
1170
+ }
1171
+
1172
+ .pr-track-wrap {
1173
+ position: relative;
1174
+ height: 56px;
1175
+ user-select: none;
1176
+ min-width: 0;
1177
+ }
1178
+ .pr-ticks {
1179
+ position: absolute;
1180
+ left: 0;
1181
+ right: 0;
1182
+ top: 0;
1183
+ font-family: var(--font-mono);
1184
+ font-size: 9.5px;
1185
+ letter-spacing: 0.06em;
1186
+ color: var(--fg-subtle);
1187
+ font-variant-numeric: tabular-nums;
1188
+ pointer-events: none;
1189
+ }
1190
+ .pr-tick {
1191
+ position: absolute;
1192
+ top: 0;
1193
+ width: 1px;
1194
+ text-align: center;
1195
+ }
1196
+ .pr-tick::after {
1197
+ content: "";
1198
+ position: absolute;
1199
+ left: 50%;
1200
+ top: 18px;
1201
+ width: 1px;
1202
+ height: 6px;
1203
+ background: var(--border-strong);
1204
+ transform: translateX(-50%);
1205
+ }
1206
+ .pr-tick.on::after {
1207
+ background: var(--fg);
1208
+ }
1209
+ .pr-tick > span {
1210
+ display: inline-block;
1211
+ white-space: nowrap;
1212
+ }
1213
+
1214
+ .pr-rail {
1215
+ position: absolute;
1216
+ left: 0;
1217
+ right: 0;
1218
+ top: 36px;
1219
+ height: 2px;
1220
+ background: var(--border-strong);
1221
+ }
1222
+ .pr-fill {
1223
+ position: absolute;
1224
+ top: 36px;
1225
+ height: 2px;
1226
+ background: var(--fg);
1227
+ transition: left 0.08s linear, width 0.08s linear;
1228
+ }
1229
+ .pr-microticks {
1230
+ position: absolute;
1231
+ left: 0;
1232
+ right: 0;
1233
+ top: 36px;
1234
+ height: 2px;
1235
+ pointer-events: none;
1236
+ }
1237
+ .pr-microticks > span {
1238
+ position: absolute;
1239
+ top: -2px;
1240
+ width: 1px;
1241
+ height: 6px;
1242
+ background: var(--border-soft);
1243
+ transform: translateX(-50%);
1244
+ }
1245
+
1246
+ .pr-readout {
1247
+ font-family: var(--font-mono);
1248
+ font-size: 11px;
1249
+ color: var(--fg);
1250
+ text-align: right;
1251
+ padding: 6px 10px;
1252
+ border: 1px solid var(--border);
1253
+ border-bottom: 1.5px solid var(--fg);
1254
+ background: var(--bg);
1255
+ font-variant-numeric: tabular-nums;
1256
+ letter-spacing: 0.04em;
1257
+ white-space: nowrap;
1258
+ }
1259
+ .pr-readout .arrow {
1260
+ color: var(--fg-subtle);
1261
+ margin: 0 4px;
1262
+ }
1263
+ .pr-readout.inline {
1264
+ border: 0;
1265
+ padding: 0;
1266
+ font-weight: 600;
1267
+ text-align: left;
1268
+ background: transparent;
1269
+ }
1270
+
1271
+ .pr-reset {
1272
+ font-family: var(--font-mono);
1273
+ font-size: 10px;
1274
+ letter-spacing: 0.12em;
1275
+ text-transform: uppercase;
1276
+ color: var(--fg-subtle);
1277
+ background: transparent;
1278
+ border: 0;
1279
+ cursor: pointer;
1280
+ padding: 2px 4px;
1281
+ white-space: nowrap;
1282
+ transition: color var(--transition);
1283
+ }
1284
+ .pr-reset:hover {
1285
+ color: var(--accent);
1286
+ }
1287
+
1288
+ /* Inline toggle: "Show models without a reported size". Sits next to the
1289
+ readout, matches the editorial sharp-corner vocabulary. */
1290
+ .pr-unknown-toggle {
1291
+ display: inline-flex;
1292
+ align-items: center;
1293
+ gap: 6px;
1294
+ font-family: var(--font-mono);
1295
+ font-size: 10px;
1296
+ letter-spacing: 0.12em;
1297
+ text-transform: uppercase;
1298
+ color: var(--fg-subtle);
1299
+ background: transparent;
1300
+ border: 0;
1301
+ cursor: pointer;
1302
+ padding: 2px 4px;
1303
+ white-space: nowrap;
1304
+ transition: color var(--transition);
1305
+ }
1306
+ .pr-unknown-toggle.on {
1307
+ color: var(--fg);
1308
+ }
1309
+ .pr-unknown-toggle:hover {
1310
+ color: var(--accent);
1311
+ }
1312
+ .pr-unknown-toggle-box {
1313
+ display: inline-flex;
1314
+ align-items: center;
1315
+ justify-content: center;
1316
+ width: 12px;
1317
+ height: 12px;
1318
+ border: 1.5px solid currentColor;
1319
+ font-size: 10px;
1320
+ line-height: 1;
1321
+ }
1322
+ .pr-unknown-toggle.on .pr-unknown-toggle-box {
1323
+ background: var(--fg);
1324
+ color: var(--bg);
1325
+ border-color: var(--fg);
1326
+ }
1327
+
1328
+ /* Variant C: warm-bg, left-accent, "narrow the matrix" framing */
1329
+ .pr-promo {
1330
+ display: grid;
1331
+ grid-template-columns: minmax(220px, 1fr) minmax(420px, 2fr) auto;
1332
+ gap: 28px;
1333
+ align-items: center;
1334
+ padding: 16px 20px;
1335
+ border: 1px solid var(--border);
1336
+ background: var(--bg-warm);
1337
+ border-left: 2px solid var(--accent);
1338
+ }
1339
+ .pr-promo .pr-promo-head {
1340
+ line-height: 1.5;
1341
+ min-width: 0;
1342
+ }
1343
+ .pr-promo .pr-promo-head .kicker {
1344
+ display: block;
1345
+ margin-bottom: 4px;
1346
+ }
1347
+ .pr-promo .pr-promo-head p {
1348
+ margin: 0;
1349
+ font-size: 13px;
1350
+ color: var(--fg-muted);
1351
+ }
1352
+ @media (max-width: 900px) {
1353
+ .pr-promo {
1354
+ grid-template-columns: 1fr;
1355
+ }
1356
+ .pr-slider {
1357
+ grid-template-columns: 1fr;
1358
+ gap: 16px;
1359
+ }
1360
+ .pr-slider.inline {
1361
+ grid-template-columns: max-content 1fr max-content;
1362
+ }
1363
+ }
1364
+
1365
+ /* Native range inputs sit above the visual rail to provide drag + a11y.
1366
+ We hide the runnable-track and style only the thumb. The two inputs
1367
+ overlap perfectly (same min/max/step) and only the thumbs are clickable
1368
+ so the user can grab either handle without warping the wrong one. */
1369
  .param-range-input {
1370
  appearance: none;
1371
  position: absolute;
1372
+ left: 0;
1373
+ right: 0;
1374
+ top: 31px;
1375
+ width: 100%;
1376
+ height: 14px;
 
1377
  background: transparent;
1378
  pointer-events: none;
1379
  margin: 0;
1380
  padding: 0;
1381
  }
1382
+ .param-range-input::-webkit-slider-runnable-track {
1383
+ height: 14px;
1384
+ background: transparent;
1385
+ border: 0;
1386
+ }
1387
  .param-range-input::-webkit-slider-thumb {
1388
  appearance: none;
1389
  pointer-events: auto;
1390
+ height: 12px;
1391
+ width: 12px;
1392
+ border: 2px solid var(--fg);
1393
+ background: var(--bg);
1394
+ border-radius: 0;
1395
  margin-top: 0;
1396
+ cursor: ew-resize;
1397
+ transition: background var(--transition), border-color var(--transition);
1398
+ }
1399
+ .param-range-input::-webkit-slider-thumb:hover {
1400
+ background: var(--fg);
1401
+ }
1402
+ .param-range-input::-moz-range-track {
1403
+ height: 14px;
1404
+ border: 0;
1405
+ background: transparent;
1406
  }
 
 
1407
  .param-range-input::-moz-range-thumb {
1408
  pointer-events: auto;
1409
+ height: 12px;
1410
+ width: 12px;
1411
+ border: 2px solid var(--fg);
1412
+ background: var(--bg);
1413
+ border-radius: 0;
1414
+ cursor: ew-resize;
1415
+ transition: background var(--transition), border-color var(--transition);
1416
+ }
1417
+ .param-range-input::-moz-range-thumb:hover {
1418
+ background: var(--fg);
1419
+ }
1420
+ .param-range-input:focus-visible {
1421
+ outline: none;
1422
  }
 
1423
  .param-range-input:focus-visible::-webkit-slider-thumb {
1424
+ background: var(--accent);
1425
+ border-color: var(--accent);
 
1426
  }
1427
  .param-range-input:focus-visible::-moz-range-thumb {
1428
+ background: var(--accent);
1429
+ border-color: var(--accent);
 
1430
  }
1431
 
1432
  @media (prefers-reduced-motion: reduce) {
app/models/page.tsx CHANGED
@@ -9,30 +9,14 @@ import { InfiniteScrollSentinel } from "@/components/infinite-scroll"
9
  import { ModelCompareDialog } from "@/components/model-compare-dialog"
10
  import { ModelTable } from "@/components/model-table"
11
  import { Navigation } from "@/components/navigation"
 
12
  import { fetchDevelopers, fetchModelCards, fetchBenchmarkMetadata, type DeveloperListItem } from "@/lib/dashboard-data-client"
13
  import type { BenchmarkCard } from "@/lib/benchmark-schema"
 
14
 
15
  const PAGE_SIZE = 40
16
  const MAX_COMPARE_MODELS = 4
17
 
18
- const PARAM_RANGE_VALUES = [1, 2, 3, 4, 6, 8, 10, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 384, 500] as const
19
- const PARAM_RANGE_MARKERS = [
20
- { label: "< 1B", step: 0 },
21
- { label: "6B", step: PARAM_RANGE_VALUES.indexOf(6) },
22
- { label: "12B", step: PARAM_RANGE_VALUES.indexOf(12) },
23
- { label: "32B", step: PARAM_RANGE_VALUES.indexOf(32) },
24
- { label: "128B", step: PARAM_RANGE_VALUES.indexOf(128) },
25
- { label: "> 500B", step: PARAM_RANGE_VALUES.length - 1 },
26
- ] as const
27
-
28
- function formatParamBoundLabel(step: number, bound: "min" | "max") {
29
- const maxStepIndex = PARAM_RANGE_VALUES.length - 1
30
- if (bound === "min" && step <= 0) return "< 1B"
31
- if (bound === "max" && step >= maxStepIndex) return "> 500B"
32
- const value = PARAM_RANGE_VALUES[step]
33
- return value != null ? `${value}B` : "?"
34
- }
35
-
36
  type ModelSort = "benchmarks" | "results" | "name" | "released" | "params"
37
  type DevSort = "coverage" | "evaluated" | "models" | "name"
38
 
@@ -61,17 +45,11 @@ export default function ModelsPage() {
61
  const [compareOpen, setCompareOpen] = useState(false)
62
  const [visibleCount, setVisibleCount] = useState(PAGE_SIZE)
63
  const [minParamStep, setMinParamStep] = useState(0)
64
- const [maxParamStep, setMaxParamStep] = useState(PARAM_RANGE_VALUES.length - 1)
 
65
  const deferredSearchQuery = useDeferredValue(searchQuery)
66
- const maxParamStepIndex = PARAM_RANGE_VALUES.length - 1
67
- const numericMinParams = useMemo(
68
- () => (minParamStep <= 0 ? null : PARAM_RANGE_VALUES[minParamStep] ?? null),
69
- [minParamStep]
70
- )
71
- const numericMaxParams = useMemo(
72
- () => (maxParamStep >= PARAM_RANGE_VALUES.length - 1 ? null : PARAM_RANGE_VALUES[maxParamStep] ?? null),
73
- [maxParamStep]
74
- )
75
 
76
  useEffect(() => {
77
  Promise.all([fetchModelCards(), fetchBenchmarkMetadata()])
@@ -106,16 +84,12 @@ export default function ModelsPage() {
106
  const query = deferredSearchQuery.trim().toLowerCase()
107
  let filtered = evaluations
108
 
109
- if (numericMinParams != null) {
110
- filtered = filtered.filter(
111
- (row) => row.params_billions != null && row.params_billions >= numericMinParams
112
- )
113
- }
114
- if (numericMaxParams != null) {
115
- filtered = filtered.filter(
116
- (row) => row.params_billions != null && row.params_billions <= numericMaxParams
117
- )
118
- }
119
 
120
  if (query) {
121
  filtered = filtered.filter((row) => {
@@ -143,7 +117,7 @@ export default function ModelsPage() {
143
  return b.benchmarks_count - a.benchmarks_count
144
  }
145
  })
146
- }, [evaluations, deferredSearchQuery, modelSortBy, numericMinParams, numericMaxParams])
147
 
148
  // Developers — filter + sort
149
  const sortedDevelopers = useMemo(() => {
@@ -176,7 +150,7 @@ export default function ModelsPage() {
176
  // Reset visible window when filter/sort changes
177
  useEffect(() => {
178
  setVisibleCount(PAGE_SIZE)
179
- }, [groupByDeveloper, modelSortBy, developerSortBy, deferredSearchQuery, minParamStep, maxParamStep])
180
 
181
  const totalCount = groupByDeveloper ? sortedDevelopers.length : sortedEvaluations.length
182
  const visibleEvaluations = useMemo(
@@ -292,69 +266,6 @@ export default function ModelsPage() {
292
  />
293
  </div>
294
 
295
- {!groupByDeveloper && (
296
- <>
297
- <span className="hidden h-5 w-px bg-[color:var(--border-soft)] sm:block" />
298
-
299
- {/* Compact param picker — kicker + slider + readout, all inline */}
300
- <div className="flex min-w-[260px] flex-1 items-center gap-4 sm:max-w-[300px]">
301
- <span className="kicker shrink-0">Params</span>
302
- <div className="relative h-4 min-w-0 flex-1">
303
- <div className="absolute inset-x-2 top-1/2 h-[2px] -translate-y-1/2 bg-[color:var(--border-soft)]" />
304
- <div className="absolute inset-x-2 top-1/2 h-[2px] -translate-y-1/2">
305
- <div
306
- className="absolute inset-y-0 bg-[color:var(--fg)] transition-[left,right] duration-200 ease-[var(--ease-out-quart)]"
307
- style={{
308
- left: `${(minParamStep / maxParamStepIndex) * 100}%`,
309
- right: `${Math.max(100 - (maxParamStep / maxParamStepIndex) * 100, 0)}%`,
310
- }}
311
- />
312
- </div>
313
- <input
314
- type="range"
315
- min={0}
316
- max={maxParamStepIndex}
317
- step={1}
318
- value={minParamStep}
319
- onChange={(event) =>
320
- setMinParamStep(Math.min(Number(event.target.value), maxParamStep))
321
- }
322
- className="param-range-input"
323
- aria-label="Minimum parameter filter"
324
- />
325
- <input
326
- type="range"
327
- min={0}
328
- max={maxParamStepIndex}
329
- step={1}
330
- value={maxParamStep}
331
- onChange={(event) =>
332
- setMaxParamStep(Math.max(Number(event.target.value), minParamStep))
333
- }
334
- className="param-range-input"
335
- aria-label="Maximum parameter filter"
336
- />
337
- </div>
338
- <span className="shrink-0 whitespace-nowrap font-mono text-[11px] tabular-nums text-[color:var(--fg-muted)]">
339
- {formatParamBoundLabel(minParamStep, "min")} – {formatParamBoundLabel(maxParamStep, "max")}
340
- </span>
341
- {(minParamStep > 0 || maxParamStep < maxParamStepIndex) && (
342
- <button
343
- type="button"
344
- onClick={() => {
345
- setMinParamStep(0)
346
- setMaxParamStep(maxParamStepIndex)
347
- }}
348
- className="shrink-0 font-mono text-[10px] uppercase tracking-[0.12em] text-[color:var(--fg-subtle)] hover:text-[color:var(--accent)] transition-colors"
349
- aria-label="Reset parameters filter"
350
- >
351
- Reset
352
- </button>
353
- )}
354
- </div>
355
- </>
356
- )}
357
-
358
  <select
359
  className="ec-select ml-auto shrink-0"
360
  value={groupByDeveloper ? developerSortBy : modelSortBy}
@@ -383,6 +294,27 @@ export default function ModelsPage() {
383
  </select>
384
  </div>
385
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
386
  {/* TABLE ---------------------------------------------------- */}
387
  {loading ? (
388
  <div className="py-24 text-center font-mono text-[11px] uppercase tracking-[0.2em] text-[color:var(--fg-subtle)]">
 
9
  import { ModelCompareDialog } from "@/components/model-compare-dialog"
10
  import { ModelTable } from "@/components/model-table"
11
  import { Navigation } from "@/components/navigation"
12
+ import { ParamRangePicker } from "@/components/param-range-picker"
13
  import { fetchDevelopers, fetchModelCards, fetchBenchmarkMetadata, type DeveloperListItem } from "@/lib/dashboard-data-client"
14
  import type { BenchmarkCard } from "@/lib/benchmark-schema"
15
+ import { PARAM_RANGE_MAX_INDEX, paramStepToNumeric } from "@/lib/param-range"
16
 
17
  const PAGE_SIZE = 40
18
  const MAX_COMPARE_MODELS = 4
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  type ModelSort = "benchmarks" | "results" | "name" | "released" | "params"
21
  type DevSort = "coverage" | "evaluated" | "models" | "name"
22
 
 
45
  const [compareOpen, setCompareOpen] = useState(false)
46
  const [visibleCount, setVisibleCount] = useState(PAGE_SIZE)
47
  const [minParamStep, setMinParamStep] = useState(0)
48
+ const [maxParamStep, setMaxParamStep] = useState(PARAM_RANGE_MAX_INDEX)
49
+ const [showUnknownSize, setShowUnknownSize] = useState(true)
50
  const deferredSearchQuery = useDeferredValue(searchQuery)
51
+ const numericMinParams = useMemo(() => paramStepToNumeric(minParamStep, "min"), [minParamStep])
52
+ const numericMaxParams = useMemo(() => paramStepToNumeric(maxParamStep, "max"), [maxParamStep])
 
 
 
 
 
 
 
53
 
54
  useEffect(() => {
55
  Promise.all([fetchModelCards(), fetchBenchmarkMetadata()])
 
84
  const query = deferredSearchQuery.trim().toLowerCase()
85
  let filtered = evaluations
86
 
87
+ filtered = filtered.filter((row) => {
88
+ if (row.params_billions == null) return showUnknownSize
89
+ if (numericMinParams != null && row.params_billions < numericMinParams) return false
90
+ if (numericMaxParams != null && row.params_billions > numericMaxParams) return false
91
+ return true
92
+ })
 
 
 
 
93
 
94
  if (query) {
95
  filtered = filtered.filter((row) => {
 
117
  return b.benchmarks_count - a.benchmarks_count
118
  }
119
  })
120
+ }, [evaluations, deferredSearchQuery, modelSortBy, numericMinParams, numericMaxParams, showUnknownSize])
121
 
122
  // Developers — filter + sort
123
  const sortedDevelopers = useMemo(() => {
 
150
  // Reset visible window when filter/sort changes
151
  useEffect(() => {
152
  setVisibleCount(PAGE_SIZE)
153
+ }, [groupByDeveloper, modelSortBy, developerSortBy, deferredSearchQuery, minParamStep, maxParamStep, showUnknownSize])
154
 
155
  const totalCount = groupByDeveloper ? sortedDevelopers.length : sortedEvaluations.length
156
  const visibleEvaluations = useMemo(
 
266
  />
267
  </div>
268
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  <select
270
  className="ec-select ml-auto shrink-0"
271
  value={groupByDeveloper ? developerSortBy : modelSortBy}
 
294
  </select>
295
  </div>
296
 
297
+ {/* PARAM RANGE — its own row so the rail has room to breathe.
298
+ Sharing the toolbar with stats / search / sort squished it. */}
299
+ {!groupByDeveloper && (
300
+ <div className="mb-6 -mt-2">
301
+ <ParamRangePicker
302
+ variant="inline"
303
+ headline="Params"
304
+ minStep={minParamStep}
305
+ maxStep={maxParamStep}
306
+ onMinChange={setMinParamStep}
307
+ onMaxChange={setMaxParamStep}
308
+ onReset={() => {
309
+ setMinParamStep(0)
310
+ setMaxParamStep(PARAM_RANGE_MAX_INDEX)
311
+ }}
312
+ showUnknownSize={showUnknownSize}
313
+ onShowUnknownSizeChange={setShowUnknownSize}
314
+ />
315
+ </div>
316
+ )}
317
+
318
  {/* TABLE ---------------------------------------------------- */}
319
  {loading ? (
320
  <div className="py-24 text-center font-mono text-[11px] uppercase tracking-[0.2em] text-[color:var(--fg-subtle)]">
app/page.tsx CHANGED
@@ -244,7 +244,7 @@ export default async function HomePage() {
244
  <p className="mx-auto mt-2 max-w-2xl text-sm leading-6 text-[color:var(--fg-muted)]">
245
  The current backend snapshot does not include{" "}
246
  <code className="rounded-sm bg-[color:var(--bg-surface)] px-1.5 py-0.5 font-mono text-xs">
247
- corpus-aggregates.json
248
  </code>
249
  . When it does, this section will render the four corpus-level rollups.
250
  </p>
 
244
  <p className="mx-auto mt-2 max-w-2xl text-sm leading-6 text-[color:var(--fg-muted)]">
245
  The current backend snapshot does not include{" "}
246
  <code className="rounded-sm bg-[color:var(--bg-surface)] px-1.5 py-0.5 font-mono text-xs">
247
+ headline.json
248
  </code>
249
  . When it does, this section will render the four corpus-level rollups.
250
  </p>
components/benchmark-detail.tsx CHANGED
@@ -4205,10 +4205,10 @@ function SampleDataDialog({
4205
 
4206
  return (
4207
  <>
4208
- <Button variant="outline" size="sm" className="gap-2" onClick={handleOpenToggle}>
4209
- <Database className="h-4 w-4" />
4210
  {open ? "Hide instances" : "View all instances"}
4211
- </Button>
4212
  {open && (
4213
  <div className="rounded-xl border bg-background p-4 space-y-3">
4214
  <div>
 
4205
 
4206
  return (
4207
  <>
4208
+ <button type="button" className="btn-ec outline inline-flex items-center gap-2" onClick={handleOpenToggle}>
4209
+ <Database className="h-3.5 w-3.5" />
4210
  {open ? "Hide instances" : "View all instances"}
4211
+ </button>
4212
  {open && (
4213
  <div className="rounded-xl border bg-background p-4 space-y-3">
4214
  <div>
components/eval-detail.tsx CHANGED
@@ -3,14 +3,20 @@
3
  import { useAudienceMode } from "@/components/audience-mode-provider"
4
  import { Fragment, useEffect, useMemo, useState } from "react"
5
  import Link from "next/link"
 
6
  import { CompletenessPanel } from "@/components/signals/completeness-panel"
7
  import { ComparabilityPanel } from "@/components/signals/comparability-panel"
8
- import { ReproducibilityPanel } from "@/components/signals/reproducibility-panel"
9
  import { SignalsRowBadges } from "@/components/signals/signals-row-badges"
10
- import { RowSignalsCompact } from "@/components/signals/row-signals-compact"
11
  import { getCompletenessPopulatedCount } from "@/components/signals/signal-utils"
12
  import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible"
13
  import { ScoreDistribution } from "@/components/score-distribution"
 
 
 
 
 
 
 
14
  import {
15
  Dialog,
16
  DialogContent,
@@ -264,107 +270,6 @@ function SliceSelector({
264
  )
265
  }
266
 
267
- const PARAM_RANGE_VALUES = [1, 2, 3, 4, 6, 8, 10, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 384, 500] as const
268
- const PARAM_RANGE_MARKERS = [
269
- { label: "< 1B", step: 0 },
270
- { label: "6B", step: PARAM_RANGE_VALUES.indexOf(6) },
271
- { label: "12B", step: PARAM_RANGE_VALUES.indexOf(12) },
272
- { label: "32B", step: PARAM_RANGE_VALUES.indexOf(32) },
273
- { label: "128B", step: PARAM_RANGE_VALUES.indexOf(128) },
274
- { label: "> 500B", step: PARAM_RANGE_VALUES.length - 1 },
275
- ] as const
276
-
277
- function formatParamBoundLabel(step: number, bound: "min" | "max") {
278
- const maxStepIndex = PARAM_RANGE_VALUES.length - 1
279
-
280
- if (bound === "min" && step <= 0) {
281
- return "< 1B"
282
- }
283
-
284
- if (bound === "max" && step >= maxStepIndex) {
285
- return "> 500B"
286
- }
287
-
288
- const value = PARAM_RANGE_VALUES[step]
289
- return value != null ? `${value}B` : "Not reported"
290
- }
291
-
292
- function parseParamsBillionsFromText(value: string | null | undefined) {
293
- if (!value) {
294
- return null
295
- }
296
-
297
- const normalized = value.trim().toLowerCase()
298
- if (!normalized) {
299
- return null
300
- }
301
-
302
- const compact = normalized.replace(/,/g, "")
303
- const tokenMatch = compact.match(/(\d+(?:\.\d+)?)\s*(trillion|tn|t|billion|bn|b|million|mn|m|thousand|k)\b/)
304
- if (tokenMatch) {
305
- const amount = Number.parseFloat(tokenMatch[1])
306
- if (!Number.isFinite(amount)) {
307
- return null
308
- }
309
-
310
- const unit = tokenMatch[2]
311
- if (unit === "trillion" || unit === "tn" || unit === "t") {
312
- return amount * 1000
313
- }
314
-
315
- if (unit === "billion" || unit === "bn" || unit === "b") {
316
- return amount
317
- }
318
-
319
- if (unit === "million" || unit === "mn" || unit === "m") {
320
- return amount / 1000
321
- }
322
-
323
- if (unit === "thousand" || unit === "k") {
324
- return amount / 1_000_000
325
- }
326
- }
327
-
328
- const numeric = Number.parseFloat(compact)
329
- return Number.isFinite(numeric) ? numeric : null
330
- }
331
-
332
- function parseParamsBillionsFromModelName(modelName: string | null | undefined) {
333
- if (!modelName) {
334
- return null
335
- }
336
-
337
- const sizeTokens = Array.from(modelName.matchAll(/\b(\d+(?:\.\d+)?)\s*([tmbk])\b/gi))
338
- if (sizeTokens.length === 0) {
339
- return null
340
- }
341
-
342
- const lastToken = sizeTokens[sizeTokens.length - 1]
343
- const numericValue = Number.parseFloat(lastToken[1])
344
- if (!Number.isFinite(numericValue)) {
345
- return null
346
- }
347
-
348
- const unit = lastToken[2].toLowerCase()
349
- if (unit === "t") {
350
- return numericValue * 1000
351
- }
352
-
353
- if (unit === "b") {
354
- return numericValue
355
- }
356
-
357
- if (unit === "m") {
358
- return numericValue / 1000
359
- }
360
-
361
- if (unit === "k") {
362
- return numericValue / 1_000_000
363
- }
364
-
365
- return null
366
- }
367
-
368
  function getParamsBillionsFromModelInfo(modelInfo: ModelResultForBenchmark["model_info"]) {
369
  const additionalDetails = modelInfo.additional_details
370
  const rawParamsBillions =
@@ -568,32 +473,16 @@ export function EvalDetail({ summary }: EvalDetailProps) {
568
  const [expandedRows, setExpandedRows] = useState<Record<string, boolean>>({})
569
  const [leaderboardPage, setLeaderboardPage] = useState(1)
570
  const [minParamStep, setMinParamStep] = useState(0)
571
- const [maxParamStep, setMaxParamStep] = useState(PARAM_RANGE_VALUES.length - 1)
572
 
573
  const maxScore = summary.metric_config.max_score ?? 1
574
  const minScore = summary.metric_config.min_score ?? 0
575
  const range = maxScore - minScore
576
 
577
  const normalizeScore = (raw: number) => (range > 0 ? (raw - minScore) / range : raw)
578
- const maxParamStepIndex = PARAM_RANGE_VALUES.length - 1
579
- const minHandlePercent = (minParamStep / maxParamStepIndex) * 100
580
- const maxHandlePercent = (maxParamStep / maxParamStepIndex) * 100
581
-
582
- const numericMinParams = useMemo(() => {
583
- if (minParamStep <= 0) {
584
- return null
585
- }
586
-
587
- return PARAM_RANGE_VALUES[minParamStep] ?? null
588
- }, [minParamStep])
589
 
590
- const numericMaxParams = useMemo(() => {
591
- if (maxParamStep >= PARAM_RANGE_VALUES.length - 1) {
592
- return null
593
- }
594
-
595
- return PARAM_RANGE_VALUES[maxParamStep] ?? null
596
- }, [maxParamStep])
597
 
598
  const sortedResults = useMemo(
599
  () =>
@@ -603,6 +492,8 @@ export function EvalDetail({ summary }: EvalDetailProps) {
603
  [summary.model_results, summary.metric_config.lower_is_better]
604
  )
605
 
 
 
606
  const hasParameterData = useMemo(
607
  () => sortedResults.some((result) => getParamsBillions(result) != null),
608
  [sortedResults]
@@ -612,17 +503,13 @@ export function EvalDetail({ summary }: EvalDetailProps) {
612
  return sortedResults.filter((modelResult) => {
613
  const paramsBillions = getParamsBillions(modelResult)
614
 
615
- if (numericMinParams != null && (paramsBillions == null || paramsBillions < numericMinParams)) {
616
- return false
617
- }
618
-
619
- if (numericMaxParams != null && (paramsBillions == null || paramsBillions > numericMaxParams)) {
620
- return false
621
- }
622
 
 
 
623
  return true
624
  })
625
- }, [numericMaxParams, numericMinParams, sortedResults])
626
 
627
  const leaderboardRows = useMemo<LeaderboardRow[]>(() => {
628
  let currentRank = 0
@@ -675,10 +562,6 @@ export function EvalDetail({ summary }: EvalDetailProps) {
675
  [key]: !current[key],
676
  }))
677
 
678
- const evalKindLabel = summary.is_aggregated
679
- ? (isResearchView ? "Composite · §3.2" : "Benchmark suite")
680
- : (isResearchView ? "Single benchmark" : "Benchmark")
681
-
682
  const headerOrg = summary.composite_benchmark_name && summary.composite_benchmark_name !== summary.evaluation_name
683
  ? summary.composite_benchmark_name
684
  : null
@@ -693,10 +576,9 @@ export function EvalDetail({ summary }: EvalDetailProps) {
693
  <div className="space-y-12">
694
  {/* HERO — paper §3.1 ------------------------------------------------ */}
695
  <header className="motion-academic-enter">
696
- <div className="kicker kicker-accent mb-2">{evalKindLabel}</div>
697
  <h1
698
  className="font-bold tracking-[-0.025em]"
699
- style={{ fontSize: "clamp(40px, 5vw, 60px)", lineHeight: 1.04, margin: "8px 0 12px" }}
700
  >
701
  {summary.evaluation_name}
702
  </h1>
@@ -790,6 +672,9 @@ export function EvalDetail({ summary }: EvalDetailProps) {
790
 
791
  <CollapsibleContent className="mt-3">
792
  <div className="space-y-4">
 
 
 
793
  {/* Metric spec / nested datalist (paper-aligned hairline def-list) */}
794
  <div className="ec-card warm" style={{ padding: "18px 22px" }}>
795
  <div className="kicker mb-3">
@@ -992,96 +877,27 @@ export function EvalDetail({ summary }: EvalDetailProps) {
992
  </div>
993
  )}
994
 
995
- <div className="ec-card" style={{ padding: 0, overflow: "hidden" }}>
996
- {hasParameterData && (
997
- <div
998
- style={{
999
- borderBottom: "1px solid var(--border-soft)",
1000
- background: "var(--bg-warm)",
1001
- padding: "16px 20px",
 
 
 
 
 
 
1002
  }}
1003
- >
1004
- <div className="flex flex-col gap-3 lg:flex-row lg:items-center lg:justify-between">
1005
- <div className="space-y-1">
1006
- <div className="text-[11px] font-semibold uppercase tracking-[0.18em] text-muted-foreground">
1007
- Parameter range
1008
- </div>
1009
- <div className="text-sm text-muted-foreground">
1010
- Narrow the leaderboard to comparable model sizes.
1011
- </div>
1012
- </div>
1013
-
1014
- <div className="flex min-w-0 flex-1 items-center gap-4 lg:max-w-[40rem]">
1015
- <div className="min-w-0 flex-1">
1016
- <div className="mb-2 flex items-center justify-between text-[10px] font-medium uppercase tracking-[0.14em] text-muted-foreground">
1017
- {PARAM_RANGE_MARKERS.map((marker) => (
1018
- <span key={marker.label} className="text-center">
1019
- {marker.label}
1020
- </span>
1021
- ))}
1022
- </div>
1023
-
1024
- <div className="relative h-4">
1025
- <div className="absolute inset-x-1.5 top-1/2 h-[3px] -translate-y-1/2 rounded-full bg-border/80" />
1026
- <div className="absolute inset-x-1.5 top-1/2 h-[3px] -translate-y-1/2">
1027
- <div
1028
- className="absolute inset-y-0 rounded-full bg-foreground"
1029
- style={{
1030
- left: `${minHandlePercent}%`,
1031
- right: `${Math.max(100 - maxHandlePercent, 0)}%`,
1032
- }}
1033
- />
1034
- </div>
1035
-
1036
- <div className="absolute inset-x-1.5 top-1/2 -translate-y-1/2">
1037
- {PARAM_RANGE_VALUES.map((_, stepIndex) => (
1038
- <span
1039
- key={`param-tick-${stepIndex}`}
1040
- className="absolute top-0 h-2 w-px -translate-x-1/2 rounded-full bg-border"
1041
- style={{ left: `${(stepIndex / maxParamStepIndex) * 100}%` }}
1042
- aria-hidden="true"
1043
- />
1044
- ))}
1045
- </div>
1046
-
1047
- <input
1048
- type="range"
1049
- min={0}
1050
- max={maxParamStepIndex}
1051
- step={1}
1052
- value={minParamStep}
1053
- onChange={(event) => {
1054
- const nextMin = Number(event.target.value)
1055
- setMinParamStep(Math.min(nextMin, maxParamStep))
1056
- }}
1057
- className="param-range-input"
1058
- aria-label="Minimum parameter filter"
1059
- />
1060
-
1061
- <input
1062
- type="range"
1063
- min={0}
1064
- max={maxParamStepIndex}
1065
- step={1}
1066
- value={maxParamStep}
1067
- onChange={(event) => {
1068
- const nextMax = Number(event.target.value)
1069
- setMaxParamStep(Math.max(nextMax, minParamStep))
1070
- }}
1071
- className="param-range-input"
1072
- aria-label="Maximum parameter filter"
1073
- />
1074
- </div>
1075
- </div>
1076
-
1077
- <span className="shrink-0 text-[11px] text-muted-foreground">
1078
- {formatParamBoundLabel(minParamStep, "min")} to {formatParamBoundLabel(maxParamStep, "max")}
1079
- </span>
1080
- </div>
1081
- </div>
1082
- </div>
1083
- )}
1084
 
 
1085
  <div className="overflow-x-auto">
1086
  <table className="ec-htable" style={{ minWidth: 980 }}>
1087
  <thead>
@@ -1204,7 +1020,6 @@ export function EvalDetail({ summary }: EvalDetailProps) {
1204
  Avg of {modelResult.aggregate_components.length}
1205
  </div>
1206
  )}
1207
- <RowSignalsCompact annotations={rowAnnotations} className="mt-1" />
1208
  </div>
1209
  </div>
1210
  </td>
@@ -1375,10 +1190,6 @@ export function EvalDetail({ summary }: EvalDetailProps) {
1375
  )}
1376
  </DetailPanel>
1377
 
1378
- {!isResearchView && (
1379
- <ReproducibilityPanel gap={rowAnnotations?.reproducibility_gap} />
1380
- )}
1381
-
1382
  <DetailPanel
1383
  title={isResearchView ? "Score Breakdown" : "Metric Summary"}
1384
  subtitle={
@@ -1636,11 +1447,19 @@ function MultiMetricLeaderboard({
1636
  isResearchView: boolean
1637
  }) {
1638
  const [page, setPage] = useState(1)
1639
- const [sortKey, setSortKey] = useState<string>("coverage")
 
 
 
 
 
 
 
 
1640
  const [sortDirection, setSortDirection] = useState<"asc" | "desc">("desc")
1641
  const [activeSubtaskTab, setActiveSubtaskTab] = useState<string>("all")
1642
  const [minParamStep, setMinParamStep] = useState(0)
1643
- const [maxParamStep, setMaxParamStep] = useState(PARAM_RANGE_VALUES.length - 1)
1644
  const [expandedRows, setExpandedRows] = useState<Record<string, boolean>>({})
1645
 
1646
  // Index ModelResultForBenchmark entries by model_info.id so we can power the
@@ -1678,7 +1497,6 @@ function MultiMetricLeaderboard({
1678
  [allMetricKeys]
1679
  )
1680
  const [visibleMetricKeys, setVisibleMetricKeys] = useState<string[]>(() => defaultVisibleMetricKeys)
1681
- const maxParamStepIndex = PARAM_RANGE_VALUES.length - 1
1682
  const leaderboardMetricMap = useMemo(
1683
  () => new Map(leaderboardMetrics.map((metric) => [metric.column_key, metric])),
1684
  [leaderboardMetrics]
@@ -1725,38 +1543,26 @@ function MultiMetricLeaderboard({
1725
  [visibleMetrics]
1726
  )
1727
 
1728
- const numericMinParams = useMemo(() => {
1729
- if (minParamStep <= 0) {
1730
- return null
1731
- }
1732
-
1733
- return PARAM_RANGE_VALUES[minParamStep] ?? null
1734
- }, [minParamStep])
1735
 
1736
- const numericMaxParams = useMemo(() => {
1737
- if (maxParamStep >= PARAM_RANGE_VALUES.length - 1) {
1738
- return null
1739
- }
1740
-
1741
- return PARAM_RANGE_VALUES[maxParamStep] ?? null
1742
- }, [maxParamStep])
1743
 
1744
  const filteredRows = useMemo(() => {
1745
- return leaderboardRows
1746
- .filter((row) => {
1747
- const paramsBillions = getParamsBillionsFromModelInfo(row.model_info)
1748
 
1749
- if (numericMinParams != null && (paramsBillions == null || paramsBillions < numericMinParams)) {
1750
- return false
1751
- }
1752
-
1753
- if (numericMaxParams != null && (paramsBillions == null || paramsBillions > numericMaxParams)) {
1754
- return false
1755
- }
1756
 
1757
- return true
1758
- })
1759
- }, [leaderboardRows, numericMaxParams, numericMinParams])
 
 
1760
 
1761
  const sortedRows = useMemo(() => {
1762
  const rows = [...filteredRows]
@@ -1789,11 +1595,6 @@ function MultiMetricLeaderboard({
1789
  return sortDirection === "asc" ? comparison : -comparison
1790
  }
1791
 
1792
- if (sortKey === "coverage") {
1793
- const comparison = left.metrics_present - right.metrics_present || compareNames(left, right)
1794
- return sortDirection === "asc" ? comparison : -comparison
1795
- }
1796
-
1797
  if (sortKey === "updated") {
1798
  const comparison = compareTimestamps(left.evaluation_timestamp, right.evaluation_timestamp) || compareNames(left, right)
1799
  return sortDirection === "asc" ? comparison : -comparison
@@ -1836,10 +1637,19 @@ function MultiMetricLeaderboard({
1836
 
1837
  useEffect(() => {
1838
  if (leaderboardMetricMap.has(sortKey) && !visibleMetricColumnKeySet.has(sortKey)) {
1839
- setSortKey("coverage")
 
 
 
 
 
 
 
 
 
1840
  setSortDirection("desc")
1841
  }
1842
- }, [leaderboardMetricMap, sortKey, visibleMetricColumnKeySet])
1843
 
1844
  useEffect(() => {
1845
  if (!hasSubtaskTabs) {
@@ -1858,11 +1668,6 @@ function MultiMetricLeaderboard({
1858
  }
1859
  }, [activeSubtaskTab, hasSubtaskTabs, singleMetricSubtaskTabs])
1860
 
1861
- const hasParameterData = useMemo(
1862
- () => leaderboardRows.some((row) => getParamsBillionsFromModelInfo(row.model_info) != null),
1863
- [leaderboardRows]
1864
- )
1865
-
1866
  const pagedRows = useMemo(
1867
  () => sortedRows.slice(0, page * 50),
1868
  [page, sortedRows]
@@ -1883,18 +1688,12 @@ function MultiMetricLeaderboard({
1883
  })
1884
  }
1885
 
1886
- const getVisibleMetricCount = (row: LeaderboardMatrixRow) =>
1887
- visibleMetrics.reduce(
1888
- (count, metric) => count + (isNumericScore(row.values[metric.column_key]) ? 1 : 0),
1889
- 0
1890
- )
1891
-
1892
  const getDefaultSortDirection = (key: string): "asc" | "desc" => {
1893
  if (key === "model" || key === "developer") {
1894
  return "asc"
1895
  }
1896
 
1897
- if (key === "updated" || key === "coverage") {
1898
  return "desc"
1899
  }
1900
 
@@ -2033,84 +1832,21 @@ function MultiMetricLeaderboard({
2033
 
2034
  {hasParameterData && (
2035
  <div className="border-b bg-background px-5 py-4 sm:px-6">
2036
- <div className="flex flex-col gap-3 lg:flex-row lg:items-center lg:justify-between">
2037
- <div className="space-y-1">
2038
- <div className="text-[11px] font-semibold uppercase tracking-[0.18em] text-muted-foreground">
2039
- Parameter range
2040
- </div>
2041
- <div className="text-sm text-muted-foreground">
2042
- Narrow the matrix to comparable model sizes.
2043
- </div>
2044
- </div>
2045
-
2046
- <div className="flex min-w-0 flex-1 items-center gap-4 lg:max-w-[40rem]">
2047
- <div className="min-w-0 flex-1">
2048
- <div className="mb-2 flex items-center justify-between text-[10px] font-medium uppercase tracking-[0.14em] text-muted-foreground">
2049
- {PARAM_RANGE_MARKERS.map((marker) => (
2050
- <span key={marker.label} className="text-center">
2051
- {marker.label}
2052
- </span>
2053
- ))}
2054
- </div>
2055
-
2056
- <div className="relative h-4">
2057
- <div className="absolute inset-x-1.5 top-1/2 h-[3px] -translate-y-1/2 rounded-full bg-border/80" />
2058
- <div className="absolute inset-x-1.5 top-1/2 h-[3px] -translate-y-1/2">
2059
- <div
2060
- className="absolute inset-y-0 rounded-full bg-foreground"
2061
- style={{
2062
- left: `${(minParamStep / maxParamStepIndex) * 100}%`,
2063
- right: `${Math.max(100 - (maxParamStep / maxParamStepIndex) * 100, 0)}%`,
2064
- }}
2065
- />
2066
- </div>
2067
-
2068
- <div className="absolute inset-x-1.5 top-1/2 -translate-y-1/2">
2069
- {PARAM_RANGE_VALUES.map((_, stepIndex) => (
2070
- <span
2071
- key={`param-matrix-tick-${stepIndex}`}
2072
- className="absolute top-0 h-2 w-px -translate-x-1/2 rounded-full bg-border"
2073
- style={{ left: `${(stepIndex / maxParamStepIndex) * 100}%` }}
2074
- aria-hidden="true"
2075
- />
2076
- ))}
2077
- </div>
2078
-
2079
- <input
2080
- type="range"
2081
- min={0}
2082
- max={maxParamStepIndex}
2083
- step={1}
2084
- value={minParamStep}
2085
- onChange={(event) => {
2086
- const nextMin = Number(event.target.value)
2087
- setMinParamStep(Math.min(nextMin, maxParamStep))
2088
- }}
2089
- className="param-range-input"
2090
- aria-label="Minimum parameter filter"
2091
- />
2092
-
2093
- <input
2094
- type="range"
2095
- min={0}
2096
- max={maxParamStepIndex}
2097
- step={1}
2098
- value={maxParamStep}
2099
- onChange={(event) => {
2100
- const nextMax = Number(event.target.value)
2101
- setMaxParamStep(Math.max(nextMax, minParamStep))
2102
- }}
2103
- className="param-range-input"
2104
- aria-label="Maximum parameter filter"
2105
- />
2106
- </div>
2107
- </div>
2108
-
2109
- <span className="shrink-0 text-[11px] text-muted-foreground">
2110
- {formatParamBoundLabel(minParamStep, "min")} to {formatParamBoundLabel(maxParamStep, "max")}
2111
- </span>
2112
- </div>
2113
- </div>
2114
  </div>
2115
  )}
2116
 
@@ -2135,13 +1871,6 @@ function MultiMetricLeaderboard({
2135
  {isResearchView ? "Developer" : "Provider"}
2136
  {getSortIndicator("developer")}
2137
  </th>
2138
- <th
2139
- className="num"
2140
- style={{ width: 110, cursor: "pointer" }}
2141
- onClick={() => handleSort("coverage")}
2142
- >
2143
- Coverage{getSortIndicator("coverage")}
2144
- </th>
2145
  {visibleMetrics.map((metric) => {
2146
  const showSubtaskTopline =
2147
  !hasSubtaskTabs &&
@@ -2248,10 +1977,6 @@ function MultiMetricLeaderboard({
2248
  >
2249
  {row.model_info.developer ?? "Unknown developer"}
2250
  </div>
2251
- <RowSignalsCompact
2252
- annotations={getRowLevelAnnotations(row, visibleMetrics)}
2253
- className="mt-1"
2254
- />
2255
  </div>
2256
  </div>
2257
  </td>
@@ -2262,11 +1987,6 @@ function MultiMetricLeaderboard({
2262
  </div>
2263
  </td>
2264
 
2265
- <td className="num align-top tabular-nums" style={{ fontSize: 13, fontWeight: 600 }}>
2266
- {getVisibleMetricCount(row)}
2267
- <span style={{ color: "var(--fg-subtle)", fontWeight: 400 }}>/{visibleMetrics.length}</span>
2268
- </td>
2269
-
2270
  {visibleMetrics.map((metric) => {
2271
  const score = row.values[metric.column_key]
2272
  const annotations = row.annotations_by_metric?.[metric.column_key]
@@ -2561,23 +2281,40 @@ function BenchmarkCardPanel({
2561
  const license = ethical.data_licensing ?? ""
2562
  const shortLicense = license && license !== "Not specified" ? license : null
2563
 
 
 
 
 
 
 
 
 
 
 
2564
  return (
2565
  <div className="ec-card" style={{ padding: 0, overflow: "hidden" }}>
2566
- <div
2567
- style={{
2568
- padding: "16px 20px",
2569
- background: "var(--bg-warm)",
2570
- borderBottom: "1px solid var(--border-soft)",
2571
- }}
2572
- >
2573
- <div className="flex flex-wrap items-center gap-3 mb-1.5">
2574
- <BookOpen className="h-4 w-4" style={{ color: "var(--fg-muted)" }} />
2575
- <span className="kicker kicker-fg" style={{ fontSize: 12, letterSpacing: "0.16em" }}>
2576
- Benchmark Card
2577
- </span>
2578
- {shortLicense && (
2579
- <span className="ec-tag outline">{shortLicense}</span>
2580
- )}
 
 
 
 
 
 
 
2581
  {(flaggedFields.length > 0 || missingFields.length > 0) && (
2582
  <span
2583
  className="font-mono inline-flex items-center gap-1"
@@ -2596,34 +2333,11 @@ function BenchmarkCardPanel({
2596
  </span>
2597
  )}
2598
  </div>
2599
- <div className="text-[12px]" style={{ color: "var(--fg-muted)" }}>
2600
- Structured metadata about this benchmark: what it measures, how it was built, and known limitations.
2601
- </div>
2602
- </div>
2603
 
2604
  <div className="space-y-6 p-5 sm:p-6">
2605
  {knownIssues.length > 0 && <KnownIssuesPanel issues={knownIssues} variant="full" />}
2606
 
2607
- {/* Overview + domains */}
2608
- <div className="space-y-3">
2609
- <p className="text-sm leading-6 text-muted-foreground">{details.overview}</p>
2610
-
2611
- <div className="flex flex-wrap gap-2">
2612
- {domains.map((d) => (
2613
- <span key={d} className="ec-tag outline">
2614
- <Tag className="h-3 w-3 shrink-0" />
2615
- {d}
2616
- </span>
2617
- ))}
2618
- {languages.map((l) => (
2619
- <span key={l} className="ec-tag outline">
2620
- <Globe className="h-3 w-3 shrink-0" />
2621
- {l}
2622
- </span>
2623
- ))}
2624
- </div>
2625
- </div>
2626
-
2627
  <div className="grid gap-3 sm:grid-cols-2 xl:grid-cols-3">
2628
  {/* Goal */}
2629
  <div
 
3
  import { useAudienceMode } from "@/components/audience-mode-provider"
4
  import { Fragment, useEffect, useMemo, useState } from "react"
5
  import Link from "next/link"
6
+ import { BenchmarkSignalsStrip } from "@/components/signals/benchmark-signals-strip"
7
  import { CompletenessPanel } from "@/components/signals/completeness-panel"
8
  import { ComparabilityPanel } from "@/components/signals/comparability-panel"
 
9
  import { SignalsRowBadges } from "@/components/signals/signals-row-badges"
 
10
  import { getCompletenessPopulatedCount } from "@/components/signals/signal-utils"
11
  import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible"
12
  import { ScoreDistribution } from "@/components/score-distribution"
13
+ import { ParamRangePicker } from "@/components/param-range-picker"
14
+ import {
15
+ PARAM_RANGE_MAX_INDEX,
16
+ paramStepToNumeric,
17
+ parseParamsBillionsFromText,
18
+ parseParamsBillionsFromModelName,
19
+ } from "@/lib/param-range"
20
  import {
21
  Dialog,
22
  DialogContent,
 
270
  )
271
  }
272
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
  function getParamsBillionsFromModelInfo(modelInfo: ModelResultForBenchmark["model_info"]) {
274
  const additionalDetails = modelInfo.additional_details
275
  const rawParamsBillions =
 
473
  const [expandedRows, setExpandedRows] = useState<Record<string, boolean>>({})
474
  const [leaderboardPage, setLeaderboardPage] = useState(1)
475
  const [minParamStep, setMinParamStep] = useState(0)
476
+ const [maxParamStep, setMaxParamStep] = useState(PARAM_RANGE_MAX_INDEX)
477
 
478
  const maxScore = summary.metric_config.max_score ?? 1
479
  const minScore = summary.metric_config.min_score ?? 0
480
  const range = maxScore - minScore
481
 
482
  const normalizeScore = (raw: number) => (range > 0 ? (raw - minScore) / range : raw)
 
 
 
 
 
 
 
 
 
 
 
483
 
484
+ const numericMinParams = useMemo(() => paramStepToNumeric(minParamStep, "min"), [minParamStep])
485
+ const numericMaxParams = useMemo(() => paramStepToNumeric(maxParamStep, "max"), [maxParamStep])
 
 
 
 
 
486
 
487
  const sortedResults = useMemo(
488
  () =>
 
492
  [summary.model_results, summary.metric_config.lower_is_better]
493
  )
494
 
495
+ const [showUnknownSize, setShowUnknownSize] = useState(true)
496
+
497
  const hasParameterData = useMemo(
498
  () => sortedResults.some((result) => getParamsBillions(result) != null),
499
  [sortedResults]
 
503
  return sortedResults.filter((modelResult) => {
504
  const paramsBillions = getParamsBillions(modelResult)
505
 
506
+ if (paramsBillions == null) return showUnknownSize
 
 
 
 
 
 
507
 
508
+ if (numericMinParams != null && paramsBillions < numericMinParams) return false
509
+ if (numericMaxParams != null && paramsBillions > numericMaxParams) return false
510
  return true
511
  })
512
+ }, [numericMaxParams, numericMinParams, showUnknownSize, sortedResults])
513
 
514
  const leaderboardRows = useMemo<LeaderboardRow[]>(() => {
515
  let currentRank = 0
 
562
  [key]: !current[key],
563
  }))
564
 
 
 
 
 
565
  const headerOrg = summary.composite_benchmark_name && summary.composite_benchmark_name !== summary.evaluation_name
566
  ? summary.composite_benchmark_name
567
  : null
 
576
  <div className="space-y-12">
577
  {/* HERO — paper §3.1 ------------------------------------------------ */}
578
  <header className="motion-academic-enter">
 
579
  <h1
580
  className="font-bold tracking-[-0.025em]"
581
+ style={{ fontSize: "clamp(40px, 5vw, 60px)", lineHeight: 1.04, margin: "0 0 12px" }}
582
  >
583
  {summary.evaluation_name}
584
  </h1>
 
672
 
673
  <CollapsibleContent className="mt-3">
674
  <div className="space-y-4">
675
+ {/* Four interpretive signals (paper §4.2.1), benchmark-level. */}
676
+ <BenchmarkSignalsStrip summary={summary} />
677
+
678
  {/* Metric spec / nested datalist (paper-aligned hairline def-list) */}
679
  <div className="ec-card warm" style={{ padding: "18px 22px" }}>
680
  <div className="kicker mb-3">
 
877
  </div>
878
  )}
879
 
880
+ {hasParameterData && (
881
+ <div className="mb-4">
882
+ <ParamRangePicker
883
+ variant="promo"
884
+ headline="Parameter range"
885
+ subline="Narrow the leaderboard to comparable model sizes."
886
+ minStep={minParamStep}
887
+ maxStep={maxParamStep}
888
+ onMinChange={setMinParamStep}
889
+ onMaxChange={setMaxParamStep}
890
+ onReset={() => {
891
+ setMinParamStep(0)
892
+ setMaxParamStep(PARAM_RANGE_MAX_INDEX)
893
  }}
894
+ showUnknownSize={showUnknownSize}
895
+ onShowUnknownSizeChange={setShowUnknownSize}
896
+ />
897
+ </div>
898
+ )}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
899
 
900
+ <div className="ec-card" style={{ padding: 0, overflow: "hidden" }}>
901
  <div className="overflow-x-auto">
902
  <table className="ec-htable" style={{ minWidth: 980 }}>
903
  <thead>
 
1020
  Avg of {modelResult.aggregate_components.length}
1021
  </div>
1022
  )}
 
1023
  </div>
1024
  </div>
1025
  </td>
 
1190
  )}
1191
  </DetailPanel>
1192
 
 
 
 
 
1193
  <DetailPanel
1194
  title={isResearchView ? "Score Breakdown" : "Metric Summary"}
1195
  subtitle={
 
1447
  isResearchView: boolean
1448
  }) {
1449
  const [page, setPage] = useState(1)
1450
+ // Default sort: the first root-scope metric (the benchmark's overall
1451
+ // score), falling back to the first metric overall, then to model name.
1452
+ // We don't sort by metric coverage by default — coverage tells you how
1453
+ // many slices reported, not how the model performed.
1454
+ const [sortKey, setSortKey] = useState<string>(() => {
1455
+ const metrics = summary.leaderboard_metrics ?? []
1456
+ const root = metrics.find((m) => m.scope === "root")
1457
+ return root?.column_key ?? metrics[0]?.column_key ?? "model"
1458
+ })
1459
  const [sortDirection, setSortDirection] = useState<"asc" | "desc">("desc")
1460
  const [activeSubtaskTab, setActiveSubtaskTab] = useState<string>("all")
1461
  const [minParamStep, setMinParamStep] = useState(0)
1462
+ const [maxParamStep, setMaxParamStep] = useState(PARAM_RANGE_MAX_INDEX)
1463
  const [expandedRows, setExpandedRows] = useState<Record<string, boolean>>({})
1464
 
1465
  // Index ModelResultForBenchmark entries by model_info.id so we can power the
 
1497
  [allMetricKeys]
1498
  )
1499
  const [visibleMetricKeys, setVisibleMetricKeys] = useState<string[]>(() => defaultVisibleMetricKeys)
 
1500
  const leaderboardMetricMap = useMemo(
1501
  () => new Map(leaderboardMetrics.map((metric) => [metric.column_key, metric])),
1502
  [leaderboardMetrics]
 
1543
  [visibleMetrics]
1544
  )
1545
 
1546
+ const numericMinParams = useMemo(() => paramStepToNumeric(minParamStep, "min"), [minParamStep])
1547
+ const numericMaxParams = useMemo(() => paramStepToNumeric(maxParamStep, "max"), [maxParamStep])
1548
+ const [showUnknownSize, setShowUnknownSize] = useState(true)
 
 
 
 
1549
 
1550
+ const hasParameterData = useMemo(
1551
+ () => leaderboardRows.some((row) => getParamsBillionsFromModelInfo(row.model_info) != null),
1552
+ [leaderboardRows]
1553
+ )
 
 
 
1554
 
1555
  const filteredRows = useMemo(() => {
1556
+ return leaderboardRows.filter((row) => {
1557
+ const paramsBillions = getParamsBillionsFromModelInfo(row.model_info)
 
1558
 
1559
+ if (paramsBillions == null) return showUnknownSize
 
 
 
 
 
 
1560
 
1561
+ if (numericMinParams != null && paramsBillions < numericMinParams) return false
1562
+ if (numericMaxParams != null && paramsBillions > numericMaxParams) return false
1563
+ return true
1564
+ })
1565
+ }, [leaderboardRows, numericMaxParams, numericMinParams, showUnknownSize])
1566
 
1567
  const sortedRows = useMemo(() => {
1568
  const rows = [...filteredRows]
 
1595
  return sortDirection === "asc" ? comparison : -comparison
1596
  }
1597
 
 
 
 
 
 
1598
  if (sortKey === "updated") {
1599
  const comparison = compareTimestamps(left.evaluation_timestamp, right.evaluation_timestamp) || compareNames(left, right)
1600
  return sortDirection === "asc" ? comparison : -comparison
 
1637
 
1638
  useEffect(() => {
1639
  if (leaderboardMetricMap.has(sortKey) && !visibleMetricColumnKeySet.has(sortKey)) {
1640
+ // The currently-sorted metric was hidden — fall back to the first
1641
+ // visible root-scope metric, then the first visible metric overall,
1642
+ // then to the model name.
1643
+ const visibleRoot = leaderboardMetrics.find(
1644
+ (m) => m.scope === "root" && visibleMetricColumnKeySet.has(m.column_key),
1645
+ )
1646
+ const fallback = visibleRoot?.column_key
1647
+ ?? leaderboardMetrics.find((m) => visibleMetricColumnKeySet.has(m.column_key))?.column_key
1648
+ ?? "model"
1649
+ setSortKey(fallback)
1650
  setSortDirection("desc")
1651
  }
1652
+ }, [leaderboardMetricMap, leaderboardMetrics, sortKey, visibleMetricColumnKeySet])
1653
 
1654
  useEffect(() => {
1655
  if (!hasSubtaskTabs) {
 
1668
  }
1669
  }, [activeSubtaskTab, hasSubtaskTabs, singleMetricSubtaskTabs])
1670
 
 
 
 
 
 
1671
  const pagedRows = useMemo(
1672
  () => sortedRows.slice(0, page * 50),
1673
  [page, sortedRows]
 
1688
  })
1689
  }
1690
 
 
 
 
 
 
 
1691
  const getDefaultSortDirection = (key: string): "asc" | "desc" => {
1692
  if (key === "model" || key === "developer") {
1693
  return "asc"
1694
  }
1695
 
1696
+ if (key === "updated") {
1697
  return "desc"
1698
  }
1699
 
 
1832
 
1833
  {hasParameterData && (
1834
  <div className="border-b bg-background px-5 py-4 sm:px-6">
1835
+ <ParamRangePicker
1836
+ variant="promo"
1837
+ headline="Parameter range"
1838
+ subline="Narrow the matrix to comparable model sizes."
1839
+ minStep={minParamStep}
1840
+ maxStep={maxParamStep}
1841
+ onMinChange={setMinParamStep}
1842
+ onMaxChange={setMaxParamStep}
1843
+ onReset={() => {
1844
+ setMinParamStep(0)
1845
+ setMaxParamStep(PARAM_RANGE_MAX_INDEX)
1846
+ }}
1847
+ showUnknownSize={showUnknownSize}
1848
+ onShowUnknownSizeChange={setShowUnknownSize}
1849
+ />
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1850
  </div>
1851
  )}
1852
 
 
1871
  {isResearchView ? "Developer" : "Provider"}
1872
  {getSortIndicator("developer")}
1873
  </th>
 
 
 
 
 
 
 
1874
  {visibleMetrics.map((metric) => {
1875
  const showSubtaskTopline =
1876
  !hasSubtaskTabs &&
 
1977
  >
1978
  {row.model_info.developer ?? "Unknown developer"}
1979
  </div>
 
 
 
 
1980
  </div>
1981
  </div>
1982
  </td>
 
1987
  </div>
1988
  </td>
1989
 
 
 
 
 
 
1990
  {visibleMetrics.map((metric) => {
1991
  const score = row.values[metric.column_key]
1992
  const annotations = row.annotations_by_metric?.[metric.column_key]
 
2281
  const license = ethical.data_licensing ?? ""
2282
  const shortLicense = license && license !== "Not specified" ? license : null
2283
 
2284
+ // The outer collapsible trigger names the panel; the prominent top
2285
+ // strip surfaces what readers most often want at a glance — domain
2286
+ // and language tags, license, and any flagged/missing-field badge.
2287
+ const hasChipStrip =
2288
+ domains.length > 0 ||
2289
+ languages.length > 0 ||
2290
+ Boolean(shortLicense) ||
2291
+ flaggedFields.length > 0 ||
2292
+ missingFields.length > 0
2293
+
2294
  return (
2295
  <div className="ec-card" style={{ padding: 0, overflow: "hidden" }}>
2296
+ {hasChipStrip && (
2297
+ <div
2298
+ className="flex flex-wrap items-center gap-2"
2299
+ style={{
2300
+ padding: "10px 20px",
2301
+ background: "var(--bg-warm)",
2302
+ borderBottom: "1px solid var(--border-soft)",
2303
+ }}
2304
+ >
2305
+ {domains.map((d) => (
2306
+ <span key={`d-${d}`} className="ec-tag outline">
2307
+ <Tag className="h-3 w-3 shrink-0" />
2308
+ {d}
2309
+ </span>
2310
+ ))}
2311
+ {languages.map((l) => (
2312
+ <span key={`l-${l}`} className="ec-tag outline">
2313
+ <Globe className="h-3 w-3 shrink-0" />
2314
+ {l}
2315
+ </span>
2316
+ ))}
2317
+ {shortLicense && <span className="ec-tag outline">{shortLicense}</span>}
2318
  {(flaggedFields.length > 0 || missingFields.length > 0) && (
2319
  <span
2320
  className="font-mono inline-flex items-center gap-1"
 
2333
  </span>
2334
  )}
2335
  </div>
2336
+ )}
 
 
 
2337
 
2338
  <div className="space-y-6 p-5 sm:p-6">
2339
  {knownIssues.length > 0 && <KnownIssuesPanel issues={knownIssues} variant="full" />}
2340
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2341
  <div className="grid gap-3 sm:grid-cols-2 xl:grid-cols-3">
2342
  {/* Goal */}
2343
  <div
components/family-table.tsx CHANGED
@@ -5,17 +5,16 @@ import { useRouter } from "next/navigation"
5
  import { ArrowUpRight, ChevronDown, ChevronRight } from "lucide-react"
6
 
7
  import type { HierarchyFamily, HierarchyLeaf } from "@/lib/backend-artifacts"
8
- import type { CategoryType } from "@/lib/benchmark-schema"
9
-
10
- const CATEGORY_DOT: Record<string, string> = {
11
- General: "bg-sky-400",
12
- Reasoning: "bg-violet-400",
13
- Agentic: "bg-amber-400",
14
- Safety: "bg-rose-400",
15
- Code: "bg-emerald-400",
16
- Math: "bg-indigo-400",
17
- Multilingual: "bg-teal-400",
18
- }
19
 
20
  const LEAVES_INLINE_MIN = 2
21
  const LEAVES_INLINE_MAX = 50
@@ -23,89 +22,172 @@ const LEAVES_INLINE_MAX = 50
23
  interface FamilyTableProps {
24
  families: HierarchyFamily[]
25
  totalModels: number
 
 
 
 
 
 
 
 
 
 
 
 
26
  }
27
 
28
  function slugify(value: string | null | undefined): string {
29
  return (value ?? "").toLowerCase().replace(/[^a-z0-9]+/g, "")
30
  }
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  interface LeafEntry {
33
  id: string
34
  leafKey: string
35
  leafName: string
36
  evalsCount: number
 
37
  }
38
 
39
- function collectLeafEntries(fam: HierarchyFamily): LeafEntry[] {
 
 
 
40
  const out: LeafEntry[] = []
41
  for (const leaf of fam.leaves ?? []) {
42
  const ids = leaf.eval_summary_ids ?? []
43
  if (ids.length === 0) continue
 
 
 
 
 
 
 
 
 
 
 
 
44
  out.push({
45
  id: ids[0],
46
  leafKey: leaf.key,
47
  leafName: leaf.display_name || leaf.key,
48
  evalsCount: leaf.evals_count ?? ids.length,
 
49
  })
50
  }
51
  return out
52
  }
53
 
54
  /**
55
- * Pick the eval_summary_id that best matches a family's stated display_name.
 
 
 
56
  *
57
- * Backend hierarchy data sometimes has a family whose display_name names one
58
- * specific leaf (e.g. family `llm_stats`, display_name "HumanEval", with 471
59
- * leaves). The legacy "directIds[0]" pick navigates to whichever leaf was
60
- * processed first (often `aa_index`), which is wrong. This helper:
 
61
  *
62
- * 1. If there is a leaf whose slug matches the family's display_name slug,
63
- * prefer that leaf's id. (`HumanEval` → leaf `humaneval`.)
64
- * 2. Else if there is a direct family-level id whose slug equals the family
65
- * key slug, prefer that (genuine family-level page).
66
- * 3. Otherwise fall back to the first available id.
67
  */
68
  function pickFamilyNavId(fam: HierarchyFamily, leafEntries: LeafEntry[]): string | null {
69
  const directIds = fam.eval_summary_ids ?? []
70
- const all: Array<{ id: string; source: "direct" | "leaf"; leafKey?: string; leafName?: string }> = [
71
- ...directIds.map((id) => ({ id, source: "direct" as const })),
72
- ...leafEntries.map((l) => ({ id: l.id, source: "leaf" as const, leafKey: l.leafKey, leafName: l.leafName })),
73
- ]
74
- if (all.length === 0) return null
75
- if (all.length === 1) return all[0].id
 
 
 
 
 
 
 
 
 
 
76
 
77
  const famNameSlug = slugify(fam.display_name)
78
  const famKeySlug = slugify(fam.key)
79
 
80
- // 1. Leaf slug matches family display_name: e.g. display "HumanEval" → leaf "humaneval"
81
- if (famNameSlug && famNameSlug !== famKeySlug) {
82
- for (const entry of all) {
83
- if (entry.source !== "leaf") continue
84
- if (slugify(entry.leafKey) === famNameSlug || slugify(entry.leafName) === famNameSlug) {
85
- return entry.id
86
- }
87
  }
88
  }
89
 
90
- // 2. Direct family-level id: id slug equals family key slug
91
- for (const entry of all) {
92
- if (entry.source !== "direct") continue
93
- if (slugify(entry.id) === famKeySlug) return entry.id
94
  }
95
 
96
- // 3. Direct id starting with the family key only (a true family-level summary)
97
- for (const entry of all) {
98
- if (entry.source !== "direct") continue
99
- const idSlug = slugify(entry.id)
100
- if (idSlug.startsWith(famKeySlug) && idSlug.length === famKeySlug.length) {
101
- return entry.id
102
- }
103
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
- // 4. Fall back: leaves first, then direct
106
- const leafFallback = all.find((e) => e.source === "leaf")
107
- if (leafFallback) return leafFallback.id
108
- return all[0].id
 
 
 
 
 
 
 
 
 
109
  }
110
 
111
  interface RowData {
@@ -114,23 +196,63 @@ interface RowData {
114
  name: string
115
  keySlug: string
116
  category: CategoryType
117
- composites: number
118
  benchmarks: number
119
- slices: number
120
- metrics: number
121
  evalsCount: number
122
  leaves: LeafEntry[]
123
  /** True when the family has many leaves with no clean family-level summary —
124
  * we open it expanded so the user picks a leaf directly. */
125
  isAggregator: boolean
 
126
  }
127
 
128
- export function FamilyTable({ families, totalModels }: FamilyTableProps) {
 
 
 
 
 
 
129
  const router = useRouter()
130
  const [expanded, setExpanded] = useState<Record<string, boolean>>({})
131
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  const rows = useMemo<RowData[]>(() => {
133
- return families.map((fam) => {
 
134
  const composites = fam.composites ?? []
135
  const standalone = fam.standalone_benchmarks ?? []
136
  const benchmarks = fam.benchmarks ?? []
@@ -141,12 +263,6 @@ export function FamilyTable({ families, totalModels }: FamilyTableProps) {
141
  ...benchmarks,
142
  ...composites.flatMap((c) => c.benchmarks ?? []),
143
  ]
144
- const sliceCount =
145
- fam.slices?.length ??
146
- allBenchmarks.reduce(
147
- (sum, b) => sum + ((b as { slices?: unknown[] }).slices?.length ?? 0),
148
- 0,
149
- )
150
  const metricCount =
151
  (fam.metrics?.length ?? 0) +
152
  allBenchmarks.reduce(
@@ -156,51 +272,70 @@ export function FamilyTable({ families, totalModels }: FamilyTableProps) {
156
  const benchmarkCount =
157
  allBenchmarks.length > 0 ? allBenchmarks.length : leaves.length
158
 
159
- const leafEntries = collectLeafEntries(fam)
160
  const navId = pickFamilyNavId(fam, leafEntries)
161
 
162
- // An "aggregator" family is one whose display_name doesn't really
163
- // describe a single benchmark (its leaves are heterogeneous). We
164
- // detect this by counting leaves and, when there are many, prefer
165
- // showing the leaf list rather than relying on the family-level id.
166
- const isAggregator = leafEntries.length >= LEAVES_INLINE_MIN
 
 
 
 
 
 
 
 
 
 
 
 
 
167
 
168
- return {
 
 
 
 
 
169
  key: fam.key,
170
  navId,
171
- name: fam.display_name,
172
  keySlug: fam.key,
173
  category: (fam.category ?? "General") as CategoryType,
174
- composites: composites.length,
175
  benchmarks: benchmarkCount,
176
- slices: sliceCount,
177
- metrics: metricCount,
178
  evalsCount: fam.evals_count ?? metricCount,
179
- leaves: leafEntries,
180
  isAggregator,
181
- }
182
- })
183
- }, [families])
 
 
 
184
 
185
  return (
186
  <div className="overflow-x-auto">
187
  <table className="ec-htable">
188
  <thead>
189
  <tr>
190
- <th style={{ width: "30%" }}>Family</th>
191
  <th>Category</th>
192
- <th className="num">Suites</th>
193
  <th className="num">Benchmarks</th>
194
- <th className="num">Slices</th>
195
- <th className="num">Metrics</th>
196
  <th className="num">Reported results</th>
197
  <th style={{ width: 90 }} />
198
  </tr>
199
  </thead>
200
  <tbody>
201
  {rows.map((row) => {
202
- const dotClass = CATEGORY_DOT[row.category] ?? "bg-stone-400"
203
- const isExpanded = expanded[row.key] ?? false
 
 
 
 
204
  const expandable = row.isAggregator
205
  const visibleLeaves = isExpanded
206
  ? row.leaves.slice(0, LEAVES_INLINE_MAX)
@@ -213,12 +348,16 @@ export function FamilyTable({ families, totalModels }: FamilyTableProps) {
213
  <Fragment key={row.key}>
214
  <tr
215
  onClick={(event) => {
216
- // Allow chevron click without navigating
217
  const target = event.target as HTMLElement
218
  if (target.closest("[data-row-toggle]")) return
219
- if (row.navId) router.push(`/evals/${encodeURIComponent(row.navId)}`)
 
 
 
 
220
  }}
221
- style={{ cursor: row.navId ? "pointer" : "default" }}
222
  >
223
  <td>
224
  <div className="flex items-start gap-2.5 min-w-0">
@@ -240,14 +379,18 @@ export function FamilyTable({ families, totalModels }: FamilyTableProps) {
240
  ) : (
241
  <span className="-ml-1 mt-0.5 inline-block h-4 w-4" aria-hidden />
242
  )}
243
- <span
244
- className={`shrink-0 mt-1.5 h-2 w-2 rounded-full ${dotClass}`}
245
- aria-hidden
246
- />
247
  <div className="min-w-0">
248
  <div className="font-semibold text-[14px] text-[color:var(--fg)] truncate">
249
  {row.name}
250
  </div>
 
 
 
 
 
 
 
 
251
  <div className="font-mono text-[10px] tracking-[0.06em] text-[color:var(--fg-subtle)] mt-0.5 truncate">
252
  {row.keySlug}
253
  {expandable && (
@@ -264,18 +407,9 @@ export function FamilyTable({ families, totalModels }: FamilyTableProps) {
264
  {row.category}
265
  </span>
266
  </td>
267
- <td className="num font-mono text-[13px]">
268
- {row.composites > 0 ? row.composites.toLocaleString() : "—"}
269
- </td>
270
  <td className="num font-mono text-[13px]">
271
  {row.benchmarks.toLocaleString()}
272
  </td>
273
- <td className="num font-mono text-[13px]">
274
- {row.slices > 0 ? row.slices.toLocaleString() : "—"}
275
- </td>
276
- <td className="num font-mono text-[13px]">
277
- {row.metrics > 0 ? row.metrics.toLocaleString() : "—"}
278
- </td>
279
  <td className="num font-mono text-[13px]">
280
  {row.evalsCount.toLocaleString()}
281
  {totalModels > 0 && (
@@ -292,7 +426,7 @@ export function FamilyTable({ families, totalModels }: FamilyTableProps) {
292
 
293
  {isExpanded && visibleLeaves.length > 0 && (
294
  <tr style={{ background: "var(--bg-warm)" }}>
295
- <td colSpan={8} style={{ padding: 0 }}>
296
  <div style={{ padding: "10px 24px 14px 64px" }}>
297
  <div
298
  className="font-mono uppercase mb-2"
 
5
  import { ArrowUpRight, ChevronDown, ChevronRight } from "lucide-react"
6
 
7
  import type { HierarchyFamily, HierarchyLeaf } from "@/lib/backend-artifacts"
8
+ import type { BenchmarkCard, CategoryType } from "@/lib/benchmark-schema"
9
+ import type { BenchmarkEvalListItem } from "@/lib/eval-processing"
10
+
11
+ /**
12
+ * Category chips are intentionally not colour-coded: the per-category
13
+ * dot-colour map that used to be defined here has been removed and the
14
+ * category label is rendered with the neutral chip styling.
15
+ */
16
+ // Categories use the neutral chip styling — colour-coded chips read as
17
+ // noise against the editorial palette.
 
18
 
19
  const LEAVES_INLINE_MIN = 2
20
  const LEAVES_INLINE_MAX = 50
 
22
  interface FamilyTableProps {
23
  families: HierarchyFamily[]
24
  totalModels: number
25
+ evalItems?: Map<string, BenchmarkEvalListItem>
26
+ /** Optional benchmark-metadata index (keyed by benchmark / leaf / family
27
+ * key). Used to look up per-leaf domains when the hierarchy doesn't
28
+ * carry `leaf.tags.domains`, so the domain filter works on data that
29
+ * only ships domains via the metadata file. */
30
+ benchmarkCards?: Record<string, BenchmarkCard>
31
+ /** Lower-cased domain slugs to filter the listing. When non-empty, every
32
+ * expandable family is auto-expanded and its leaves are restricted to
33
+ * those that touch one of the selected domains. Single-benchmark
34
+ * families are kept only when their domains intersect the filter.
35
+ * Pass `null`/`undefined` to disable filtering. */
36
+ domainFilter?: Set<string> | null
37
  }
38
 
39
  function slugify(value: string | null | undefined): string {
40
  return (value ?? "").toLowerCase().replace(/[^a-z0-9]+/g, "")
41
  }
42
 
43
+ /** Render a family key as a human-readable title — used as a fallback when
44
+ * the backend `display_name` is misleading (e.g. names a single leaf instead
45
+ * of the family). Common acronyms stay uppercase; everything else is title
46
+ * case. */
47
+ const FAMILY_KEY_ACRONYMS = new Set([
48
+ "llm", "llms", "aa", "hf", "api", "cli", "sql", "gpt", "qa", "ai", "ml",
49
+ "nlp", "rl", "vqa", "vlm", "mt", "cv",
50
+ ])
51
+ function humanizeFamilyKey(key: string): string {
52
+ return key
53
+ .split(/[_\-\s]+/)
54
+ .filter(Boolean)
55
+ .map((word) => {
56
+ if (FAMILY_KEY_ACRONYMS.has(word.toLowerCase())) return word.toUpperCase()
57
+ return word.charAt(0).toUpperCase() + word.slice(1).toLowerCase()
58
+ })
59
+ .join("-")
60
+ }
61
+
62
  interface LeafEntry {
63
  id: string
64
  leafKey: string
65
  leafName: string
66
  evalsCount: number
67
+ domains: string[]
68
  }
69
 
70
+ function collectLeafEntries(
71
+ fam: HierarchyFamily,
72
+ benchmarkCards?: Record<string, BenchmarkCard>,
73
+ ): LeafEntry[] {
74
  const out: LeafEntry[] = []
75
  for (const leaf of fam.leaves ?? []) {
76
  const ids = leaf.eval_summary_ids ?? []
77
  if (ids.length === 0) continue
78
+ // Domain sources, in order of trust:
79
+ // (1) hierarchy `leaf.tags.domains` — sometimes absent
80
+ // (2) benchmark-metadata keyed by leaf.key
81
+ // (3) benchmark-metadata keyed by the leaf's eval_summary_id
82
+ const collected = new Set<string>()
83
+ for (const d of leaf.tags?.domains ?? []) collected.add(d.toLowerCase())
84
+ const cardByLeaf = benchmarkCards?.[leaf.key]
85
+ for (const d of cardByLeaf?.benchmark_details?.domains ?? []) collected.add(d.toLowerCase())
86
+ for (const id of ids) {
87
+ const cardById = benchmarkCards?.[id]
88
+ for (const d of cardById?.benchmark_details?.domains ?? []) collected.add(d.toLowerCase())
89
+ }
90
  out.push({
91
  id: ids[0],
92
  leafKey: leaf.key,
93
  leafName: leaf.display_name || leaf.key,
94
  evalsCount: leaf.evals_count ?? ids.length,
95
+ domains: Array.from(collected),
96
  })
97
  }
98
  return out
99
  }
100
 
101
  /**
102
+ * Pick the eval_summary_id to navigate to when the user clicks the family
103
+ * row. Returns null when the family has no genuine family-level summary —
104
+ * in that case the row click should expand the leaf list instead of
105
+ * opening one arbitrary child.
106
  *
107
+ * Some backend families flatten their leaf eval_summary_ids into the
108
+ * family's own `eval_summary_ids` array (e.g. family `llm_stats` whose
109
+ * direct ids are `llm_stats_aa_index`, `llm_stats_humaneval`, ... each
110
+ * a leaf summary). Those are NOT family-level composites; treating them
111
+ * as such is what made clicking "LLM-Stats" land on AA Index.
112
  *
113
+ * We filter direct ids down to those that are NOT also leaf ids. Whatever
114
+ * remains is a real family-level summary. Then we apply slug-based
115
+ * priority among those.
 
 
116
  */
117
  function pickFamilyNavId(fam: HierarchyFamily, leafEntries: LeafEntry[]): string | null {
118
  const directIds = fam.eval_summary_ids ?? []
119
+ const leafIdSet = new Set(leafEntries.map((l) => l.id))
120
+
121
+ // Real family-level summaries: direct ids that aren't actually leaf ids
122
+ // pulled up to the family. These resolve to is_aggregated/composite
123
+ // summaries on the detail page.
124
+ const compositeDirectIds = directIds.filter((id) => !leafIdSet.has(id))
125
+
126
+ if (compositeDirectIds.length === 0) {
127
+ // No genuine family-level composite. If there's exactly one leaf, the
128
+ // family is just that leaf in disguise — open it. Otherwise return
129
+ // null and let the caller expand the list.
130
+ if (leafEntries.length === 1) return leafEntries[0].id
131
+ return null
132
+ }
133
+
134
+ if (compositeDirectIds.length === 1) return compositeDirectIds[0]
135
 
136
  const famNameSlug = slugify(fam.display_name)
137
  const famKeySlug = slugify(fam.key)
138
 
139
+ // 1. Direct composite whose slug equals the family display_name slug
140
+ if (famNameSlug) {
141
+ for (const id of compositeDirectIds) {
142
+ if (slugify(id) === famNameSlug) return id
 
 
 
143
  }
144
  }
145
 
146
+ // 2. Direct composite whose slug equals the family key slug
147
+ for (const id of compositeDirectIds) {
148
+ if (slugify(id) === famKeySlug) return id
 
149
  }
150
 
151
+ // 3. First direct composite
152
+ return compositeDirectIds[0]
153
+ }
154
+
155
+ /** Returns a one-line description for the family — but only when the
156
+ * description applies to the whole family. Specifically: we only use the
157
+ * benchmark_card overview attached to the family's *own* navigation
158
+ * target (a family-level/composite eval). We don't borrow descriptions
159
+ * from individual leaves, because a leaf's description describes that
160
+ * one benchmark, not the family as a whole. */
161
+ function pickFamilyDescription(
162
+ navId: string | null,
163
+ leafEntries: LeafEntry[],
164
+ evalItems: Map<string, BenchmarkEvalListItem> | undefined,
165
+ ): string | null {
166
+ if (!evalItems || !navId) return null
167
+ // If navId resolved to a leaf (single-benchmark family), the leaf's
168
+ // description IS the family's description — that case is fine.
169
+ // If navId resolved to a composite, ditto. The only case we exclude is
170
+ // navId === null (no family-level summary), which the early return
171
+ // covers.
172
+ void leafEntries
173
+ const overview = evalItems.get(navId)?.benchmark_card?.benchmark_details?.overview
174
+ if (!overview) return null
175
+ return overview.length > 140 ? overview.slice(0, 137) + "…" : overview
176
+ }
177
 
178
+ /** Detects whether the family's `display_name` is misleading: backend data
179
+ * sometimes labels a family after one of its leaves (e.g. family
180
+ * `llm_stats` with display_name "HumanEval"). When that's the case the
181
+ * row should be titled with the humanized key instead, so the user can
182
+ * see they're looking at a *family* rather than a single benchmark. */
183
+ function isFamilyDisplayNameMisleading(fam: HierarchyFamily, leafEntries: LeafEntry[]): boolean {
184
+ const nameSlug = slugify(fam.display_name)
185
+ if (!nameSlug) return false
186
+ if (nameSlug === slugify(fam.key)) return false
187
+ if (leafEntries.length < 2) return false
188
+ return leafEntries.some(
189
+ (l) => slugify(l.leafKey) === nameSlug || slugify(l.leafName) === nameSlug,
190
+ )
191
  }
192
 
193
  interface RowData {
 
196
  name: string
197
  keySlug: string
198
  category: CategoryType
 
199
  benchmarks: number
 
 
200
  evalsCount: number
201
  leaves: LeafEntry[]
202
  /** True when the family has at least LEAVES_INLINE_MIN leaves or no family-level
203
  * nav target — the row is expandable so the user can pick a leaf directly. */
204
  isAggregator: boolean
205
+ description: string | null
206
  }
207
 
208
+ export function FamilyTable({
209
+ families,
210
+ totalModels,
211
+ evalItems,
212
+ benchmarkCards,
213
+ domainFilter,
214
+ }: FamilyTableProps) {
215
  const router = useRouter()
216
  const [expanded, setExpanded] = useState<Record<string, boolean>>({})
217
 
218
+ const filterActive = Boolean(domainFilter && domainFilter.size > 0)
219
+
220
+ function leafMatchesFilter(leaf: LeafEntry): boolean {
221
+ if (!filterActive || !domainFilter) return true
222
+ return leaf.domains.some((d) => domainFilter.has(d))
223
+ }
224
+
225
+ function familyMatchesFilter(
226
+ fam: HierarchyFamily,
227
+ navId: string | null,
228
+ leafEntries: LeafEntry[],
229
+ ): boolean {
230
+ if (!filterActive || !domainFilter) return true
231
+ if (leafEntries.some(leafMatchesFilter)) return true
232
+ const candidates: BenchmarkCard | undefined = (() => {
233
+ if (navId) {
234
+ const fromList = evalItems?.get(navId)?.benchmark_card
235
+ if (fromList) return fromList
236
+ }
237
+ return undefined
238
+ })()
239
+ const sources: Array<string[]> = []
240
+ if (candidates) sources.push(candidates.benchmark_details?.domains ?? [])
241
+ sources.push(benchmarkCards?.[fam.key]?.benchmark_details?.domains ?? [])
242
+ for (const id of fam.eval_summary_ids ?? []) {
243
+ sources.push(benchmarkCards?.[id]?.benchmark_details?.domains ?? [])
244
+ }
245
+ for (const list of sources) {
246
+ for (const d of list) {
247
+ if (domainFilter.has(d.trim().toLowerCase())) return true
248
+ }
249
+ }
250
+ return false
251
+ }
252
+
253
  const rows = useMemo<RowData[]>(() => {
254
+ const out: RowData[] = []
255
+ for (const fam of families) {
256
  const composites = fam.composites ?? []
257
  const standalone = fam.standalone_benchmarks ?? []
258
  const benchmarks = fam.benchmarks ?? []
 
263
  ...benchmarks,
264
  ...composites.flatMap((c) => c.benchmarks ?? []),
265
  ]
 
 
 
 
 
 
266
  const metricCount =
267
  (fam.metrics?.length ?? 0) +
268
  allBenchmarks.reduce(
 
272
  const benchmarkCount =
273
  allBenchmarks.length > 0 ? allBenchmarks.length : leaves.length
274
 
275
+ const leafEntries = collectLeafEntries(fam, benchmarkCards)
276
  const navId = pickFamilyNavId(fam, leafEntries)
277
 
278
+ // An "aggregator" family has heterogeneous leaves; we expand it
279
+ // inline so the user can pick a benchmark directly. When the family
280
+ // has no real composite summary (navId === null) it's necessarily
281
+ // an aggregator — clicking the row toggles expand instead of
282
+ // navigating.
283
+ const isAggregator = leafEntries.length >= LEAVES_INLINE_MIN || navId == null
284
+
285
+ const displayName = isFamilyDisplayNameMisleading(fam, leafEntries)
286
+ ? humanizeFamilyKey(fam.key)
287
+ : fam.display_name
288
+
289
+ // Description sourcing: use only the overview of the eval item the row
290
+ // navigates to. When there's no navId, or its eval item carries no
291
+ // overview, the description is null. pickFamilyDescription deliberately
292
+ // does not borrow a leaf's overview, so an aggregator family ("HELM",
293
+ // "BFCL") with no family-level eval shows no description rather than
294
+ // one that only describes a single component benchmark.
295
+ const description = pickFamilyDescription(navId, leafEntries, evalItems)
296
 
297
+ if (!familyMatchesFilter(fam, navId, leafEntries)) continue
298
+ const visibleLeafEntries = filterActive
299
+ ? leafEntries.filter(leafMatchesFilter)
300
+ : leafEntries
301
+
302
+ out.push({
303
  key: fam.key,
304
  navId,
305
+ name: displayName,
306
  keySlug: fam.key,
307
  category: (fam.category ?? "General") as CategoryType,
 
308
  benchmarks: benchmarkCount,
 
 
309
  evalsCount: fam.evals_count ?? metricCount,
310
+ leaves: visibleLeafEntries,
311
  isAggregator,
312
+ description,
313
+ })
314
+ }
315
+ return out
316
+ // eslint-disable-next-line react-hooks/exhaustive-deps
317
+ }, [families, evalItems, benchmarkCards, domainFilter])
318
 
319
  return (
320
  <div className="overflow-x-auto">
321
  <table className="ec-htable">
322
  <thead>
323
  <tr>
324
+ <th style={{ width: "60%" }}>Family</th>
325
  <th>Category</th>
 
326
  <th className="num">Benchmarks</th>
 
 
327
  <th className="num">Reported results</th>
328
  <th style={{ width: 90 }} />
329
  </tr>
330
  </thead>
331
  <tbody>
332
  {rows.map((row) => {
333
+ // When a domain filter is active we auto-expand every aggregator
334
+ // so the matching leaves are immediately visible, but still let
335
+ // the user collapse a row manually via the chevron.
336
+ const isExpanded = filterActive
337
+ ? expanded[row.key] ?? true
338
+ : expanded[row.key] ?? false
339
  const expandable = row.isAggregator
340
  const visibleLeaves = isExpanded
341
  ? row.leaves.slice(0, LEAVES_INLINE_MAX)
 
348
  <Fragment key={row.key}>
349
  <tr
350
  onClick={(event) => {
351
+ // Allow chevron click without double-handling
352
  const target = event.target as HTMLElement
353
  if (target.closest("[data-row-toggle]")) return
354
+ if (row.navId) {
355
+ router.push(`/evals/${encodeURIComponent(row.navId)}`)
356
+ } else if (expandable) {
357
+ setExpanded((current) => ({ ...current, [row.key]: !isExpanded }))
358
+ }
359
  }}
360
+ style={{ cursor: row.navId || expandable ? "pointer" : "default" }}
361
  >
362
  <td>
363
  <div className="flex items-start gap-2.5 min-w-0">
 
379
  ) : (
380
  <span className="-ml-1 mt-0.5 inline-block h-4 w-4" aria-hidden />
381
  )}
 
 
 
 
382
  <div className="min-w-0">
383
  <div className="font-semibold text-[14px] text-[color:var(--fg)] truncate">
384
  {row.name}
385
  </div>
386
+ {row.description && (
387
+ <div
388
+ className="mt-0.5"
389
+ style={{ fontSize: 12, color: "var(--fg-muted)", lineHeight: 1.45, display: "-webkit-box", WebkitLineClamp: 2, WebkitBoxOrient: "vertical", overflow: "hidden" }}
390
+ >
391
+ {row.description}
392
+ </div>
393
+ )}
394
  <div className="font-mono text-[10px] tracking-[0.06em] text-[color:var(--fg-subtle)] mt-0.5 truncate">
395
  {row.keySlug}
396
  {expandable && (
 
407
  {row.category}
408
  </span>
409
  </td>
 
 
 
410
  <td className="num font-mono text-[13px]">
411
  {row.benchmarks.toLocaleString()}
412
  </td>
 
 
 
 
 
 
413
  <td className="num font-mono text-[13px]">
414
  {row.evalsCount.toLocaleString()}
415
  {totalModels > 0 && (
 
426
 
427
  {isExpanded && visibleLeaves.length > 0 && (
428
  <tr style={{ background: "var(--bg-warm)" }}>
429
+ <td colSpan={5} style={{ padding: 0 }}>
430
  <div style={{ padding: "10px 24px 14px 64px" }}>
431
  <div
432
  className="font-mono uppercase mb-2"
components/param-range-picker.tsx ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "use client"
2
+
3
+ import { useId } from "react"
4
+
5
+ import {
6
+ PARAM_RANGE_MARKERS,
7
+ PARAM_RANGE_MAX_INDEX,
8
+ PARAM_RANGE_VALUES,
9
+ formatParamBoundLabel,
10
+ } from "@/lib/param-range"
11
+
12
+ export type ParamRangeVariant = "default" | "inline" | "promo"
13
+
14
+ interface ParamRangePickerProps {
15
+ /** Index into PARAM_RANGE_VALUES for the lower handle (0 = "< 1B"). */
16
+ minStep: number
17
+ /** Index into PARAM_RANGE_VALUES for the upper handle (max = "> 500B"). */
18
+ maxStep: number
19
+ onMinChange: (next: number) => void
20
+ onMaxChange: (next: number) => void
21
+ /**
22
+ * `default` — Variant A: bracketed range with a labelled rail and a boxed
23
+ * mono readout, suitable for use as the headline call-out at the top of a
24
+ * leaderboard.
25
+ *
26
+ * `inline` — Variant B: a single-line picker with no boxed readout, sized
27
+ * to drop into a hairline toolbar alongside Sort and Filter pickers.
28
+ *
29
+ * `promo` — Variant C: warm-background framed slider with a left accent
30
+ * rule. Use when the slider actively reframes a chart/matrix below it.
31
+ */
32
+ variant?: ParamRangeVariant
33
+ /** Headline shown to the left of the slider (default & promo variants). */
34
+ headline?: string
35
+ /** Sub-text shown under the headline (default & promo variants). */
36
+ subline?: string
37
+ /** Callback to reset both handles to the open range. When provided, a
38
+ * Reset affordance is rendered next to the readout while the slider
39
+ * is constrained. */
40
+ onReset?: () => void
41
+ /** When defined, renders a small "Show models without known size" pill
42
+ * next to the readout. The toggle is independent of the slider — when
43
+ * off, models with no detected size are filtered out regardless of
44
+ * where the handles are. */
45
+ showUnknownSize?: boolean
46
+ onShowUnknownSizeChange?: (next: boolean) => void
47
+ className?: string
48
+ }
49
+
50
+ /**
51
+ * Themed dual-handle parameter-range picker. Shape and colour come from the
52
+ * design system: hairline rail, square outline thumbs, mono uppercase tick
53
+ * labels above the rail, and a boxed mono readout for the explicit bounds.
54
+ *
55
+ * The two `<input type="range">` elements provide native dragging + arrow-key
56
+ * a11y. The visual rail/fill/ticks/thumbs are absolutely-positioned overlays;
57
+ * the inputs themselves are kept transparent except for their thumbs (see
58
+ * `.param-range-input` in globals.css).
59
+ */
60
+ export function ParamRangePicker({
61
+ minStep,
62
+ maxStep,
63
+ onMinChange,
64
+ onMaxChange,
65
+ variant = "default",
66
+ headline = "Parameter range",
67
+ subline = "Narrow the matrix to comparable model sizes.",
68
+ onReset,
69
+ showUnknownSize,
70
+ onShowUnknownSizeChange,
71
+ className,
72
+ }: ParamRangePickerProps) {
73
+ const minId = useId()
74
+ const maxId = useId()
75
+
76
+ const isInline = variant === "inline"
77
+ const isPromo = variant === "promo"
78
+
79
+ const minPercent = (minStep / PARAM_RANGE_MAX_INDEX) * 100
80
+ const maxPercent = (maxStep / PARAM_RANGE_MAX_INDEX) * 100
81
+ const isConstrained = minStep > 0 || maxStep < PARAM_RANGE_MAX_INDEX
82
+
83
+ const track = (
84
+ <div className="pr-track-wrap">
85
+ <div className="pr-ticks" aria-hidden>
86
+ {PARAM_RANGE_MARKERS.map((marker, idx) => {
87
+ const isFirst = idx === 0
88
+ const isLast = idx === PARAM_RANGE_MARKERS.length - 1
89
+ const active = marker.step === minStep || marker.step === maxStep
90
+ return (
91
+ <div
92
+ key={marker.label}
93
+ className={`pr-tick${active ? " on" : ""}`}
94
+ style={{
95
+ left: `${(marker.step / PARAM_RANGE_MAX_INDEX) * 100}%`,
96
+ }}
97
+ >
98
+ <span
99
+ style={{
100
+ transform: isFirst
101
+ ? "translateX(0)"
102
+ : isLast
103
+ ? "translateX(-100%)"
104
+ : "translateX(-50%)",
105
+ marginLeft: 0,
106
+ }}
107
+ >
108
+ {marker.label}
109
+ </span>
110
+ </div>
111
+ )
112
+ })}
113
+ </div>
114
+
115
+ <div className="pr-rail" />
116
+ <div
117
+ className="pr-fill"
118
+ style={{
119
+ left: `${minPercent}%`,
120
+ width: `${Math.max(maxPercent - minPercent, 0)}%`,
121
+ }}
122
+ />
123
+
124
+ {/* Hidden inter-bucket micro-ticks to give the rail a metered feel */}
125
+ <div className="pr-microticks" aria-hidden>
126
+ {PARAM_RANGE_VALUES.map((_, stepIndex) => (
127
+ <span
128
+ key={`pr-micro-${stepIndex}`}
129
+ style={{ left: `${(stepIndex / PARAM_RANGE_MAX_INDEX) * 100}%` }}
130
+ />
131
+ ))}
132
+ </div>
133
+
134
+ {/* Native inputs provide a11y + drag; we hide them visually and rely
135
+ on the .param-range-input thumb styling for the visible handles. */}
136
+ <input
137
+ id={minId}
138
+ type="range"
139
+ min={0}
140
+ max={PARAM_RANGE_MAX_INDEX}
141
+ step={1}
142
+ value={minStep}
143
+ onChange={(event) => {
144
+ const next = Number(event.target.value)
145
+ onMinChange(Math.min(next, maxStep))
146
+ }}
147
+ className="param-range-input"
148
+ aria-label={`Minimum ${headline.toLowerCase()}`}
149
+ />
150
+ <input
151
+ id={maxId}
152
+ type="range"
153
+ min={0}
154
+ max={PARAM_RANGE_MAX_INDEX}
155
+ step={1}
156
+ value={maxStep}
157
+ onChange={(event) => {
158
+ const next = Number(event.target.value)
159
+ onMaxChange(Math.max(next, minStep))
160
+ }}
161
+ className="param-range-input"
162
+ aria-label={`Maximum ${headline.toLowerCase()}`}
163
+ />
164
+ </div>
165
+ )
166
+
167
+ const resetBtn = onReset && isConstrained ? (
168
+ <button
169
+ type="button"
170
+ onClick={onReset}
171
+ className="pr-reset"
172
+ aria-label="Reset parameter range"
173
+ >
174
+ Reset
175
+ </button>
176
+ ) : null
177
+
178
+ const unknownToggle =
179
+ onShowUnknownSizeChange != null ? (
180
+ <button
181
+ type="button"
182
+ onClick={() => onShowUnknownSizeChange(!showUnknownSize)}
183
+ className={`pr-unknown-toggle${showUnknownSize ? " on" : ""}`}
184
+ aria-pressed={Boolean(showUnknownSize)}
185
+ title="Models without a reported parameter count"
186
+ >
187
+ <span className="pr-unknown-toggle-box" aria-hidden>
188
+ {showUnknownSize ? "✓" : ""}
189
+ </span>
190
+ Unknown size
191
+ </button>
192
+ ) : null
193
+
194
+ const readout = (
195
+ <div className="pr-readout-cell">
196
+ <div className={`pr-readout${isInline ? " inline" : ""}`}>
197
+ <span>{formatParamBoundLabel(minStep, "min")}</span>
198
+ <span className="arrow">{isInline ? "–" : "→"}</span>
199
+ <span>{formatParamBoundLabel(maxStep, "max")}</span>
200
+ </div>
201
+ {unknownToggle}
202
+ {resetBtn}
203
+ </div>
204
+ )
205
+
206
+ if (isInline) {
207
+ return (
208
+ <div className={`pr-slider inline${className ? ` ${className}` : ""}`}>
209
+ <span className="pr-label inline-label">
210
+ <strong>{headline}</strong>
211
+ </span>
212
+ {track}
213
+ {readout}
214
+ </div>
215
+ )
216
+ }
217
+
218
+ if (isPromo) {
219
+ return (
220
+ <div className={`pr-promo${className ? ` ${className}` : ""}`}>
221
+ <div className="pr-promo-head">
222
+ <span className="kicker">{headline}</span>
223
+ <p>{subline}</p>
224
+ </div>
225
+ <div className="pr-slider pr-slider-track-only">
226
+ {track}
227
+ {readout}
228
+ </div>
229
+ </div>
230
+ )
231
+ }
232
+
233
+ // Default (Variant A)
234
+ return (
235
+ <div className={`pr-slider${className ? ` ${className}` : ""}`}>
236
+ <div className="pr-label">
237
+ <strong>{headline}</strong>
238
+ {subline}
239
+ </div>
240
+ {track}
241
+ {readout}
242
+ </div>
243
+ )
244
+ }
components/signals/benchmark-signals-strip.tsx ADDED
@@ -0,0 +1,626 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ "use client"
2
+
3
+ import type { BenchmarkEvalSummary } from "@/lib/eval-processing"
4
+ import type { ModelResultForBenchmark } from "@/lib/eval-processing"
5
+
6
+ type SignalId = "reproducibility" | "completeness" | "provenance" | "comparability"
7
+
8
+ const SIGNAL_GLYPHS: Record<SignalId, string> = {
9
+ reproducibility: "R",
10
+ completeness: "C",
11
+ provenance: "P",
12
+ comparability: "X",
13
+ }
14
+
15
+ const SIGNAL_NAMES: Record<SignalId, string> = {
16
+ reproducibility: "Reproducibility",
17
+ completeness: "Completeness",
18
+ provenance: "Provenance",
19
+ comparability: "Comparability",
20
+ }
21
+
22
+ const SIGNAL_ASKS: Record<SignalId, string> = {
23
+ reproducibility: "Could someone re-run this benchmark with what's documented?",
24
+ completeness: "How much of the benchmark card is filled in?",
25
+ provenance: "Who reported these scores and how many parties have replicated?",
26
+ comparability: "Where multiple reports exist, do they agree?",
27
+ }
28
+
29
+ /**
30
+ * Reproducibility — paper §4.2.1, signal spec §3.
31
+ *
32
+ * The spec lists `temperature, top_p, max_tokens, prompt_template` as the
33
+ * base required fields. In the live EEE corpus only `temperature` and
34
+ * `max_tokens` are reliably populated, so we restrict the check to those
35
+ * two for now (per maintainer guidance). Agentic benchmarks additionally
36
+ * require `eval_plan` and `eval_limits` — the spec's classification rule
37
+ * is followed verbatim.
38
+ */
39
+ const BASE_REQUIRED_FIELDS = ["temperature", "max_tokens"] as const
40
+ const AGENTIC_REQUIRED_FIELDS = ["eval_plan", "eval_limits"] as const
41
+
42
+ const FIELD_LABELS: Record<string, string> = {
43
+ temperature: "temperature",
44
+ top_p: "top-p",
45
+ max_tokens: "max tokens",
46
+ prompt_template: "prompt template",
47
+ eval_plan: "eval plan",
48
+ eval_limits: "eval limits",
49
+ }
50
+
51
+ /** Setup fields compared to detect variant divergence (spec §6.1.2). */
52
+ const COMPARABILITY_COMPARE_FIELDS = [
53
+ "temperature",
54
+ "top_p",
55
+ "top_k",
56
+ "max_tokens",
57
+ "prompt_template",
58
+ "reasoning",
59
+ ] as const
60
+
61
+ /**
62
+ * Benchmark-level rollup of the four interpretive signals (paper §4.2.1,
63
+ * spec v1.0 §§3-6). Mirrors `CorpusSignalsStrip` but operates over a
64
+ * single `BenchmarkEvalSummary`.
65
+ *
66
+ * Each tile reports one headline statistic that reads "higher is better,
67
+ * more documentation = better", so the four are visually comparable.
68
+ */
69
+ export function BenchmarkSignalsStrip({ summary }: { summary: BenchmarkEvalSummary }) {
70
+ const repro = deriveReproducibility(summary)
71
+ const comp = deriveCompleteness(summary)
72
+ const prov = deriveProvenance(summary)
73
+ const cmp = deriveComparability(summary)
74
+
75
+ return (
76
+ <div
77
+ className="grid gap-x-6 gap-y-3"
78
+ style={{
79
+ gridTemplateColumns: "repeat(auto-fit, minmax(220px, 1fr))",
80
+ border: "1px solid var(--border-soft)",
81
+ background: "var(--bg)",
82
+ padding: "12px 16px",
83
+ }}
84
+ >
85
+ <SignalRow id="reproducibility" {...repro} />
86
+ <SignalRow id="completeness" {...comp} />
87
+ <SignalRow id="provenance" {...prov} />
88
+ <SignalRow id="comparability" {...cmp} />
89
+ </div>
90
+ )
91
+ }
92
+
93
+ interface DerivedSignal {
94
+ statValue: string
95
+ statUnit: string
96
+ headline: string
97
+ detail: string
98
+ }
99
+
100
+ // ──────────────────────────────────────────────────────────────────────────
101
+ // Reproducibility (spec §3)
102
+ // ──────────────────────────────────────────────────────────────────────────
103
+
104
+ function isAgenticBenchmark(summary: BenchmarkEvalSummary): boolean {
105
+ const tasks = summary.benchmark_card?.purpose_and_intended_users?.tasks
106
+ if (Array.isArray(tasks)) {
107
+ const set = new Set(tasks.map((t) => String(t).toLowerCase()))
108
+ if (set.has("agentic") || set.has("tool_use") || set.has("multi_step_agent")) return true
109
+ }
110
+ for (const r of summary.model_results ?? []) {
111
+ const args = getGenerationArgs(r)
112
+ if (args && args.agentic_eval_config != null) return true
113
+ }
114
+ return false
115
+ }
116
+
117
+ function getGenerationArgs(result: ModelResultForBenchmark): Record<string, unknown> | null {
118
+ const gc = (result.result as { generation_config?: { generation_args?: Record<string, unknown> } } | undefined)
119
+ ?.generation_config
120
+ if (!gc) return null
121
+ const args = gc.generation_args
122
+ return args && typeof args === "object" ? args : null
123
+ }
124
+
125
+ function deriveReproducibility(summary: BenchmarkEvalSummary): DerivedSignal {
126
+ const triples = summary.model_results ?? []
127
+ const agentic = isAgenticBenchmark(summary)
128
+ const required: string[] = agentic
129
+ ? [...BASE_REQUIRED_FIELDS, ...AGENTIC_REQUIRED_FIELDS]
130
+ : [...BASE_REQUIRED_FIELDS]
131
+
132
+ if (triples.length === 0) {
133
+ return {
134
+ statValue: "—",
135
+ statUnit: "",
136
+ headline: "Reproducibility doesn't apply — no reported scores.",
137
+ detail: "",
138
+ }
139
+ }
140
+
141
+ const fieldMissing = new Map<string, number>(required.map((f) => [f, 0]))
142
+ let triplesWithoutGap = 0
143
+
144
+ for (const triple of triples) {
145
+ const args = getGenerationArgs(triple) ?? {}
146
+ let allPresent = true
147
+ for (const f of required) {
148
+ if (!isPopulated(args[f])) {
149
+ fieldMissing.set(f, (fieldMissing.get(f) ?? 0) + 1)
150
+ allPresent = false
151
+ }
152
+ }
153
+ if (allPresent) triplesWithoutGap++
154
+ }
155
+
156
+ const total = triples.length
157
+ const score = triplesWithoutGap / total
158
+
159
+ const topMissing = Array.from(fieldMissing.entries())
160
+ .filter(([, n]) => n > 0)
161
+ .sort((a, b) => b[1] - a[1])
162
+ .slice(0, 2)
163
+ .map(([f, n]) => `${FIELD_LABELS[f] ?? f} (${formatPct(n / total)})`)
164
+ .join(", ")
165
+
166
+ const headline =
167
+ score === 1
168
+ ? "Every reported score has a complete generation config."
169
+ : score === 0
170
+ ? "No reported score has all required setup fields."
171
+ : `${triplesWithoutGap} of ${total} triples document the full setup.`
172
+
173
+ const detail = topMissing
174
+ ? `Most often missing: ${topMissing}.`
175
+ : `Required: ${required.map((f) => FIELD_LABELS[f] ?? f).join(", ")}.`
176
+
177
+ return { statValue: pctNum(score), statUnit: "%", headline, detail }
178
+ }
179
+
180
+ // ──────────────────────────────────────────────────────────────────────────
181
+ // Completeness (spec §4)
182
+ // ──────────────────────────────────────────────────────────────────────────
183
+
184
+ interface CompletenessField {
185
+ path: string
186
+ label: string
187
+ coverage: "full" | "partial" | "reserved"
188
+ /** For partial: list of sub-item names whose presence is checked. */
189
+ subitems?: readonly string[]
190
+ }
191
+
192
+ const COMPLETENESS_FIELD_SET: readonly CompletenessField[] = [
193
+ { path: "benchmark_details.overview", label: "overview", coverage: "full" },
194
+ { path: "benchmark_details.data_type", label: "data type", coverage: "full" },
195
+ {
196
+ path: "benchmark_details",
197
+ label: "domains / languages / resources",
198
+ coverage: "partial",
199
+ subitems: ["domains", "languages", "resources"],
200
+ },
201
+ {
202
+ path: "purpose_and_intended_users",
203
+ label: "purpose",
204
+ coverage: "partial",
205
+ subitems: ["goal", "audience", "tasks", "limitations"],
206
+ },
207
+ {
208
+ path: "data",
209
+ label: "data",
210
+ coverage: "partial",
211
+ subitems: ["source", "size", "format", "annotation"],
212
+ },
213
+ {
214
+ path: "methodology",
215
+ label: "methodology",
216
+ coverage: "partial",
217
+ subitems: ["methods", "metrics", "calculation", "interpretation", "baseline_results", "validation"],
218
+ },
219
+ {
220
+ path: "ethical_and_legal_considerations",
221
+ label: "ethical & legal",
222
+ coverage: "partial",
223
+ subitems: ["privacy_and_anonymity", "data_licensing", "consent_procedures", "compliance_with_regulations"],
224
+ },
225
+ // Reserved — counted in the denominator even when unset (spec §4.2).
226
+ { path: "evalcards.lifecycle_status", label: "lifecycle status", coverage: "reserved" },
227
+ ] as const
228
+
229
+ function deriveCompleteness(summary: BenchmarkEvalSummary): DerivedSignal {
230
+ const card = summary.benchmark_card
231
+
232
+ const fieldScores: { path: string; label: string; coverage: CompletenessField["coverage"]; score: number }[] = []
233
+
234
+ for (const field of COMPLETENESS_FIELD_SET) {
235
+ let score = 0
236
+ if (field.coverage === "reserved") {
237
+ // The eval-summary payload doesn't currently carry an
238
+ // evalcards.lifecycle_status section, so this scores 0 for now.
239
+ // It still occupies a denominator slot per spec.
240
+ score = 0
241
+ } else if (field.coverage === "full") {
242
+ const value = card ? readCardPath(card, field.path) : undefined
243
+ score = isPopulated(value) ? 1 : 0
244
+ } else {
245
+ // partial
246
+ const parent = card ? (readCardPath(card, field.path) as Record<string, unknown> | undefined) : undefined
247
+ const subs = field.subitems ?? []
248
+ if (!parent || subs.length === 0) {
249
+ score = 0
250
+ } else {
251
+ let populated = 0
252
+ for (const key of subs) if (isPopulated(parent[key])) populated++
253
+ score = populated / subs.length
254
+ }
255
+ }
256
+ fieldScores.push({ path: field.path, label: field.label, coverage: field.coverage, score })
257
+ }
258
+
259
+ const total = fieldScores.length
260
+ const sumScore = fieldScores.reduce((acc, f) => acc + f.score, 0)
261
+ const completeness = total > 0 ? sumScore / total : null
262
+
263
+ const populatedCount = fieldScores.reduce((acc, f) => acc + (f.score === 1 ? 1 : 0), 0)
264
+ const partialCount = fieldScores.filter((f) => f.score > 0 && f.score < 1).length
265
+ const missingCount = fieldScores.filter((f) => f.score === 0).length
266
+
267
+ const topMissing = fieldScores
268
+ .filter((f) => f.score === 0 && f.coverage !== "reserved")
269
+ .slice(0, 2)
270
+ .map((f) => f.label)
271
+ .join(", ")
272
+
273
+ const headline = !card
274
+ ? "No benchmark card has been authored yet."
275
+ : completeness === 1
276
+ ? "Every documented field is populated."
277
+ : completeness != null && completeness >= 0.6
278
+ ? "Most documented fields are populated."
279
+ : "Several documented fields are still empty."
280
+
281
+ const detail = !card
282
+ ? "Reading context will lean on whatever the leaderboard JSON provides."
283
+ : `${populatedCount} full · ${partialCount} partial · ${missingCount} missing of ${total}${
284
+ topMissing ? ` · gaps: ${topMissing}` : ""
285
+ }`
286
+
287
+ return { statValue: pctNum(completeness), statUnit: "%", headline, detail }
288
+ }
289
+
290
+ function readCardPath(card: unknown, path: string): unknown {
291
+ if (!card || typeof card !== "object") return undefined
292
+ let cur: unknown = card
293
+ for (const segment of path.split(".")) {
294
+ if (cur == null || typeof cur !== "object") return undefined
295
+ cur = (cur as Record<string, unknown>)[segment]
296
+ }
297
+ return cur
298
+ }
299
+
300
+ // ──────────────────────────────────────────────────────────────────────────
301
+ // Provenance (spec §5)
302
+ // ──────────────────────────────────────────────────────────────────────────
303
+
304
+ type ProvenanceSourceType = "first_party" | "third_party" | "collaborative" | "unspecified"
305
+
306
+ function readSourceType(result: ModelResultForBenchmark): ProvenanceSourceType {
307
+ const sm = result.source_metadata as { evaluator_relationship?: string } | undefined
308
+ const rel = sm?.evaluator_relationship
309
+ if (rel === "first_party" || rel === "third_party" || rel === "collaborative") return rel
310
+ return "unspecified"
311
+ }
312
+
313
+ function readSourceOrg(result: ModelResultForBenchmark): string | null {
314
+ const sm = result.source_metadata as { source_organization_name?: string } | undefined
315
+ const org = sm?.source_organization_name
316
+ if (typeof org === "string" && org.trim().length > 0) return org.trim()
317
+ return null
318
+ }
319
+
320
+ function metricKeyForResult(result: ModelResultForBenchmark): string {
321
+ const r = result.result as { metric_summary_id?: string; metric_key?: string; evaluation_name?: string } | undefined
322
+ return r?.metric_summary_id ?? r?.metric_key ?? r?.evaluation_name ?? ""
323
+ }
324
+
325
+ function modelKeyForResult(result: ModelResultForBenchmark): string {
326
+ return result.model_info?.id ?? result.model_info?.name ?? ""
327
+ }
328
+
329
+ function deriveProvenance(summary: BenchmarkEvalSummary): DerivedSignal {
330
+ const triples = summary.model_results ?? []
331
+ if (triples.length === 0) {
332
+ return {
333
+ statValue: "—",
334
+ statUnit: "",
335
+ headline: "No reported scores yet.",
336
+ detail: "",
337
+ }
338
+ }
339
+
340
+ const counts: Record<ProvenanceSourceType, number> = {
341
+ first_party: 0,
342
+ third_party: 0,
343
+ collaborative: 0,
344
+ unspecified: 0,
345
+ }
346
+ const distinctOrgs = new Set<string>()
347
+ const orgsByGroup = new Map<string, Set<string>>()
348
+
349
+ for (const t of triples) {
350
+ counts[readSourceType(t)]++
351
+ const org = readSourceOrg(t)
352
+ if (org) distinctOrgs.add(org)
353
+ const groupKey = `${modelKeyForResult(t)}::${metricKeyForResult(t)}`
354
+ if (org) {
355
+ const existing = orgsByGroup.get(groupKey)
356
+ if (existing) existing.add(org)
357
+ else orgsByGroup.set(groupKey, new Set([org]))
358
+ }
359
+ }
360
+
361
+ const total = triples.length
362
+ const attributed = total - counts.unspecified
363
+ const score = attributed / total
364
+
365
+ const multiSourceGroups = Array.from(orgsByGroup.values()).filter((s) => s.size > 1).length
366
+ const eligibleGroups = orgsByGroup.size
367
+ const multiRate = eligibleGroups > 0 ? multiSourceGroups / eligibleGroups : null
368
+
369
+ const headline =
370
+ counts.unspecified === total
371
+ ? "No triple carries an attribution."
372
+ : multiSourceGroups > 0
373
+ ? `${multiSourceGroups} of ${eligibleGroups} (model, metric) groups have reports from more than one party.`
374
+ : `Single-source benchmark: ${distinctOrgs.size} reporting org${distinctOrgs.size === 1 ? "" : "s"}.`
375
+
376
+ const dist: string[] = []
377
+ if (counts.first_party > 0) dist.push(`${formatPct(counts.first_party / total)} first-party`)
378
+ if (counts.third_party > 0) dist.push(`${formatPct(counts.third_party / total)} third-party`)
379
+ if (counts.collaborative > 0) dist.push(`${formatPct(counts.collaborative / total)} collaborative`)
380
+ if (counts.unspecified > 0) dist.push(`${formatPct(counts.unspecified / total)} unspecified`)
381
+
382
+ const detailBits = [dist.join(" · ")]
383
+ if (multiRate != null) detailBits.push(`${formatPct(multiRate)} multi-source`)
384
+
385
+ return { statValue: pctNum(score), statUnit: "%", headline, detail: detailBits.join(" · ") }
386
+ }
387
+
388
+ // ──────────────────────────────────────────────────────────────────────────
389
+ // Comparability (spec §6)
390
+ // ──────────────────────────────────────────────────────────────────────────
391
+
392
+ function computeThreshold(metricConfig: BenchmarkEvalSummary["metric_config"]): number {
393
+ if (!metricConfig) return 0.05
394
+ const unit = (metricConfig as { unit?: string; metric_unit?: string }).unit
395
+ ?? (metricConfig as { metric_unit?: string }).metric_unit
396
+ const scoreType = (metricConfig as { score_type?: string }).score_type
397
+ if (unit === "proportion" || scoreType === "continuous_normalized") return 0.05
398
+ if (unit === "percent") return 5.0
399
+ const min = metricConfig.min_score
400
+ const max = metricConfig.max_score
401
+ if (typeof min === "number" && typeof max === "number" && max > min) return 0.05 * (max - min)
402
+ return 0.05
403
+ }
404
+
405
+ function median(values: number[]): number {
406
+ if (values.length === 0) return Number.NaN
407
+ const sorted = [...values].sort((a, b) => a - b)
408
+ const mid = Math.floor(sorted.length / 2)
409
+ return sorted.length % 2 === 1 ? sorted[mid] : (sorted[mid - 1] + sorted[mid]) / 2
410
+ }
411
+
412
+ function deriveComparability(summary: BenchmarkEvalSummary): DerivedSignal {
413
+ const triples = summary.model_results ?? []
414
+ if (triples.length === 0) {
415
+ return { statValue: "—", statUnit: "", headline: "No reported scores yet.", detail: "" }
416
+ }
417
+
418
+ const threshold = computeThreshold(summary.metric_config)
419
+
420
+ // Group triples by (model_id, metric_path).
421
+ const groups = new Map<
422
+ string,
423
+ Array<{ score: number; args: Record<string, unknown>; org: string | null }>
424
+ >()
425
+ for (const t of triples) {
426
+ const score = t.score_details?.score
427
+ if (typeof score !== "number" || !Number.isFinite(score)) continue
428
+ const key = `${modelKeyForResult(t)}::${metricKeyForResult(t)}`
429
+ const args = getGenerationArgs(t) ?? {}
430
+ const entry = { score, args, org: readSourceOrg(t) }
431
+ const list = groups.get(key)
432
+ if (list) list.push(entry)
433
+ else groups.set(key, [entry])
434
+ }
435
+
436
+ let variantEligible = 0
437
+ let variantDivergent = 0
438
+ let crossPartyEligible = 0
439
+ let crossPartyDivergent = 0
440
+
441
+ for (const list of groups.values()) {
442
+ if (list.length < 2) continue
443
+
444
+ // Variant divergence — same group, different setups (spec §6.1).
445
+ const setupValueSets = new Map<string, Set<string>>()
446
+ for (const entry of list) {
447
+ for (const f of COMPARABILITY_COMPARE_FIELDS) {
448
+ const valKey = JSON.stringify(entry.args[f] ?? null)
449
+ let set = setupValueSets.get(f)
450
+ if (!set) {
451
+ set = new Set()
452
+ setupValueSets.set(f, set)
453
+ }
454
+ set.add(valKey)
455
+ }
456
+ }
457
+ const setupsDiffer = Array.from(setupValueSets.values()).some((s) => s.size > 1)
458
+ if (setupsDiffer) {
459
+ variantEligible++
460
+ const scores = list.map((e) => e.score)
461
+ const divergence = Math.max(...scores) - Math.min(...scores)
462
+ if (divergence > threshold) variantDivergent++
463
+ }
464
+
465
+ // Cross-party divergence — same group, different orgs (spec §6.2).
466
+ const byOrg = new Map<string, number[]>()
467
+ for (const entry of list) {
468
+ if (!entry.org) continue
469
+ const arr = byOrg.get(entry.org)
470
+ if (arr) arr.push(entry.score)
471
+ else byOrg.set(entry.org, [entry.score])
472
+ }
473
+ if (byOrg.size >= 2) {
474
+ crossPartyEligible++
475
+ const orgScores = Array.from(byOrg.values()).map((s) => median(s))
476
+ const divergence = Math.max(...orgScores) - Math.min(...orgScores)
477
+ if (divergence > threshold) crossPartyDivergent++
478
+ }
479
+ }
480
+
481
+ const totalEligible = variantEligible + crossPartyEligible
482
+ if (totalEligible === 0) {
483
+ return {
484
+ statValue: "—",
485
+ statUnit: "",
486
+ headline: "Not enough overlapping reports to compare.",
487
+ detail: `${groups.size} (model, metric) groups · 0 multi-report`,
488
+ }
489
+ }
490
+
491
+ const totalDivergent = variantDivergent + crossPartyDivergent
492
+ const agreementRate = (totalEligible - totalDivergent) / totalEligible
493
+
494
+ const detailBits: string[] = []
495
+ if (variantEligible > 0) {
496
+ detailBits.push(
497
+ `variant ${variantEligible - variantDivergent}/${variantEligible} agree`,
498
+ )
499
+ }
500
+ if (crossPartyEligible > 0) {
501
+ detailBits.push(
502
+ `cross-party ${crossPartyEligible - crossPartyDivergent}/${crossPartyEligible} agree`,
503
+ )
504
+ }
505
+ detailBits.push(`threshold ±${formatNumber(threshold)}`)
506
+
507
+ const headline =
508
+ totalDivergent === 0
509
+ ? "Reports that are directly comparable agree within threshold."
510
+ : totalDivergent === totalEligible
511
+ ? "Every comparable report disagrees beyond threshold."
512
+ : `${totalEligible - totalDivergent} of ${totalEligible} comparable reports agree.`
513
+
514
+ return {
515
+ statValue: pctNum(agreementRate),
516
+ statUnit: "%",
517
+ headline,
518
+ detail: detailBits.join(" · "),
519
+ }
520
+ }
521
+
522
+ // ──────────────────────────────────────────────────────────────────────────
523
+ // Helpers
524
+ // ──────────────────────────────────────────────────────────────────────────
525
+
526
+ function isPopulated(value: unknown): boolean {
527
+ if (value == null) return false
528
+ if (typeof value === "string") return value.trim().length > 0
529
+ if (Array.isArray(value)) return value.length > 0
530
+ if (typeof value === "object") return Object.keys(value as Record<string, unknown>).length > 0
531
+ return Boolean(value)
532
+ }
533
+
534
+ function pctNum(value: number | null | undefined): string {
535
+ if (value == null || !Number.isFinite(value)) return "—"
536
+ if (value <= 0) return "0"
537
+ if (value < 0.01) return "<1"
538
+ return `${Math.round(value * 100)}`
539
+ }
540
+
541
+ function formatPct(value: number | null | undefined): string {
542
+ if (value == null || !Number.isFinite(value)) return "—"
543
+ if (value === 0) return "0%"
544
+ if (value < 0.01) return "<1%"
545
+ return `${Math.round(value * 100)}%`
546
+ }
547
+
548
+ function formatNumber(value: number): string {
549
+ if (!Number.isFinite(value)) return "—"
550
+ if (value >= 100) return value.toFixed(0)
551
+ if (value >= 1) return value.toFixed(2)
552
+ return value.toFixed(3).replace(/0+$/g, "").replace(/\.$/, "")
553
+ }
554
+
555
+ /**
556
+ * Compact one-row layout per signal — meant to drop in alongside the Card
557
+ * Quality Notes box, not dominate the page like the corpus dashboard's
558
+ * full tile grid. Glyph + name + percent live on one line; one short
559
+ * sentence summarising the score lives below. The "Asks" prompt is moved
560
+ * to the title attribute so it stays discoverable on hover but doesn't
561
+ * eat vertical space.
562
+ */
563
+ function SignalRow({
564
+ id,
565
+ statValue,
566
+ statUnit,
567
+ headline,
568
+ detail,
569
+ }: {
570
+ id: SignalId
571
+ } & DerivedSignal) {
572
+ return (
573
+ <div className="min-w-0" title={SIGNAL_ASKS[id]}>
574
+ <div className="flex items-center gap-2">
575
+ <span
576
+ className={`sig-glyph sig-${id}`}
577
+ style={{ width: 22, height: 22, fontSize: "0.7rem", flexShrink: 0 }}
578
+ >
579
+ <span>{SIGNAL_GLYPHS[id]}</span>
580
+ </span>
581
+ <span
582
+ className="font-mono uppercase"
583
+ style={{
584
+ fontSize: 10,
585
+ letterSpacing: "0.14em",
586
+ color: "var(--fg-muted)",
587
+ flexShrink: 0,
588
+ }}
589
+ >
590
+ {SIGNAL_NAMES[id]}
591
+ </span>
592
+ <span
593
+ className="ml-auto font-mono tabular-nums"
594
+ style={{ fontSize: 16, fontWeight: 600, color: "var(--fg)" }}
595
+ >
596
+ {statValue}
597
+ {statUnit && (
598
+ <span style={{ fontSize: 10, color: "var(--fg-subtle)", marginLeft: 2 }}>
599
+ {statUnit}
600
+ </span>
601
+ )}
602
+ </span>
603
+ </div>
604
+ <div
605
+ className="mt-1"
606
+ style={{
607
+ fontSize: 11,
608
+ lineHeight: 1.4,
609
+ color: "var(--fg-muted)",
610
+ display: "-webkit-box",
611
+ WebkitLineClamp: 2,
612
+ WebkitBoxOrient: "vertical",
613
+ overflow: "hidden",
614
+ }}
615
+ >
616
+ {headline}
617
+ {detail && (
618
+ <span style={{ color: "var(--fg-subtle)" }}>
619
+ {" · "}
620
+ {detail}
621
+ </span>
622
+ )}
623
+ </div>
624
+ </div>
625
+ )
626
+ }
components/signals/corpus-dashboard.tsx CHANGED
@@ -20,7 +20,7 @@ import {
20
  formatPercent,
21
  } from "./signal-utils"
22
 
23
- const CATEGORY_ORDER = ["agentic", "general", "knowledge", "reasoning", "safety", "other"]
24
 
25
  const SOURCE_COLORS: Record<string, string> = {
26
  first_party: "bg-amber-500",
@@ -51,13 +51,21 @@ export function CorpusDashboard({
51
  }, [mode])
52
 
53
  const categoryKeys = useMemo(
54
- () =>
55
- CATEGORY_ORDER.filter((category) =>
56
- aggregates.reproducibility.by_category[category] ||
57
- aggregates.completeness.by_category[category] ||
58
- aggregates.provenance.by_category[category] ||
59
- aggregates.comparability.by_category[category]
60
- ),
 
 
 
 
 
 
 
 
61
  [aggregates]
62
  )
63
 
@@ -190,25 +198,14 @@ function CompletenessSection({
190
  icon={<ClipboardCheck className="h-5 w-5" />}
191
  title="Reporting Completeness"
192
  subtitle="How much benchmark documentation is populated."
193
- headline={formatPercent(block.completeness_score_mean)}
194
- headlineLabel={`Median ${formatPercent(block.completeness_score_median)} across ${block.total_benchmarks.toLocaleString()} benchmarks`}
195
  >
196
  {scores.length > 0 && <Histogram scores={scores} />}
197
- <div className="mt-4 grid gap-2">
198
- {Object.entries(block.per_field_population).slice(0, 10).map(([field, value]) => (
199
- <div key={field} className="rounded-xl border border-border/60 bg-background px-3 py-2">
200
- <div className="flex items-start justify-between gap-3 text-sm">
201
- <span className="font-medium">{formatFieldLabel(field)}</span>
202
- <span className="shrink-0 tabular-nums text-muted-foreground">
203
- {formatPercent(value.mean_score)}
204
- </span>
205
- </div>
206
- <div className="mt-2 grid gap-1.5">
207
- <MetricBar label="Any data" value={value.populated_rate} compact />
208
- <MetricBar label="Fully populated" value={value.fully_populated_rate} compact />
209
- </div>
210
- </div>
211
- ))}
212
  </div>
213
  </DashboardSection>
214
  )
@@ -217,14 +214,16 @@ function CompletenessSection({
217
  function ProvenanceSection({ block }: { block: ProvenanceCorpusBlock }) {
218
  const distribution = block.source_type_distribution
219
  const total = Object.values(distribution).reduce((sum, value) => sum + value, 0)
 
 
220
 
221
  return (
222
  <DashboardSection
223
  icon={<BarChart3 className="h-5 w-5" />}
224
  title="Provenance"
225
  subtitle="Who reported the scores, and whether groups have multiple sources."
226
- headline={formatPercent(block.multi_source_rate)}
227
- headlineLabel="of (model, benchmark, metric) groups have multiple reporting sources"
228
  >
229
  <div className="overflow-hidden rounded-full border border-border/70 bg-muted/30">
230
  <div className="flex h-4 w-full">
@@ -240,34 +239,40 @@ function ProvenanceSection({ block }: { block: ProvenanceCorpusBlock }) {
240
  </div>
241
 
242
  <div className="mt-3 grid gap-2 sm:grid-cols-2">
243
- <RatioTile label="Multi-source groups" value={block.multi_source_rate} count={block.multi_source_groups} />
244
- <RatioTile label="First-party only groups" value={block.first_party_only_rate} count={block.first_party_only_groups} />
245
  </div>
246
  </DashboardSection>
247
  )
248
  }
249
 
250
  function ComparabilitySection({ block }: { block: ComparabilityCorpusBlock }) {
 
 
 
 
 
 
251
  return (
252
  <DashboardSection
253
  icon={<GitCompareArrows className="h-5 w-5" />}
254
  title="Comparability"
255
  subtitle="Eligible groups where scores diverge across setups or reporting organizations."
256
- headline={formatNullableRate(block.variant_divergence_rate)}
257
- headlineLabel={`${block.variant_divergent_groups.toLocaleString()} of ${block.variant_eligible_groups.toLocaleString()} setup-eligible groups diverge`}
258
  >
259
  <div className="grid gap-3 md:grid-cols-2">
260
  <ComparabilityRateCard
261
  title="Variant divergence"
262
- rate={block.variant_divergence_rate}
263
- eligible={block.variant_eligible_groups}
264
- divergent={block.variant_divergent_groups}
265
  />
266
  <ComparabilityRateCard
267
  title="Cross-party divergence"
268
- rate={block.cross_party_divergence_rate}
269
- eligible={block.cross_party_eligible_groups}
270
- divergent={block.cross_party_divergent_groups}
271
  />
272
  </div>
273
  </DashboardSection>
@@ -288,6 +293,15 @@ function CategoryPanel({
288
  comparability?: ComparabilityCorpusBlock
289
  }) {
290
  const categoryLabel = `${category.charAt(0).toUpperCase()}${category.slice(1)}`
 
 
 
 
 
 
 
 
 
291
 
292
  return (
293
  <section className="rounded-2xl border border-border/70 bg-card p-4 shadow-sm">
@@ -297,11 +311,11 @@ function CategoryPanel({
297
  </div>
298
  <div className="grid gap-3 sm:grid-cols-2">
299
  <MiniMetric label="Reproducibility gaps" value={formatPercent(reproducibility?.reproducibility_gap_rate)} />
300
- <MiniMetric label="Documentation mean" value={formatPercent(completeness?.completeness_score_mean)} />
301
- <MiniMetric label="Multi-source groups" value={formatPercent(provenance?.multi_source_rate)} />
302
- <MiniMetric label="Variant divergence" value={formatNullableRate(comparability?.variant_divergence_rate)} />
303
  </div>
304
- {comparability?.cross_party_divergence_rate == null && (
305
  <div className="mt-3 rounded-xl border border-dashed border-border/70 bg-muted/10 px-3 py-2 text-sm text-muted-foreground">
306
  Cross-party divergence: N/A - not enough multi-org coverage.
307
  </div>
@@ -411,7 +425,7 @@ function RatioTile({ label, value, count }: { label: string; value: number | nul
411
  <div className="text-sm font-medium">{label}</div>
412
  <div className="mt-1 flex items-baseline justify-between gap-2">
413
  <span className="text-xl font-semibold tabular-nums">{formatPercent(value)}</span>
414
- <span className="text-xs text-muted-foreground">{count.toLocaleString()} groups</span>
415
  </div>
416
  </div>
417
  )
@@ -463,6 +477,11 @@ function formatNullableRate(value: number | null | undefined) {
463
  return value == null ? "N/A" : formatPercent(value)
464
  }
465
 
 
 
 
 
 
466
  function formatGeneratedDate(value: string) {
467
  const date = new Date(value)
468
  if (Number.isNaN(date.getTime())) {
 
20
  formatPercent,
21
  } from "./signal-utils"
22
 
23
+ const CATEGORY_ORDER = ["Agentic", "General", "Knowledge", "Reasoning", "Safety", "Other"]
24
 
25
  const SOURCE_COLORS: Record<string, string> = {
26
  first_party: "bg-amber-500",
 
51
  }, [mode])
52
 
53
  const categoryKeys = useMemo(
54
+ () => {
55
+ const available = new Set([
56
+ ...Object.keys(aggregates.reproducibility.by_category),
57
+ ...Object.keys(aggregates.completeness.by_category),
58
+ ...Object.keys(aggregates.provenance.by_category),
59
+ ...Object.keys(aggregates.comparability.by_category),
60
+ ])
61
+
62
+ return [
63
+ ...CATEGORY_ORDER.filter((category) => available.has(category)),
64
+ ...Array.from(available)
65
+ .filter((category) => !CATEGORY_ORDER.includes(category))
66
+ .sort((a, b) => a.localeCompare(b)),
67
+ ]
68
+ },
69
  [aggregates]
70
  )
71
 
 
198
  icon={<ClipboardCheck className="h-5 w-5" />}
199
  title="Reporting Completeness"
200
  subtitle="How much benchmark documentation is populated."
201
+ headline={formatPercent(block.completeness_avg)}
202
+ headlineLabel={`Range ${formatPercent(block.completeness_min)} to ${formatPercent(block.completeness_max)} across ${block.total_triples.toLocaleString()} reported score triples`}
203
  >
204
  {scores.length > 0 && <Histogram scores={scores} />}
205
+ <div className="mt-4 grid gap-2 sm:grid-cols-3">
206
+ <MiniMetric label="Minimum" value={formatPercent(block.completeness_min)} />
207
+ <MiniMetric label="Average" value={formatPercent(block.completeness_avg)} />
208
+ <MiniMetric label="Maximum" value={formatPercent(block.completeness_max)} />
 
 
 
 
 
 
 
 
 
 
 
209
  </div>
210
  </DashboardSection>
211
  )
 
214
  function ProvenanceSection({ block }: { block: ProvenanceCorpusBlock }) {
215
  const distribution = block.source_type_distribution
216
  const total = Object.values(distribution).reduce((sum, value) => sum + value, 0)
217
+ const multiSourceRate = rate(block.multi_source_triples, block.total_triples)
218
+ const firstPartyOnlyRate = rate(block.first_party_only_triples, block.total_triples)
219
 
220
  return (
221
  <DashboardSection
222
  icon={<BarChart3 className="h-5 w-5" />}
223
  title="Provenance"
224
  subtitle="Who reported the scores, and whether groups have multiple sources."
225
+ headline={formatPercent(multiSourceRate)}
226
+ headlineLabel="of reported score triples have multiple reporting sources"
227
  >
228
  <div className="overflow-hidden rounded-full border border-border/70 bg-muted/30">
229
  <div className="flex h-4 w-full">
 
239
  </div>
240
 
241
  <div className="mt-3 grid gap-2 sm:grid-cols-2">
242
+ <RatioTile label="Multi-source triples" value={multiSourceRate} count={block.multi_source_triples} />
243
+ <RatioTile label="First-party only triples" value={firstPartyOnlyRate} count={block.first_party_only_triples} />
244
  </div>
245
  </DashboardSection>
246
  )
247
  }
248
 
249
  function ComparabilitySection({ block }: { block: ComparabilityCorpusBlock }) {
250
+ const variantRate = rate(block.variant_divergent_count, block.groups_with_variant_check)
251
+ const crossPartyRate = rate(
252
+ block.cross_party_divergent_count,
253
+ block.groups_with_cross_party_check
254
+ )
255
+
256
  return (
257
  <DashboardSection
258
  icon={<GitCompareArrows className="h-5 w-5" />}
259
  title="Comparability"
260
  subtitle="Eligible groups where scores diverge across setups or reporting organizations."
261
+ headline={formatNullableRate(variantRate)}
262
+ headlineLabel={`${block.variant_divergent_count.toLocaleString()} of ${block.groups_with_variant_check.toLocaleString()} setup-eligible groups diverge`}
263
  >
264
  <div className="grid gap-3 md:grid-cols-2">
265
  <ComparabilityRateCard
266
  title="Variant divergence"
267
+ rate={variantRate}
268
+ eligible={block.groups_with_variant_check}
269
+ divergent={block.variant_divergent_count}
270
  />
271
  <ComparabilityRateCard
272
  title="Cross-party divergence"
273
+ rate={crossPartyRate}
274
+ eligible={block.groups_with_cross_party_check}
275
+ divergent={block.cross_party_divergent_count}
276
  />
277
  </div>
278
  </DashboardSection>
 
293
  comparability?: ComparabilityCorpusBlock
294
  }) {
295
  const categoryLabel = `${category.charAt(0).toUpperCase()}${category.slice(1)}`
296
+ const multiSourceRate = rate(provenance?.multi_source_triples, provenance?.total_triples)
297
+ const variantRate = rate(
298
+ comparability?.variant_divergent_count,
299
+ comparability?.groups_with_variant_check
300
+ )
301
+ const crossPartyRate = rate(
302
+ comparability?.cross_party_divergent_count,
303
+ comparability?.groups_with_cross_party_check
304
+ )
305
 
306
  return (
307
  <section className="rounded-2xl border border-border/70 bg-card p-4 shadow-sm">
 
311
  </div>
312
  <div className="grid gap-3 sm:grid-cols-2">
313
  <MiniMetric label="Reproducibility gaps" value={formatPercent(reproducibility?.reproducibility_gap_rate)} />
314
+ <MiniMetric label="Documentation mean" value={formatPercent(completeness?.completeness_avg)} />
315
+ <MiniMetric label="Multi-source triples" value={formatPercent(multiSourceRate)} />
316
+ <MiniMetric label="Variant divergence" value={formatNullableRate(variantRate)} />
317
  </div>
318
+ {crossPartyRate == null && (
319
  <div className="mt-3 rounded-xl border border-dashed border-border/70 bg-muted/10 px-3 py-2 text-sm text-muted-foreground">
320
  Cross-party divergence: N/A - not enough multi-org coverage.
321
  </div>
 
425
  <div className="text-sm font-medium">{label}</div>
426
  <div className="mt-1 flex items-baseline justify-between gap-2">
427
  <span className="text-xl font-semibold tabular-nums">{formatPercent(value)}</span>
428
+ <span className="text-xs text-muted-foreground">{count.toLocaleString()} triples</span>
429
  </div>
430
  </div>
431
  )
 
477
  return value == null ? "N/A" : formatPercent(value)
478
  }
479
 
480
+ function rate(numerator: number | null | undefined, denominator: number | null | undefined) {
481
+ if (numerator == null || denominator == null || denominator <= 0) return null
482
+ return numerator / denominator
483
+ }
484
+
485
  function formatGeneratedDate(value: string) {
486
  const date = new Date(value)
487
  if (Number.isNaN(date.getTime())) {
components/signals/corpus-signals-strip.tsx CHANGED
@@ -39,8 +39,13 @@ export function CorpusSignalsStrip({
39
  const tpShare = totalReports > 0 ? prov.source_type_distribution.third_party / totalReports : 0
40
  const fpShare = totalReports > 0 ? prov.source_type_distribution.first_party / totalReports : 0
41
 
42
- const cmpRate = cmp.variant_divergence_rate
43
- const crossPartyAvailable = cmp.cross_party_eligible_groups > 0
 
 
 
 
 
44
 
45
  return (
46
  <div className="signals-grid">
@@ -58,29 +63,29 @@ export function CorpusSignalsStrip({
58
  />
59
  <SignalTile
60
  id="completeness"
61
- statValue={pctNum(comp.completeness_score_mean)}
62
  statUnit="%"
63
- headline={`mean across ${comp.total_benchmarks.toLocaleString()} benchmarks (median ${formatPct(comp.completeness_score_median)}).`}
64
- detail="Source-provenance fields populate fully; preregistration fields are unmet."
65
  asks="Is the benchmark itself documented well enough to interpret a score on it?"
66
  />
67
  <SignalTile
68
  id="provenance"
69
- statValue={pctNum(prov.multi_source_rate)}
70
  statUnit="%"
71
- headline="of (model, benchmark) groups have reports from more than one party."
72
- detail={`${formatPct(tpShare)} third-party, ${formatPct(fpShare)} first-party of ${totalReports.toLocaleString()} results.`}
73
  asks="Who reported this score, and have others reproduced it?"
74
  />
75
  <SignalTile
76
  id="comparability"
77
  statValue={pctNum(cmpRate)}
78
  statUnit="%"
79
- headline={`of setup-eligible groups diverge across variants (${cmp.variant_divergent_groups.toLocaleString()} of ${cmp.variant_eligible_groups.toLocaleString()}).`}
80
  detail={
81
  crossPartyAvailable
82
- ? `Cross-party divergence: ${formatPct(cmp.cross_party_divergence_rate)}.`
83
- : "Cross-party divergence not yet computable too few multi-org reports."
84
  }
85
  asks="Are scores on the same benchmark actually measuring the same thing?"
86
  />
@@ -154,6 +159,11 @@ function formatPct(value: number | null | undefined): string {
154
  return `${Math.round(value * 100)}%`
155
  }
156
 
 
 
 
 
 
157
  const FIELD_LABELS: Record<string, string> = {
158
  temperature: "temperature",
159
  max_tokens: "max tokens",
 
39
  const tpShare = totalReports > 0 ? prov.source_type_distribution.third_party / totalReports : 0
40
  const fpShare = totalReports > 0 ? prov.source_type_distribution.first_party / totalReports : 0
41
 
42
+ const multiSourceRate = rate(prov.multi_source_triples, prov.total_triples)
43
+ const cmpRate = rate(cmp.variant_divergent_count, cmp.groups_with_variant_check)
44
+ const crossPartyRate = rate(
45
+ cmp.cross_party_divergent_count,
46
+ cmp.groups_with_cross_party_check
47
+ )
48
+ const crossPartyAvailable = cmp.groups_with_cross_party_check > 0
49
 
50
  return (
51
  <div className="signals-grid">
 
63
  />
64
  <SignalTile
65
  id="completeness"
66
+ statValue={pctNum(comp.completeness_avg)}
67
  statUnit="%"
68
+ headline={`mean across ${comp.total_triples.toLocaleString()} reported score triples.`}
69
+ detail={`Observed range: ${formatPct(comp.completeness_min)} to ${formatPct(comp.completeness_max)}.`}
70
  asks="Is the benchmark itself documented well enough to interpret a score on it?"
71
  />
72
  <SignalTile
73
  id="provenance"
74
+ statValue={pctNum(multiSourceRate)}
75
  statUnit="%"
76
+ headline="of reported score triples have reports from more than one party."
77
+ detail={`${formatPct(tpShare)} third-party, ${formatPct(fpShare)} first-party of ${totalReports.toLocaleString()} triples.`}
78
  asks="Who reported this score, and have others reproduced it?"
79
  />
80
  <SignalTile
81
  id="comparability"
82
  statValue={pctNum(cmpRate)}
83
  statUnit="%"
84
+ headline={`of setup-eligible groups diverge across variants (${cmp.variant_divergent_count.toLocaleString()} of ${cmp.groups_with_variant_check.toLocaleString()}).`}
85
  detail={
86
  crossPartyAvailable
87
+ ? `Cross-party divergence: ${formatPct(crossPartyRate)}.`
88
+ : "Cross-party divergence not yet computable: too few multi-org reports."
89
  }
90
  asks="Are scores on the same benchmark actually measuring the same thing?"
91
  />
 
159
  return `${Math.round(value * 100)}%`
160
  }
161
 
162
+ function rate(numerator: number | null | undefined, denominator: number | null | undefined) {
163
+ if (numerator == null || denominator == null || denominator <= 0) return null
164
+ return numerator / denominator
165
+ }
166
+
167
  const FIELD_LABELS: Record<string, string> = {
168
  temperature: "temperature",
169
  max_tokens: "max tokens",
data/benchmarks.json DELETED
@@ -1,90 +0,0 @@
1
- [
2
- {
3
- "benchmark": "ace",
4
- "model_count": 12
5
- },
6
- {
7
- "benchmark": "apex-agents",
8
- "model_count": 20
9
- },
10
- {
11
- "benchmark": "apex-v1",
12
- "model_count": 10
13
- },
14
- {
15
- "benchmark": "appworld_test_normal",
16
- "model_count": 3
17
- },
18
- {
19
- "benchmark": "bfcl",
20
- "model_count": 109
21
- },
22
- {
23
- "benchmark": "browsecompplus",
24
- "model_count": 3
25
- },
26
- {
27
- "benchmark": "global-mmlu-lite",
28
- "model_count": 27
29
- },
30
- {
31
- "benchmark": "helm_capabilities",
32
- "model_count": 61
33
- },
34
- {
35
- "benchmark": "helm_classic",
36
- "model_count": 67
37
- },
38
- {
39
- "benchmark": "helm_instruct",
40
- "model_count": 4
41
- },
42
- {
43
- "benchmark": "helm_lite",
44
- "model_count": 91
45
- },
46
- {
47
- "benchmark": "helm_mmlu",
48
- "model_count": 79
49
- },
50
- {
51
- "benchmark": "hfopenllm_v2",
52
- "model_count": 4493
53
- },
54
- {
55
- "benchmark": "la_leaderboard",
56
- "model_count": 5
57
- },
58
- {
59
- "benchmark": "livecodebenchpro",
60
- "model_count": 27
61
- },
62
- {
63
- "benchmark": "reward-bench",
64
- "model_count": 328
65
- },
66
- {
67
- "benchmark": "swe-bench",
68
- "model_count": 3
69
- },
70
- {
71
- "benchmark": "tau-bench-2_airline",
72
- "model_count": 3
73
- },
74
- {
75
- "benchmark": "tau-bench-2_retail",
76
- "model_count": 3
77
- },
78
- {
79
- "benchmark": "tau-bench-2_telecom",
80
- "model_count": 3
81
- },
82
- {
83
- "benchmark": "terminal-bench-2.0",
84
- "model_count": 37
85
- },
86
- {
87
- "benchmark": "theory_of_mind",
88
- "model_count": 1
89
- }
90
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers.json DELETED
@@ -1,3150 +0,0 @@
1
- [
2
- {
3
- "developer": "0-hero",
4
- "model_count": 3
5
- },
6
- {
7
- "developer": "01-ai",
8
- "model_count": 20
9
- },
10
- {
11
- "developer": "1-800-LLMs",
12
- "model_count": 2
13
- },
14
- {
15
- "developer": "1024m",
16
- "model_count": 2
17
- },
18
- {
19
- "developer": "152334H",
20
- "model_count": 1
21
- },
22
- {
23
- "developer": "1TuanPham",
24
- "model_count": 2
25
- },
26
- {
27
- "developer": "3rd-Degree-Burn",
28
- "model_count": 4
29
- },
30
- {
31
- "developer": "4season",
32
- "model_count": 1
33
- },
34
- {
35
- "developer": "aaditya",
36
- "model_count": 1
37
- },
38
- {
39
- "developer": "AALF",
40
- "model_count": 4
41
- },
42
- {
43
- "developer": "Aashraf995",
44
- "model_count": 4
45
- },
46
- {
47
- "developer": "abacusai",
48
- "model_count": 10
49
- },
50
- {
51
- "developer": "AbacusResearch",
52
- "model_count": 1
53
- },
54
- {
55
- "developer": "abhishek",
56
- "model_count": 5
57
- },
58
- {
59
- "developer": "abideen",
60
- "model_count": 1
61
- },
62
- {
63
- "developer": "adamo1139",
64
- "model_count": 1
65
- },
66
- {
67
- "developer": "adriszmar",
68
- "model_count": 1
69
- },
70
- {
71
- "developer": "AELLM",
72
- "model_count": 2
73
- },
74
- {
75
- "developer": "aevalone",
76
- "model_count": 1
77
- },
78
- {
79
- "developer": "agentlans",
80
- "model_count": 9
81
- },
82
- {
83
- "developer": "AGI-0",
84
- "model_count": 3
85
- },
86
- {
87
- "developer": "Ahdoot",
88
- "model_count": 2
89
- },
90
- {
91
- "developer": "Ahjeong",
92
- "model_count": 2
93
- },
94
- {
95
- "developer": "ahmeda335",
96
- "model_count": 1
97
- },
98
- {
99
- "developer": "AI-MO",
100
- "model_count": 2
101
- },
102
- {
103
- "developer": "AI-Sweden-Models",
104
- "model_count": 2
105
- },
106
- {
107
- "developer": "AI2",
108
- "model_count": 7
109
- },
110
- {
111
- "developer": "ai21",
112
- "model_count": 12
113
- },
114
- {
115
- "developer": "ai21labs",
116
- "model_count": 1
117
- },
118
- {
119
- "developer": "ai4bharat",
120
- "model_count": 1
121
- },
122
- {
123
- "developer": "AI4free",
124
- "model_count": 2
125
- },
126
- {
127
- "developer": "AicoresSecurity",
128
- "model_count": 4
129
- },
130
- {
131
- "developer": "AIDC-AI",
132
- "model_count": 1
133
- },
134
- {
135
- "developer": "aixonlab",
136
- "model_count": 3
137
- },
138
- {
139
- "developer": "akhadangi",
140
- "model_count": 5
141
- },
142
- {
143
- "developer": "akjindal53244",
144
- "model_count": 1
145
- },
146
- {
147
- "developer": "alcholjung",
148
- "model_count": 1
149
- },
150
- {
151
- "developer": "Alepach",
152
- "model_count": 3
153
- },
154
- {
155
- "developer": "aleph-alpha",
156
- "model_count": 3
157
- },
158
- {
159
- "developer": "AlephAlpha",
160
- "model_count": 3
161
- },
162
- {
163
- "developer": "Alibaba",
164
- "model_count": 6
165
- },
166
- {
167
- "developer": "Alibaba-NLP",
168
- "model_count": 1
169
- },
170
- {
171
- "developer": "aliyun",
172
- "model_count": 1
173
- },
174
- {
175
- "developer": "allenai",
176
- "model_count": 162
177
- },
178
- {
179
- "developer": "allknowingroger",
180
- "model_count": 88
181
- },
182
- {
183
- "developer": "allura-org",
184
- "model_count": 9
185
- },
186
- {
187
- "developer": "aloobun",
188
- "model_count": 2
189
- },
190
- {
191
- "developer": "alpindale",
192
- "model_count": 2
193
- },
194
- {
195
- "developer": "Alsebay",
196
- "model_count": 1
197
- },
198
- {
199
- "developer": "altomek",
200
- "model_count": 1
201
- },
202
- {
203
- "developer": "Amaorynho",
204
- "model_count": 4
205
- },
206
- {
207
- "developer": "amazon",
208
- "model_count": 8
209
- },
210
- {
211
- "developer": "amd",
212
- "model_count": 1
213
- },
214
- {
215
- "developer": "Amu",
216
- "model_count": 2
217
- },
218
- {
219
- "developer": "anakin87",
220
- "model_count": 1
221
- },
222
- {
223
- "developer": "anthracite-org",
224
- "model_count": 12
225
- },
226
- {
227
- "developer": "Anthropic",
228
- "model_count": 34
229
- },
230
- {
231
- "developer": "apple",
232
- "model_count": 1
233
- },
234
- {
235
- "developer": "applied-compute",
236
- "model_count": 1
237
- },
238
- {
239
- "developer": "appvoid",
240
- "model_count": 2
241
- },
242
- {
243
- "developer": "arcee-ai",
244
- "model_count": 11
245
- },
246
- {
247
- "developer": "argilla",
248
- "model_count": 2
249
- },
250
- {
251
- "developer": "argilla-warehouse",
252
- "model_count": 1
253
- },
254
- {
255
- "developer": "arisin",
256
- "model_count": 1
257
- },
258
- {
259
- "developer": "ark",
260
- "model_count": 1
261
- },
262
- {
263
- "developer": "ArliAI",
264
- "model_count": 2
265
- },
266
- {
267
- "developer": "arshiaafshani",
268
- "model_count": 1
269
- },
270
- {
271
- "developer": "Arthur-LAGACHERIE",
272
- "model_count": 1
273
- },
274
- {
275
- "developer": "Artples",
276
- "model_count": 2
277
- },
278
- {
279
- "developer": "Aryanne",
280
- "model_count": 3
281
- },
282
- {
283
- "developer": "asharsha30",
284
- "model_count": 1
285
- },
286
- {
287
- "developer": "ashercn97",
288
- "model_count": 2
289
- },
290
- {
291
- "developer": "assskelad",
292
- "model_count": 1
293
- },
294
- {
295
- "developer": "AtAndDev",
296
- "model_count": 1
297
- },
298
- {
299
- "developer": "Ateron",
300
- "model_count": 3
301
- },
302
- {
303
- "developer": "athirdpath",
304
- "model_count": 1
305
- },
306
- {
307
- "developer": "AtlaAI",
308
- "model_count": 2
309
- },
310
- {
311
- "developer": "AuraIndustries",
312
- "model_count": 4
313
- },
314
- {
315
- "developer": "Aurel9",
316
- "model_count": 1
317
- },
318
- {
319
- "developer": "automerger",
320
- "model_count": 1
321
- },
322
- {
323
- "developer": "avemio",
324
- "model_count": 1
325
- },
326
- {
327
- "developer": "awnr",
328
- "model_count": 5
329
- },
330
- {
331
- "developer": "aws-prototyping",
332
- "model_count": 1
333
- },
334
- {
335
- "developer": "axolotl-ai-co",
336
- "model_count": 1
337
- },
338
- {
339
- "developer": "Ayush-Singh",
340
- "model_count": 1
341
- },
342
- {
343
- "developer": "Azure99",
344
- "model_count": 6
345
- },
346
- {
347
- "developer": "Ba2han",
348
- "model_count": 1
349
- },
350
- {
351
- "developer": "BAAI",
352
- "model_count": 14
353
- },
354
- {
355
- "developer": "baconnier",
356
- "model_count": 2
357
- },
358
- {
359
- "developer": "baebee",
360
- "model_count": 3
361
- },
362
- {
363
- "developer": "bamec66557",
364
- "model_count": 27
365
- },
366
- {
367
- "developer": "Baptiste-HUVELLE-10",
368
- "model_count": 1
369
- },
370
- {
371
- "developer": "BEE-spoke-data",
372
- "model_count": 9
373
- },
374
- {
375
- "developer": "belztjti",
376
- "model_count": 2
377
- },
378
- {
379
- "developer": "BenevolenceMessiah",
380
- "model_count": 2
381
- },
382
- {
383
- "developer": "benhaotang",
384
- "model_count": 1
385
- },
386
- {
387
- "developer": "beomi",
388
- "model_count": 1
389
- },
390
- {
391
- "developer": "beowolx",
392
- "model_count": 1
393
- },
394
- {
395
- "developer": "berkeley-nest",
396
- "model_count": 2
397
- },
398
- {
399
- "developer": "bfuzzy1",
400
- "model_count": 7
401
- },
402
- {
403
- "developer": "bhuvneshsaini",
404
- "model_count": 1
405
- },
406
- {
407
- "developer": "bigcode",
408
- "model_count": 3
409
- },
410
- {
411
- "developer": "bigscience",
412
- "model_count": 7
413
- },
414
- {
415
- "developer": "bittensor",
416
- "model_count": 1
417
- },
418
- {
419
- "developer": "BlackBeenie",
420
- "model_count": 9
421
- },
422
- {
423
- "developer": "Bllossom",
424
- "model_count": 1
425
- },
426
- {
427
- "developer": "bluuwhale",
428
- "model_count": 1
429
- },
430
- {
431
- "developer": "BoltMonkey",
432
- "model_count": 3
433
- },
434
- {
435
- "developer": "bond005",
436
- "model_count": 1
437
- },
438
- {
439
- "developer": "bosonai",
440
- "model_count": 1
441
- },
442
- {
443
- "developer": "braindao",
444
- "model_count": 17
445
- },
446
- {
447
- "developer": "BrainWave-ML",
448
- "model_count": 1
449
- },
450
- {
451
- "developer": "BramVanroy",
452
- "model_count": 4
453
- },
454
- {
455
- "developer": "brgx53",
456
- "model_count": 6
457
- },
458
- {
459
- "developer": "BSC-LT",
460
- "model_count": 2
461
- },
462
- {
463
- "developer": "bunnycore",
464
- "model_count": 85
465
- },
466
- {
467
- "developer": "byroneverson",
468
- "model_count": 3
469
- },
470
- {
471
- "developer": "ByteDance",
472
- "model_count": 1
473
- },
474
- {
475
- "developer": "c10x",
476
- "model_count": 2
477
- },
478
- {
479
- "developer": "CarrotAI",
480
- "model_count": 2
481
- },
482
- {
483
- "developer": "carsenk",
484
- "model_count": 2
485
- },
486
- {
487
- "developer": "Casual-Autopsy",
488
- "model_count": 1
489
- },
490
- {
491
- "developer": "cat-searcher",
492
- "model_count": 2
493
- },
494
- {
495
- "developer": "CausalLM",
496
- "model_count": 3
497
- },
498
- {
499
- "developer": "cckm",
500
- "model_count": 1
501
- },
502
- {
503
- "developer": "cgato",
504
- "model_count": 1
505
- },
506
- {
507
- "developer": "Changgil",
508
- "model_count": 2
509
- },
510
- {
511
- "developer": "chargoddard",
512
- "model_count": 1
513
- },
514
- {
515
- "developer": "chujiezheng",
516
- "model_count": 2
517
- },
518
- {
519
- "developer": "CIR-AMS",
520
- "model_count": 1
521
- },
522
- {
523
- "developer": "cjvt",
524
- "model_count": 1
525
- },
526
- {
527
- "developer": "ClaudioItaly",
528
- "model_count": 4
529
- },
530
- {
531
- "developer": "cloudyu",
532
- "model_count": 7
533
- },
534
- {
535
- "developer": "cluebbers",
536
- "model_count": 3
537
- },
538
- {
539
- "developer": "cognitivecomputations",
540
- "model_count": 17
541
- },
542
- {
543
- "developer": "cohere",
544
- "model_count": 18
545
- },
546
- {
547
- "developer": "CohereForAI",
548
- "model_count": 8
549
- },
550
- {
551
- "developer": "collaiborateorg",
552
- "model_count": 1
553
- },
554
- {
555
- "developer": "Columbia-NLP",
556
- "model_count": 6
557
- },
558
- {
559
- "developer": "CombinHorizon",
560
- "model_count": 6
561
- },
562
- {
563
- "developer": "ContactDoctor",
564
- "model_count": 2
565
- },
566
- {
567
- "developer": "ContextualAI",
568
- "model_count": 16
569
- },
570
- {
571
- "developer": "CoolSpring",
572
- "model_count": 3
573
- },
574
- {
575
- "developer": "Corianas",
576
- "model_count": 3
577
- },
578
- {
579
- "developer": "CortexLM",
580
- "model_count": 1
581
- },
582
- {
583
- "developer": "cpayne1303",
584
- "model_count": 4
585
- },
586
- {
587
- "developer": "Cran-May",
588
- "model_count": 7
589
- },
590
- {
591
- "developer": "CreitinGameplays",
592
- "model_count": 1
593
- },
594
- {
595
- "developer": "crestf411",
596
- "model_count": 1
597
- },
598
- {
599
- "developer": "cstr",
600
- "model_count": 1
601
- },
602
- {
603
- "developer": "CultriX",
604
- "model_count": 32
605
- },
606
- {
607
- "developer": "cyberagent",
608
- "model_count": 1
609
- },
610
- {
611
- "developer": "CYFRAGOVPL",
612
- "model_count": 6
613
- },
614
- {
615
- "developer": "Daemontatox",
616
- "model_count": 32
617
- },
618
- {
619
- "developer": "Dampfinchen",
620
- "model_count": 1
621
- },
622
- {
623
- "developer": "Danielbrdz",
624
- "model_count": 7
625
- },
626
- {
627
- "developer": "Dans-DiscountModels",
628
- "model_count": 9
629
- },
630
- {
631
- "developer": "darkc0de",
632
- "model_count": 3
633
- },
634
- {
635
- "developer": "Darkknight535",
636
- "model_count": 1
637
- },
638
- {
639
- "developer": "databricks",
640
- "model_count": 6
641
- },
642
- {
643
- "developer": "Databricks-Mosaic-Research",
644
- "model_count": 1
645
- },
646
- {
647
- "developer": "DavidAU",
648
- "model_count": 25
649
- },
650
- {
651
- "developer": "davidkim205",
652
- "model_count": 2
653
- },
654
- {
655
- "developer": "Davidsv",
656
- "model_count": 1
657
- },
658
- {
659
- "developer": "DavieLion",
660
- "model_count": 5
661
- },
662
- {
663
- "developer": "DebateLabKIT",
664
- "model_count": 1
665
- },
666
- {
667
- "developer": "Deci",
668
- "model_count": 2
669
- },
670
- {
671
- "developer": "DeepAutoAI",
672
- "model_count": 12
673
- },
674
- {
675
- "developer": "DeepMount00",
676
- "model_count": 13
677
- },
678
- {
679
- "developer": "DeepSeek",
680
- "model_count": 9
681
- },
682
- {
683
- "developer": "deepseek-ai",
684
- "model_count": 13
685
- },
686
- {
687
- "developer": "Delta-Vector",
688
- "model_count": 7
689
- },
690
- {
691
- "developer": "DevQuasar",
692
- "model_count": 1
693
- },
694
- {
695
- "developer": "dfurman",
696
- "model_count": 4
697
- },
698
- {
699
- "developer": "dicta-il",
700
- "model_count": 2
701
- },
702
- {
703
- "developer": "distilbert",
704
- "model_count": 1
705
- },
706
- {
707
- "developer": "divyanshukunwar",
708
- "model_count": 1
709
- },
710
- {
711
- "developer": "djuna",
712
- "model_count": 15
713
- },
714
- {
715
- "developer": "djuna-test-lab",
716
- "model_count": 2
717
- },
718
- {
719
- "developer": "dnhkng",
720
- "model_count": 10
721
- },
722
- {
723
- "developer": "Dongwei",
724
- "model_count": 1
725
- },
726
- {
727
- "developer": "DoppelReflEx",
728
- "model_count": 29
729
- },
730
- {
731
- "developer": "DreadPoor",
732
- "model_count": 119
733
- },
734
- {
735
- "developer": "dreamgen",
736
- "model_count": 1
737
- },
738
- {
739
- "developer": "DRXD1000",
740
- "model_count": 2
741
- },
742
- {
743
- "developer": "DUAL-GPO",
744
- "model_count": 1
745
- },
746
- {
747
- "developer": "dustinwloring1988",
748
- "model_count": 7
749
- },
750
- {
751
- "developer": "duyhv1411",
752
- "model_count": 2
753
- },
754
- {
755
- "developer": "dwikitheduck",
756
- "model_count": 6
757
- },
758
- {
759
- "developer": "dzakwan",
760
- "model_count": 1
761
- },
762
- {
763
- "developer": "DZgas",
764
- "model_count": 1
765
- },
766
- {
767
- "developer": "ECE-ILAB-PRYMMAL",
768
- "model_count": 1
769
- },
770
- {
771
- "developer": "Edgerunners",
772
- "model_count": 1
773
- },
774
- {
775
- "developer": "ehristoforu",
776
- "model_count": 36
777
- },
778
- {
779
- "developer": "EleutherAI",
780
- "model_count": 12
781
- },
782
- {
783
- "developer": "elinas",
784
- "model_count": 1
785
- },
786
- {
787
- "developer": "ell44ot",
788
- "model_count": 1
789
- },
790
- {
791
- "developer": "Enno-Ai",
792
- "model_count": 4
793
- },
794
- {
795
- "developer": "EnnoAi",
796
- "model_count": 2
797
- },
798
- {
799
- "developer": "Epiculous",
800
- "model_count": 4
801
- },
802
- {
803
- "developer": "EpistemeAI",
804
- "model_count": 47
805
- },
806
- {
807
- "developer": "EpistemeAI2",
808
- "model_count": 15
809
- },
810
- {
811
- "developer": "Eric111",
812
- "model_count": 2
813
- },
814
- {
815
- "developer": "Etherll",
816
- "model_count": 8
817
- },
818
- {
819
- "developer": "euclaise",
820
- "model_count": 1
821
- },
822
- {
823
- "developer": "Eurdem",
824
- "model_count": 1
825
- },
826
- {
827
- "developer": "EVA-UNIT-01",
828
- "model_count": 2
829
- },
830
- {
831
- "developer": "eworojoshua",
832
- "model_count": 1
833
- },
834
- {
835
- "developer": "ewre324",
836
- "model_count": 4
837
- },
838
- {
839
- "developer": "experiment-llm",
840
- "model_count": 1
841
- },
842
- {
843
- "developer": "facebook",
844
- "model_count": 4
845
- },
846
- {
847
- "developer": "failspy",
848
- "model_count": 6
849
- },
850
- {
851
- "developer": "FallenMerick",
852
- "model_count": 1
853
- },
854
- {
855
- "developer": "fblgit",
856
- "model_count": 11
857
- },
858
- {
859
- "developer": "Felladrin",
860
- "model_count": 2
861
- },
862
- {
863
- "developer": "fhai50032",
864
- "model_count": 2
865
- },
866
- {
867
- "developer": "FINGU-AI",
868
- "model_count": 7
869
- },
870
- {
871
- "developer": "flammenai",
872
- "model_count": 6
873
- },
874
- {
875
- "developer": "FlofloB",
876
- "model_count": 27
877
- },
878
- {
879
- "developer": "fluently-lm",
880
- "model_count": 3
881
- },
882
- {
883
- "developer": "fluently-sets",
884
- "model_count": 2
885
- },
886
- {
887
- "developer": "formulae",
888
- "model_count": 10
889
- },
890
- {
891
- "developer": "frameai",
892
- "model_count": 1
893
- },
894
- {
895
- "developer": "freewheelin",
896
- "model_count": 4
897
- },
898
- {
899
- "developer": "FuJhen",
900
- "model_count": 4
901
- },
902
- {
903
- "developer": "fulim",
904
- "model_count": 1
905
- },
906
- {
907
- "developer": "FuseAI",
908
- "model_count": 4
909
- },
910
- {
911
- "developer": "gabrielmbmb",
912
- "model_count": 1
913
- },
914
- {
915
- "developer": "GalrionSoftworks",
916
- "model_count": 2
917
- },
918
- {
919
- "developer": "gaverfraxz",
920
- "model_count": 2
921
- },
922
- {
923
- "developer": "gbueno86",
924
- "model_count": 2
925
- },
926
- {
927
- "developer": "general-preference",
928
- "model_count": 2
929
- },
930
- {
931
- "developer": "GenVRadmin",
932
- "model_count": 4
933
- },
934
- {
935
- "developer": "ghost-x",
936
- "model_count": 1
937
- },
938
- {
939
- "developer": "glaiveai",
940
- "model_count": 1
941
- },
942
- {
943
- "developer": "gmonsoon",
944
- "model_count": 5
945
- },
946
- {
947
- "developer": "godlikehhd",
948
- "model_count": 26
949
- },
950
- {
951
- "developer": "Goekdeniz-Guelmez",
952
- "model_count": 10
953
- },
954
- {
955
- "developer": "Google",
956
- "model_count": 78
957
- },
958
- {
959
- "developer": "GoToCompany",
960
- "model_count": 2
961
- },
962
- {
963
- "developer": "goulue5",
964
- "model_count": 1
965
- },
966
- {
967
- "developer": "gradientai",
968
- "model_count": 1
969
- },
970
- {
971
- "developer": "GreenNode",
972
- "model_count": 1
973
- },
974
- {
975
- "developer": "grimjim",
976
- "model_count": 25
977
- },
978
- {
979
- "developer": "GritLM",
980
- "model_count": 2
981
- },
982
- {
983
- "developer": "Groq",
984
- "model_count": 1
985
- },
986
- {
987
- "developer": "Gryphe",
988
- "model_count": 5
989
- },
990
- {
991
- "developer": "GuilhermeNaturaUmana",
992
- "model_count": 1
993
- },
994
- {
995
- "developer": "Gunulhona",
996
- "model_count": 2
997
- },
998
- {
999
- "developer": "gupta-tanish",
1000
- "model_count": 1
1001
- },
1002
- {
1003
- "developer": "gz987",
1004
- "model_count": 4
1005
- },
1006
- {
1007
- "developer": "h2oai",
1008
- "model_count": 5
1009
- },
1010
- {
1011
- "developer": "haoranxu",
1012
- "model_count": 3
1013
- },
1014
- {
1015
- "developer": "HarbingerX",
1016
- "model_count": 2
1017
- },
1018
- {
1019
- "developer": "Hastagaras",
1020
- "model_count": 3
1021
- },
1022
- {
1023
- "developer": "hatemmahmoud",
1024
- "model_count": 1
1025
- },
1026
- {
1027
- "developer": "HelpingAI",
1028
- "model_count": 4
1029
- },
1030
- {
1031
- "developer": "hendrydong",
1032
- "model_count": 1
1033
- },
1034
- {
1035
- "developer": "HeraiHench",
1036
- "model_count": 4
1037
- },
1038
- {
1039
- "developer": "HFXM",
1040
- "model_count": 1
1041
- },
1042
- {
1043
- "developer": "HiroseKoichi",
1044
- "model_count": 1
1045
- },
1046
- {
1047
- "developer": "HoangHa",
1048
- "model_count": 1
1049
- },
1050
- {
1051
- "developer": "hon9kon9ize",
1052
- "model_count": 2
1053
- },
1054
- {
1055
- "developer": "hongbai12",
1056
- "model_count": 1
1057
- },
1058
- {
1059
- "developer": "hotmailuser",
1060
- "model_count": 34
1061
- },
1062
- {
1063
- "developer": "HPAI-BSC",
1064
- "model_count": 3
1065
- },
1066
- {
1067
- "developer": "huawei-noah-ustc",
1068
- "model_count": 1
1069
- },
1070
- {
1071
- "developer": "HuggingFaceH4",
1072
- "model_count": 5
1073
- },
1074
- {
1075
- "developer": "HuggingFaceTB",
1076
- "model_count": 12
1077
- },
1078
- {
1079
- "developer": "huggyllama",
1080
- "model_count": 3
1081
- },
1082
- {
1083
- "developer": "huihui-ai",
1084
- "model_count": 8
1085
- },
1086
- {
1087
- "developer": "HumanLLMs",
1088
- "model_count": 3
1089
- },
1090
- {
1091
- "developer": "huu-ontocord",
1092
- "model_count": 1
1093
- },
1094
- {
1095
- "developer": "ibivibiv",
1096
- "model_count": 2
1097
- },
1098
- {
1099
- "developer": "ibm",
1100
- "model_count": 8
1101
- },
1102
- {
1103
- "developer": "ibm-granite",
1104
- "model_count": 20
1105
- },
1106
- {
1107
- "developer": "icefog72",
1108
- "model_count": 62
1109
- },
1110
- {
1111
- "developer": "IDEA-CCNL",
1112
- "model_count": 2
1113
- },
1114
- {
1115
- "developer": "ifable",
1116
- "model_count": 1
1117
- },
1118
- {
1119
- "developer": "iFaz",
1120
- "model_count": 8
1121
- },
1122
- {
1123
- "developer": "ilsp",
1124
- "model_count": 1
1125
- },
1126
- {
1127
- "developer": "IlyaGusev",
1128
- "model_count": 2
1129
- },
1130
- {
1131
- "developer": "Infinirc",
1132
- "model_count": 1
1133
- },
1134
- {
1135
- "developer": "inflatebot",
1136
- "model_count": 1
1137
- },
1138
- {
1139
- "developer": "infly",
1140
- "model_count": 1
1141
- },
1142
- {
1143
- "developer": "informatiker",
1144
- "model_count": 1
1145
- },
1146
- {
1147
- "developer": "INSAIT-Institute",
1148
- "model_count": 1
1149
- },
1150
- {
1151
- "developer": "insightfactory",
1152
- "model_count": 1
1153
- },
1154
- {
1155
- "developer": "instruction-pretrain",
1156
- "model_count": 1
1157
- },
1158
- {
1159
- "developer": "Intel",
1160
- "model_count": 4
1161
- },
1162
- {
1163
- "developer": "internlm",
1164
- "model_count": 9
1165
- },
1166
- {
1167
- "developer": "intervitens",
1168
- "model_count": 1
1169
- },
1170
- {
1171
- "developer": "IntervitensInc",
1172
- "model_count": 1
1173
- },
1174
- {
1175
- "developer": "inumulaisk",
1176
- "model_count": 1
1177
- },
1178
- {
1179
- "developer": "invalid-coder",
1180
- "model_count": 1
1181
- },
1182
- {
1183
- "developer": "Invalid-Null",
1184
- "model_count": 2
1185
- },
1186
- {
1187
- "developer": "invisietch",
1188
- "model_count": 4
1189
- },
1190
- {
1191
- "developer": "irahulpandey",
1192
- "model_count": 1
1193
- },
1194
- {
1195
- "developer": "iRyanBell",
1196
- "model_count": 2
1197
- },
1198
- {
1199
- "developer": "Isaak-Carter",
1200
- "model_count": 3
1201
- },
1202
- {
1203
- "developer": "J-LAB",
1204
- "model_count": 1
1205
- },
1206
- {
1207
- "developer": "JackFram",
1208
- "model_count": 2
1209
- },
1210
- {
1211
- "developer": "Jacoby746",
1212
- "model_count": 7
1213
- },
1214
- {
1215
- "developer": "jaredjoss",
1216
- "model_count": 1
1217
- },
1218
- {
1219
- "developer": "jaspionjader",
1220
- "model_count": 196
1221
- },
1222
- {
1223
- "developer": "jayasuryajsk",
1224
- "model_count": 1
1225
- },
1226
- {
1227
- "developer": "JayHyeon",
1228
- "model_count": 174
1229
- },
1230
- {
1231
- "developer": "jeanmichela",
1232
- "model_count": 1
1233
- },
1234
- {
1235
- "developer": "jebcarter",
1236
- "model_count": 1
1237
- },
1238
- {
1239
- "developer": "jebish7",
1240
- "model_count": 9
1241
- },
1242
- {
1243
- "developer": "jeffmeloy",
1244
- "model_count": 18
1245
- },
1246
- {
1247
- "developer": "jeonsworld",
1248
- "model_count": 1
1249
- },
1250
- {
1251
- "developer": "jiangxinyang-shanda",
1252
- "model_count": 1
1253
- },
1254
- {
1255
- "developer": "jieliu",
1256
- "model_count": 1
1257
- },
1258
- {
1259
- "developer": "Jimmy19991222",
1260
- "model_count": 8
1261
- },
1262
- {
1263
- "developer": "jiviai",
1264
- "model_count": 1
1265
- },
1266
- {
1267
- "developer": "jlzhou",
1268
- "model_count": 1
1269
- },
1270
- {
1271
- "developer": "johnsutor",
1272
- "model_count": 31
1273
- },
1274
- {
1275
- "developer": "jondurbin",
1276
- "model_count": 1
1277
- },
1278
- {
1279
- "developer": "Joseph717171",
1280
- "model_count": 2
1281
- },
1282
- {
1283
- "developer": "Josephgflowers",
1284
- "model_count": 7
1285
- },
1286
- {
1287
- "developer": "jpacifico",
1288
- "model_count": 18
1289
- },
1290
- {
1291
- "developer": "jsfs11",
1292
- "model_count": 3
1293
- },
1294
- {
1295
- "developer": "JungZoona",
1296
- "model_count": 2
1297
- },
1298
- {
1299
- "developer": "Junhoee",
1300
- "model_count": 1
1301
- },
1302
- {
1303
- "developer": "kaist-ai",
1304
- "model_count": 4
1305
- },
1306
- {
1307
- "developer": "katanemo",
1308
- "model_count": 3
1309
- },
1310
- {
1311
- "developer": "kavonalds",
1312
- "model_count": 3
1313
- },
1314
- {
1315
- "developer": "kayfour",
1316
- "model_count": 1
1317
- },
1318
- {
1319
- "developer": "keeeeenw",
1320
- "model_count": 1
1321
- },
1322
- {
1323
- "developer": "kekmodel",
1324
- "model_count": 1
1325
- },
1326
- {
1327
- "developer": "kevin009",
1328
- "model_count": 1
1329
- },
1330
- {
1331
- "developer": "Khetterman",
1332
- "model_count": 2
1333
- },
1334
- {
1335
- "developer": "khoantap",
1336
- "model_count": 9
1337
- },
1338
- {
1339
- "developer": "khulaifi95",
1340
- "model_count": 1
1341
- },
1342
- {
1343
- "developer": "Kimargin",
1344
- "model_count": 1
1345
- },
1346
- {
1347
- "developer": "Kimi",
1348
- "model_count": 1
1349
- },
1350
- {
1351
- "developer": "KingNish",
1352
- "model_count": 7
1353
- },
1354
- {
1355
- "developer": "kms7530",
1356
- "model_count": 4
1357
- },
1358
- {
1359
- "developer": "kno10",
1360
- "model_count": 2
1361
- },
1362
- {
1363
- "developer": "Kquant03",
1364
- "model_count": 2
1365
- },
1366
- {
1367
- "developer": "Krystalan",
1368
- "model_count": 2
1369
- },
1370
- {
1371
- "developer": "KSU-HW-SEC",
1372
- "model_count": 4
1373
- },
1374
- {
1375
- "developer": "Kuaishou",
1376
- "model_count": 1
1377
- },
1378
- {
1379
- "developer": "Kukedlc",
1380
- "model_count": 7
1381
- },
1382
- {
1383
- "developer": "Kumar955",
1384
- "model_count": 1
1385
- },
1386
- {
1387
- "developer": "kyutai",
1388
- "model_count": 1
1389
- },
1390
- {
1391
- "developer": "kz919",
1392
- "model_count": 1
1393
- },
1394
- {
1395
- "developer": "L-RAGE",
1396
- "model_count": 1
1397
- },
1398
- {
1399
- "developer": "ladydaina",
1400
- "model_count": 1
1401
- },
1402
- {
1403
- "developer": "laislemke",
1404
- "model_count": 1
1405
- },
1406
- {
1407
- "developer": "lalainy",
1408
- "model_count": 7
1409
- },
1410
- {
1411
- "developer": "Lambent",
1412
- "model_count": 1
1413
- },
1414
- {
1415
- "developer": "Langboat",
1416
- "model_count": 1
1417
- },
1418
- {
1419
- "developer": "langgptai",
1420
- "model_count": 2
1421
- },
1422
- {
1423
- "developer": "lars1234",
1424
- "model_count": 1
1425
- },
1426
- {
1427
- "developer": "Lawnakk",
1428
- "model_count": 10
1429
- },
1430
- {
1431
- "developer": "leafspark",
1432
- "model_count": 1
1433
- },
1434
- {
1435
- "developer": "LEESM",
1436
- "model_count": 4
1437
- },
1438
- {
1439
- "developer": "lemon07r",
1440
- "model_count": 17
1441
- },
1442
- {
1443
- "developer": "LenguajeNaturalAI",
1444
- "model_count": 2
1445
- },
1446
- {
1447
- "developer": "LeroyDyer",
1448
- "model_count": 58
1449
- },
1450
- {
1451
- "developer": "lesubra",
1452
- "model_count": 8
1453
- },
1454
- {
1455
- "developer": "LGAI-EXAONE",
1456
- "model_count": 4
1457
- },
1458
- {
1459
- "developer": "lightblue",
1460
- "model_count": 5
1461
- },
1462
- {
1463
- "developer": "LightningRodLabs",
1464
- "model_count": 3
1465
- },
1466
- {
1467
- "developer": "Lil-R",
1468
- "model_count": 8
1469
- },
1470
- {
1471
- "developer": "LilRg",
1472
- "model_count": 10
1473
- },
1474
- {
1475
- "developer": "LimYeri",
1476
- "model_count": 5
1477
- },
1478
- {
1479
- "developer": "lkoenig",
1480
- "model_count": 11
1481
- },
1482
- {
1483
- "developer": "llm-blender",
1484
- "model_count": 1
1485
- },
1486
- {
1487
- "developer": "LLM360",
1488
- "model_count": 2
1489
- },
1490
- {
1491
- "developer": "LLM4Binary",
1492
- "model_count": 1
1493
- },
1494
- {
1495
- "developer": "llmat",
1496
- "model_count": 1
1497
- },
1498
- {
1499
- "developer": "llnYou",
1500
- "model_count": 5
1501
- },
1502
- {
1503
- "developer": "lmsys",
1504
- "model_count": 5
1505
- },
1506
- {
1507
- "developer": "Locutusque",
1508
- "model_count": 6
1509
- },
1510
- {
1511
- "developer": "lodrick-the-lafted",
1512
- "model_count": 1
1513
- },
1514
- {
1515
- "developer": "lordjia",
1516
- "model_count": 2
1517
- },
1518
- {
1519
- "developer": "lt-asset",
1520
- "model_count": 1
1521
- },
1522
- {
1523
- "developer": "lunahr",
1524
- "model_count": 2
1525
- },
1526
- {
1527
- "developer": "Luni",
1528
- "model_count": 2
1529
- },
1530
- {
1531
- "developer": "Lunzima",
1532
- "model_count": 18
1533
- },
1534
- {
1535
- "developer": "LxzGordon",
1536
- "model_count": 2
1537
- },
1538
- {
1539
- "developer": "Lyte",
1540
- "model_count": 3
1541
- },
1542
- {
1543
- "developer": "M4-ai",
1544
- "model_count": 1
1545
- },
1546
- {
1547
- "developer": "m42-health",
1548
- "model_count": 1
1549
- },
1550
- {
1551
- "developer": "macadeliccc",
1552
- "model_count": 3
1553
- },
1554
- {
1555
- "developer": "madeagents",
1556
- "model_count": 4
1557
- },
1558
- {
1559
- "developer": "magnifi",
1560
- "model_count": 1
1561
- },
1562
- {
1563
- "developer": "Magpie-Align",
1564
- "model_count": 8
1565
- },
1566
- {
1567
- "developer": "MagusCorp",
1568
- "model_count": 1
1569
- },
1570
- {
1571
- "developer": "maldv",
1572
- "model_count": 7
1573
- },
1574
- {
1575
- "developer": "ManoloPueblo",
1576
- "model_count": 3
1577
- },
1578
- {
1579
- "developer": "marcuscedricridia",
1580
- "model_count": 40
1581
- },
1582
- {
1583
- "developer": "marin-community",
1584
- "model_count": 1
1585
- },
1586
- {
1587
- "developer": "MarinaraSpaghetti",
1588
- "model_count": 2
1589
- },
1590
- {
1591
- "developer": "Marsouuu",
1592
- "model_count": 8
1593
- },
1594
- {
1595
- "developer": "matouLeLoup",
1596
- "model_count": 5
1597
- },
1598
- {
1599
- "developer": "mattshumer",
1600
- "model_count": 3
1601
- },
1602
- {
1603
- "developer": "maywell",
1604
- "model_count": 1
1605
- },
1606
- {
1607
- "developer": "MaziyarPanahi",
1608
- "model_count": 44
1609
- },
1610
- {
1611
- "developer": "meditsolutions",
1612
- "model_count": 12
1613
- },
1614
- {
1615
- "developer": "meetkai",
1616
- "model_count": 1
1617
- },
1618
- {
1619
- "developer": "meraGPT",
1620
- "model_count": 1
1621
- },
1622
- {
1623
- "developer": "mergekit-community",
1624
- "model_count": 11
1625
- },
1626
- {
1627
- "developer": "MEscriva",
1628
- "model_count": 1
1629
- },
1630
- {
1631
- "developer": "Meta",
1632
- "model_count": 26
1633
- },
1634
- {
1635
- "developer": "meta-llama",
1636
- "model_count": 23
1637
- },
1638
- {
1639
- "developer": "meta-metrics",
1640
- "model_count": 1
1641
- },
1642
- {
1643
- "developer": "mhl1",
1644
- "model_count": 1
1645
- },
1646
- {
1647
- "developer": "microsoft",
1648
- "model_count": 19
1649
- },
1650
- {
1651
- "developer": "mightbe",
1652
- "model_count": 1
1653
- },
1654
- {
1655
- "developer": "migtissera",
1656
- "model_count": 8
1657
- },
1658
- {
1659
- "developer": "Minami-su",
1660
- "model_count": 5
1661
- },
1662
- {
1663
- "developer": "mindw96",
1664
- "model_count": 1
1665
- },
1666
- {
1667
- "developer": "minghaowu",
1668
- "model_count": 1
1669
- },
1670
- {
1671
- "developer": "MiniMax",
1672
- "model_count": 4
1673
- },
1674
- {
1675
- "developer": "ministral",
1676
- "model_count": 1
1677
- },
1678
- {
1679
- "developer": "mistral-community",
1680
- "model_count": 3
1681
- },
1682
- {
1683
- "developer": "mistralai",
1684
- "model_count": 36
1685
- },
1686
- {
1687
- "developer": "mixtao",
1688
- "model_count": 1
1689
- },
1690
- {
1691
- "developer": "mkurman",
1692
- "model_count": 3
1693
- },
1694
- {
1695
- "developer": "mkxu",
1696
- "model_count": 2
1697
- },
1698
- {
1699
- "developer": "mlabonne",
1700
- "model_count": 14
1701
- },
1702
- {
1703
- "developer": "MLP-KTLim",
1704
- "model_count": 1
1705
- },
1706
- {
1707
- "developer": "mlx-community",
1708
- "model_count": 2
1709
- },
1710
- {
1711
- "developer": "mmnga",
1712
- "model_count": 1
1713
- },
1714
- {
1715
- "developer": "mobiuslabsgmbh",
1716
- "model_count": 2
1717
- },
1718
- {
1719
- "developer": "ModelCloud",
1720
- "model_count": 1
1721
- },
1722
- {
1723
- "developer": "ModelSpace",
1724
- "model_count": 1
1725
- },
1726
- {
1727
- "developer": "moeru-ai",
1728
- "model_count": 3
1729
- },
1730
- {
1731
- "developer": "monsterapi",
1732
- "model_count": 2
1733
- },
1734
- {
1735
- "developer": "MoonRide",
1736
- "model_count": 1
1737
- },
1738
- {
1739
- "developer": "moonshot",
1740
- "model_count": 2
1741
- },
1742
- {
1743
- "developer": "Moonshot AI",
1744
- "model_count": 2
1745
- },
1746
- {
1747
- "developer": "moonshotai",
1748
- "model_count": 2
1749
- },
1750
- {
1751
- "developer": "mosaicml",
1752
- "model_count": 3
1753
- },
1754
- {
1755
- "developer": "mosama",
1756
- "model_count": 1
1757
- },
1758
- {
1759
- "developer": "Mostafa8Mehrabi",
1760
- "model_count": 1
1761
- },
1762
- {
1763
- "developer": "mrdayl",
1764
- "model_count": 5
1765
- },
1766
- {
1767
- "developer": "mrm8488",
1768
- "model_count": 2
1769
- },
1770
- {
1771
- "developer": "MrRobotoAI",
1772
- "model_count": 2
1773
- },
1774
- {
1775
- "developer": "MTSAIR",
1776
- "model_count": 2
1777
- },
1778
- {
1779
- "developer": "mukaj",
1780
- "model_count": 1
1781
- },
1782
- {
1783
- "developer": "Multiple",
1784
- "model_count": 1
1785
- },
1786
- {
1787
- "developer": "MultivexAI",
1788
- "model_count": 5
1789
- },
1790
- {
1791
- "developer": "Mxode",
1792
- "model_count": 5
1793
- },
1794
- {
1795
- "developer": "my_model",
1796
- "model_count": 1
1797
- },
1798
- {
1799
- "developer": "nanbeige",
1800
- "model_count": 2
1801
- },
1802
- {
1803
- "developer": "NAPS-ai",
1804
- "model_count": 7
1805
- },
1806
- {
1807
- "developer": "natong19",
1808
- "model_count": 2
1809
- },
1810
- {
1811
- "developer": "Naveenpoliasetty",
1812
- "model_count": 1
1813
- },
1814
- {
1815
- "developer": "nazimali",
1816
- "model_count": 2
1817
- },
1818
- {
1819
- "developer": "NbAiLab",
1820
- "model_count": 2
1821
- },
1822
- {
1823
- "developer": "nbeerbower",
1824
- "model_count": 51
1825
- },
1826
- {
1827
- "developer": "nbrahme",
1828
- "model_count": 1
1829
- },
1830
- {
1831
- "developer": "NCSOFT",
1832
- "model_count": 3
1833
- },
1834
- {
1835
- "developer": "necva",
1836
- "model_count": 2
1837
- },
1838
- {
1839
- "developer": "Nekochu",
1840
- "model_count": 4
1841
- },
1842
- {
1843
- "developer": "neopolita",
1844
- "model_count": 11
1845
- },
1846
- {
1847
- "developer": "netcat420",
1848
- "model_count": 48
1849
- },
1850
- {
1851
- "developer": "netease-youdao",
1852
- "model_count": 1
1853
- },
1854
- {
1855
- "developer": "NeverSleep",
1856
- "model_count": 2
1857
- },
1858
- {
1859
- "developer": "newsbang",
1860
- "model_count": 7
1861
- },
1862
- {
1863
- "developer": "Nexesenex",
1864
- "model_count": 44
1865
- },
1866
- {
1867
- "developer": "Nexusflow",
1868
- "model_count": 2
1869
- },
1870
- {
1871
- "developer": "nguyentd",
1872
- "model_count": 1
1873
- },
1874
- {
1875
- "developer": "ngxson",
1876
- "model_count": 2
1877
- },
1878
- {
1879
- "developer": "nhyha",
1880
- "model_count": 5
1881
- },
1882
- {
1883
- "developer": "nicolinho",
1884
- "model_count": 4
1885
- },
1886
- {
1887
- "developer": "nidum",
1888
- "model_count": 1
1889
- },
1890
- {
1891
- "developer": "NikolaSigmoid",
1892
- "model_count": 7
1893
- },
1894
- {
1895
- "developer": "nisten",
1896
- "model_count": 2
1897
- },
1898
- {
1899
- "developer": "Nitral-AI",
1900
- "model_count": 8
1901
- },
1902
- {
1903
- "developer": "NJS26",
1904
- "model_count": 1
1905
- },
1906
- {
1907
- "developer": "NLPark",
1908
- "model_count": 3
1909
- },
1910
- {
1911
- "developer": "nlpguy",
1912
- "model_count": 9
1913
- },
1914
- {
1915
- "developer": "Nohobby",
1916
- "model_count": 2
1917
- },
1918
- {
1919
- "developer": "noname0202",
1920
- "model_count": 8
1921
- },
1922
- {
1923
- "developer": "Norquinal",
1924
- "model_count": 8
1925
- },
1926
- {
1927
- "developer": "NotASI",
1928
- "model_count": 4
1929
- },
1930
- {
1931
- "developer": "notbdq",
1932
- "model_count": 1
1933
- },
1934
- {
1935
- "developer": "nothingiisreal",
1936
- "model_count": 3
1937
- },
1938
- {
1939
- "developer": "NousResearch",
1940
- "model_count": 19
1941
- },
1942
- {
1943
- "developer": "Novaciano",
1944
- "model_count": 11
1945
- },
1946
- {
1947
- "developer": "NTQAI",
1948
- "model_count": 2
1949
- },
1950
- {
1951
- "developer": "NucleusAI",
1952
- "model_count": 1
1953
- },
1954
- {
1955
- "developer": "nvidia",
1956
- "model_count": 22
1957
- },
1958
- {
1959
- "developer": "nxmwxm",
1960
- "model_count": 1
1961
- },
1962
- {
1963
- "developer": "NYTK",
1964
- "model_count": 2
1965
- },
1966
- {
1967
- "developer": "NyxKrage",
1968
- "model_count": 1
1969
- },
1970
- {
1971
- "developer": "occiglot",
1972
- "model_count": 1
1973
- },
1974
- {
1975
- "developer": "odyssey-labs",
1976
- "model_count": 1
1977
- },
1978
- {
1979
- "developer": "OEvortex",
1980
- "model_count": 5
1981
- },
1982
- {
1983
- "developer": "olabs-ai",
1984
- "model_count": 1
1985
- },
1986
- {
1987
- "developer": "OliveiraJLT",
1988
- "model_count": 1
1989
- },
1990
- {
1991
- "developer": "Omkar1102",
1992
- "model_count": 1
1993
- },
1994
- {
1995
- "developer": "OmnicromsBrain",
1996
- "model_count": 1
1997
- },
1998
- {
1999
- "developer": "OnlyCheeini",
2000
- "model_count": 1
2001
- },
2002
- {
2003
- "developer": "ontocord",
2004
- "model_count": 32
2005
- },
2006
- {
2007
- "developer": "oobabooga",
2008
- "model_count": 1
2009
- },
2010
- {
2011
- "developer": "oopere",
2012
- "model_count": 9
2013
- },
2014
- {
2015
- "developer": "open-atlas",
2016
- "model_count": 2
2017
- },
2018
- {
2019
- "developer": "open-neo",
2020
- "model_count": 2
2021
- },
2022
- {
2023
- "developer": "Open-Orca",
2024
- "model_count": 1
2025
- },
2026
- {
2027
- "developer": "open-thoughts",
2028
- "model_count": 1
2029
- },
2030
- {
2031
- "developer": "OpenAI",
2032
- "model_count": 75
2033
- },
2034
- {
2035
- "developer": "openai-community",
2036
- "model_count": 4
2037
- },
2038
- {
2039
- "developer": "OpenAssistant",
2040
- "model_count": 4
2041
- },
2042
- {
2043
- "developer": "openbmb",
2044
- "model_count": 7
2045
- },
2046
- {
2047
- "developer": "OpenBuddy",
2048
- "model_count": 22
2049
- },
2050
- {
2051
- "developer": "openchat",
2052
- "model_count": 6
2053
- },
2054
- {
2055
- "developer": "opencompass",
2056
- "model_count": 4
2057
- },
2058
- {
2059
- "developer": "OpenGenerativeAI",
2060
- "model_count": 2
2061
- },
2062
- {
2063
- "developer": "OpenLeecher",
2064
- "model_count": 1
2065
- },
2066
- {
2067
- "developer": "OpenLLM-France",
2068
- "model_count": 4
2069
- },
2070
- {
2071
- "developer": "OpenScholar",
2072
- "model_count": 1
2073
- },
2074
- {
2075
- "developer": "orai-nlp",
2076
- "model_count": 1
2077
- },
2078
- {
2079
- "developer": "Orenguteng",
2080
- "model_count": 2
2081
- },
2082
- {
2083
- "developer": "Orion-zhen",
2084
- "model_count": 2
2085
- },
2086
- {
2087
- "developer": "oxyapi",
2088
- "model_count": 1
2089
- },
2090
- {
2091
- "developer": "ozone-ai",
2092
- "model_count": 1
2093
- },
2094
- {
2095
- "developer": "ozone-research",
2096
- "model_count": 1
2097
- },
2098
- {
2099
- "developer": "P0x0",
2100
- "model_count": 1
2101
- },
2102
- {
2103
- "developer": "paloalma",
2104
- "model_count": 5
2105
- },
2106
- {
2107
- "developer": "pankajmathur",
2108
- "model_count": 29
2109
- },
2110
- {
2111
- "developer": "Parissa3",
2112
- "model_count": 1
2113
- },
2114
- {
2115
- "developer": "paulml",
2116
- "model_count": 1
2117
- },
2118
- {
2119
- "developer": "phronetic-ai",
2120
- "model_count": 1
2121
- },
2122
- {
2123
- "developer": "Pinkstack",
2124
- "model_count": 4
2125
- },
2126
- {
2127
- "developer": "pints-ai",
2128
- "model_count": 2
2129
- },
2130
- {
2131
- "developer": "piotr25691",
2132
- "model_count": 3
2133
- },
2134
- {
2135
- "developer": "PJMixers",
2136
- "model_count": 1
2137
- },
2138
- {
2139
- "developer": "PJMixers-Dev",
2140
- "model_count": 9
2141
- },
2142
- {
2143
- "developer": "PKU-Alignment",
2144
- "model_count": 4
2145
- },
2146
- {
2147
- "developer": "PocketDoc",
2148
- "model_count": 5
2149
- },
2150
- {
2151
- "developer": "PoLL",
2152
- "model_count": 1
2153
- },
2154
- {
2155
- "developer": "postbot",
2156
- "model_count": 1
2157
- },
2158
- {
2159
- "developer": "PowerInfer",
2160
- "model_count": 1
2161
- },
2162
- {
2163
- "developer": "PranavHarshan",
2164
- "model_count": 2
2165
- },
2166
- {
2167
- "developer": "Pretergeek",
2168
- "model_count": 9
2169
- },
2170
- {
2171
- "developer": "PrimeIntellect",
2172
- "model_count": 2
2173
- },
2174
- {
2175
- "developer": "prince-canuma",
2176
- "model_count": 1
2177
- },
2178
- {
2179
- "developer": "princeton-nlp",
2180
- "model_count": 51
2181
- },
2182
- {
2183
- "developer": "prithivMLmods",
2184
- "model_count": 110
2185
- },
2186
- {
2187
- "developer": "prometheus-eval",
2188
- "model_count": 2
2189
- },
2190
- {
2191
- "developer": "pszemraj",
2192
- "model_count": 2
2193
- },
2194
- {
2195
- "developer": "PuxAI",
2196
- "model_count": 1
2197
- },
2198
- {
2199
- "developer": "PygmalionAI",
2200
- "model_count": 1
2201
- },
2202
- {
2203
- "developer": "Q-bert",
2204
- "model_count": 1
2205
- },
2206
- {
2207
- "developer": "qingy2019",
2208
- "model_count": 7
2209
- },
2210
- {
2211
- "developer": "qingy2024",
2212
- "model_count": 17
2213
- },
2214
- {
2215
- "developer": "qq8933",
2216
- "model_count": 1
2217
- },
2218
- {
2219
- "developer": "Quazim0t0",
2220
- "model_count": 69
2221
- },
2222
- {
2223
- "developer": "Qwen",
2224
- "model_count": 80
2225
- },
2226
- {
2227
- "developer": "R-I-S-E",
2228
- "model_count": 2
2229
- },
2230
- {
2231
- "developer": "Rakuten",
2232
- "model_count": 3
2233
- },
2234
- {
2235
- "developer": "raphgg",
2236
- "model_count": 1
2237
- },
2238
- {
2239
- "developer": "rasyosef",
2240
- "model_count": 4
2241
- },
2242
- {
2243
- "developer": "Ray2333",
2244
- "model_count": 10
2245
- },
2246
- {
2247
- "developer": "RDson",
2248
- "model_count": 1
2249
- },
2250
- {
2251
- "developer": "realtreetune",
2252
- "model_count": 1
2253
- },
2254
- {
2255
- "developer": "recoilme",
2256
- "model_count": 6
2257
- },
2258
- {
2259
- "developer": "redrix",
2260
- "model_count": 2
2261
- },
2262
- {
2263
- "developer": "refuelai",
2264
- "model_count": 1
2265
- },
2266
- {
2267
- "developer": "Replete-AI",
2268
- "model_count": 9
2269
- },
2270
- {
2271
- "developer": "RESMPDEV",
2272
- "model_count": 2
2273
- },
2274
- {
2275
- "developer": "RezVortex",
2276
- "model_count": 2
2277
- },
2278
- {
2279
- "developer": "rhplus0831",
2280
- "model_count": 1
2281
- },
2282
- {
2283
- "developer": "rhymes-ai",
2284
- "model_count": 1
2285
- },
2286
- {
2287
- "developer": "rhysjones",
2288
- "model_count": 1
2289
- },
2290
- {
2291
- "developer": "riaz",
2292
- "model_count": 1
2293
- },
2294
- {
2295
- "developer": "RLHFlow",
2296
- "model_count": 4
2297
- },
2298
- {
2299
- "developer": "rmdhirr",
2300
- "model_count": 1
2301
- },
2302
- {
2303
- "developer": "Ro-xe",
2304
- "model_count": 4
2305
- },
2306
- {
2307
- "developer": "Rombo-Org",
2308
- "model_count": 1
2309
- },
2310
- {
2311
- "developer": "rombodawg",
2312
- "model_count": 14
2313
- },
2314
- {
2315
- "developer": "rootxhacker",
2316
- "model_count": 3
2317
- },
2318
- {
2319
- "developer": "rsh345",
2320
- "model_count": 1
2321
- },
2322
- {
2323
- "developer": "rubenroy",
2324
- "model_count": 3
2325
- },
2326
- {
2327
- "developer": "RubielLabarta",
2328
- "model_count": 1
2329
- },
2330
- {
2331
- "developer": "ruizhe1217",
2332
- "model_count": 1
2333
- },
2334
- {
2335
- "developer": "rwitz",
2336
- "model_count": 1
2337
- },
2338
- {
2339
- "developer": "RWKV",
2340
- "model_count": 1
2341
- },
2342
- {
2343
- "developer": "sabersaleh",
2344
- "model_count": 7
2345
- },
2346
- {
2347
- "developer": "sabersalehk",
2348
- "model_count": 4
2349
- },
2350
- {
2351
- "developer": "SaisExperiments",
2352
- "model_count": 6
2353
- },
2354
- {
2355
- "developer": "saishf",
2356
- "model_count": 2
2357
- },
2358
- {
2359
- "developer": "saishshinde15",
2360
- "model_count": 3
2361
- },
2362
- {
2363
- "developer": "sakaltcommunity",
2364
- "model_count": 2
2365
- },
2366
- {
2367
- "developer": "Sakalti",
2368
- "model_count": 66
2369
- },
2370
- {
2371
- "developer": "sakhan10",
2372
- "model_count": 1
2373
- },
2374
- {
2375
- "developer": "salesforce",
2376
- "model_count": 9
2377
- },
2378
- {
2379
- "developer": "saltlux",
2380
- "model_count": 2
2381
- },
2382
- {
2383
- "developer": "sam-paech",
2384
- "model_count": 3
2385
- },
2386
- {
2387
- "developer": "SanjiWatsuki",
2388
- "model_count": 2
2389
- },
2390
- {
2391
- "developer": "Sao10K",
2392
- "model_count": 8
2393
- },
2394
- {
2395
- "developer": "sarvamai",
2396
- "model_count": 1
2397
- },
2398
- {
2399
- "developer": "Saxo",
2400
- "model_count": 11
2401
- },
2402
- {
2403
- "developer": "schnapss",
2404
- "model_count": 1
2405
- },
2406
- {
2407
- "developer": "Schrieffer",
2408
- "model_count": 1
2409
- },
2410
- {
2411
- "developer": "sci-m-wang",
2412
- "model_count": 3
2413
- },
2414
- {
2415
- "developer": "SeaLLMs",
2416
- "model_count": 3
2417
- },
2418
- {
2419
- "developer": "securin",
2420
- "model_count": 1
2421
- },
2422
- {
2423
- "developer": "senseable",
2424
- "model_count": 1
2425
- },
2426
- {
2427
- "developer": "SenseLLM",
2428
- "model_count": 2
2429
- },
2430
- {
2431
- "developer": "SentientAGI",
2432
- "model_count": 2
2433
- },
2434
- {
2435
- "developer": "SeppeV",
2436
- "model_count": 1
2437
- },
2438
- {
2439
- "developer": "sequelbox",
2440
- "model_count": 6
2441
- },
2442
- {
2443
- "developer": "sethuiyer",
2444
- "model_count": 6
2445
- },
2446
- {
2447
- "developer": "SF-Foundation",
2448
- "model_count": 2
2449
- },
2450
- {
2451
- "developer": "sfairXC",
2452
- "model_count": 1
2453
- },
2454
- {
2455
- "developer": "shadowml",
2456
- "model_count": 2
2457
- },
2458
- {
2459
- "developer": "Sharathhebbar24",
2460
- "model_count": 2
2461
- },
2462
- {
2463
- "developer": "shastraai",
2464
- "model_count": 1
2465
- },
2466
- {
2467
- "developer": "ShikaiChen",
2468
- "model_count": 1
2469
- },
2470
- {
2471
- "developer": "shivam9980",
2472
- "model_count": 2
2473
- },
2474
- {
2475
- "developer": "shivank21",
2476
- "model_count": 1
2477
- },
2478
- {
2479
- "developer": "Shreyash2010",
2480
- "model_count": 1
2481
- },
2482
- {
2483
- "developer": "shuttleai",
2484
- "model_count": 1
2485
- },
2486
- {
2487
- "developer": "shyamieee",
2488
- "model_count": 1
2489
- },
2490
- {
2491
- "developer": "Sicarius-Prototyping",
2492
- "model_count": 3
2493
- },
2494
- {
2495
- "developer": "SicariusSicariiStuff",
2496
- "model_count": 19
2497
- },
2498
- {
2499
- "developer": "silma-ai",
2500
- "model_count": 2
2501
- },
2502
- {
2503
- "developer": "siqi00",
2504
- "model_count": 2
2505
- },
2506
- {
2507
- "developer": "skumar9",
2508
- "model_count": 1
2509
- },
2510
- {
2511
- "developer": "skymizer",
2512
- "model_count": 1
2513
- },
2514
- {
2515
- "developer": "SkyOrbis",
2516
- "model_count": 12
2517
- },
2518
- {
2519
- "developer": "Skywork",
2520
- "model_count": 15
2521
- },
2522
- {
2523
- "developer": "snowflake",
2524
- "model_count": 1
2525
- },
2526
- {
2527
- "developer": "Solshine",
2528
- "model_count": 2
2529
- },
2530
- {
2531
- "developer": "someon98",
2532
- "model_count": 1
2533
- },
2534
- {
2535
- "developer": "sometimesanotion",
2536
- "model_count": 58
2537
- },
2538
- {
2539
- "developer": "sonthenguyen",
2540
- "model_count": 6
2541
- },
2542
- {
2543
- "developer": "sophosympatheia",
2544
- "model_count": 1
2545
- },
2546
- {
2547
- "developer": "Sorawiz",
2548
- "model_count": 2
2549
- },
2550
- {
2551
- "developer": "Sourjayon",
2552
- "model_count": 2
2553
- },
2554
- {
2555
- "developer": "SpaceYL",
2556
- "model_count": 1
2557
- },
2558
- {
2559
- "developer": "speakleash",
2560
- "model_count": 5
2561
- },
2562
- {
2563
- "developer": "speakleash-ack-cyfronet-agh",
2564
- "model_count": 1
2565
- },
2566
- {
2567
- "developer": "Spestly",
2568
- "model_count": 3
2569
- },
2570
- {
2571
- "developer": "spmurrayzzz",
2572
- "model_count": 1
2573
- },
2574
- {
2575
- "developer": "spow12",
2576
- "model_count": 4
2577
- },
2578
- {
2579
- "developer": "ssmits",
2580
- "model_count": 1
2581
- },
2582
- {
2583
- "developer": "stabilityai",
2584
- "model_count": 9
2585
- },
2586
- {
2587
- "developer": "stanford",
2588
- "model_count": 1
2589
- },
2590
- {
2591
- "developer": "stanfordnlp",
2592
- "model_count": 2
2593
- },
2594
- {
2595
- "developer": "Stark2008",
2596
- "model_count": 3
2597
- },
2598
- {
2599
- "developer": "Steelskull",
2600
- "model_count": 2
2601
- },
2602
- {
2603
- "developer": "StelleX",
2604
- "model_count": 2
2605
- },
2606
- {
2607
- "developer": "sthenno",
2608
- "model_count": 9
2609
- },
2610
- {
2611
- "developer": "sthenno-com",
2612
- "model_count": 4
2613
- },
2614
- {
2615
- "developer": "streamerbtw1002",
2616
- "model_count": 1
2617
- },
2618
- {
2619
- "developer": "stupidity-ai",
2620
- "model_count": 1
2621
- },
2622
- {
2623
- "developer": "suayptalha",
2624
- "model_count": 12
2625
- },
2626
- {
2627
- "developer": "SultanR",
2628
- "model_count": 4
2629
- },
2630
- {
2631
- "developer": "sumink",
2632
- "model_count": 22
2633
- },
2634
- {
2635
- "developer": "sunbaby",
2636
- "model_count": 1
2637
- },
2638
- {
2639
- "developer": "Supichi",
2640
- "model_count": 11
2641
- },
2642
- {
2643
- "developer": "Svak",
2644
- "model_count": 2
2645
- },
2646
- {
2647
- "developer": "swap-uniba",
2648
- "model_count": 1
2649
- },
2650
- {
2651
- "developer": "Syed-Hasan-8503",
2652
- "model_count": 1
2653
- },
2654
- {
2655
- "developer": "synergetic",
2656
- "model_count": 1
2657
- },
2658
- {
2659
- "developer": "T145",
2660
- "model_count": 51
2661
- },
2662
- {
2663
- "developer": "talha2001",
2664
- "model_count": 1
2665
- },
2666
- {
2667
- "developer": "tangledgroup",
2668
- "model_count": 2
2669
- },
2670
- {
2671
- "developer": "tanliboy",
2672
- "model_count": 3
2673
- },
2674
- {
2675
- "developer": "tannedbum",
2676
- "model_count": 4
2677
- },
2678
- {
2679
- "developer": "Tarek07",
2680
- "model_count": 2
2681
- },
2682
- {
2683
- "developer": "TeeZee",
2684
- "model_count": 1
2685
- },
2686
- {
2687
- "developer": "teknium",
2688
- "model_count": 5
2689
- },
2690
- {
2691
- "developer": "Telugu-LLM-Labs",
2692
- "model_count": 2
2693
- },
2694
- {
2695
- "developer": "TencentARC",
2696
- "model_count": 4
2697
- },
2698
- {
2699
- "developer": "tensopolis",
2700
- "model_count": 15
2701
- },
2702
- {
2703
- "developer": "tensoropera",
2704
- "model_count": 1
2705
- },
2706
- {
2707
- "developer": "tenyx",
2708
- "model_count": 1
2709
- },
2710
- {
2711
- "developer": "TheDrummer",
2712
- "model_count": 9
2713
- },
2714
- {
2715
- "developer": "TheDrunkenSnail",
2716
- "model_count": 3
2717
- },
2718
- {
2719
- "developer": "TheHierophant",
2720
- "model_count": 1
2721
- },
2722
- {
2723
- "developer": "theo77186",
2724
- "model_count": 1
2725
- },
2726
- {
2727
- "developer": "theprint",
2728
- "model_count": 18
2729
- },
2730
- {
2731
- "developer": "TheTsar1209",
2732
- "model_count": 7
2733
- },
2734
- {
2735
- "developer": "thinkcoder",
2736
- "model_count": 1
2737
- },
2738
- {
2739
- "developer": "thirdeyeai",
2740
- "model_count": 1
2741
- },
2742
- {
2743
- "developer": "thomas-yanxin",
2744
- "model_count": 4
2745
- },
2746
- {
2747
- "developer": "THUDM",
2748
- "model_count": 5
2749
- },
2750
- {
2751
- "developer": "tianyil1",
2752
- "model_count": 1
2753
- },
2754
- {
2755
- "developer": "TIGER-Lab",
2756
- "model_count": 6
2757
- },
2758
- {
2759
- "developer": "tii-uae",
2760
- "model_count": 4
2761
- },
2762
- {
2763
- "developer": "tiiuae",
2764
- "model_count": 18
2765
- },
2766
- {
2767
- "developer": "Tijmen2",
2768
- "model_count": 1
2769
- },
2770
- {
2771
- "developer": "tinycompany",
2772
- "model_count": 15
2773
- },
2774
- {
2775
- "developer": "TinyLlama",
2776
- "model_count": 6
2777
- },
2778
- {
2779
- "developer": "tklohj",
2780
- "model_count": 1
2781
- },
2782
- {
2783
- "developer": "ToastyPigeon",
2784
- "model_count": 1
2785
- },
2786
- {
2787
- "developer": "together",
2788
- "model_count": 4
2789
- },
2790
- {
2791
- "developer": "togethercomputer",
2792
- "model_count": 10
2793
- },
2794
- {
2795
- "developer": "tokyotech-llm",
2796
- "model_count": 1
2797
- },
2798
- {
2799
- "developer": "tomasmcm",
2800
- "model_count": 1
2801
- },
2802
- {
2803
- "developer": "Trappu",
2804
- "model_count": 2
2805
- },
2806
- {
2807
- "developer": "Tremontaine",
2808
- "model_count": 1
2809
- },
2810
- {
2811
- "developer": "Triangle104",
2812
- "model_count": 61
2813
- },
2814
- {
2815
- "developer": "trthminh1112",
2816
- "model_count": 1
2817
- },
2818
- {
2819
- "developer": "Tsunami-th",
2820
- "model_count": 4
2821
- },
2822
- {
2823
- "developer": "TTTXXX01",
2824
- "model_count": 1
2825
- },
2826
- {
2827
- "developer": "tugstugi",
2828
- "model_count": 1
2829
- },
2830
- {
2831
- "developer": "UCLA-AGI",
2832
- "model_count": 10
2833
- },
2834
- {
2835
- "developer": "uiuc-oumi",
2836
- "model_count": 2
2837
- },
2838
- {
2839
- "developer": "UKzExecution",
2840
- "model_count": 1
2841
- },
2842
- {
2843
- "developer": "Unbabel",
2844
- "model_count": 1
2845
- },
2846
- {
2847
- "developer": "Undi95",
2848
- "model_count": 2
2849
- },
2850
- {
2851
- "developer": "universalml",
2852
- "model_count": 1
2853
- },
2854
- {
2855
- "developer": "unknown",
2856
- "model_count": 10
2857
- },
2858
- {
2859
- "developer": "unsloth",
2860
- "model_count": 6
2861
- },
2862
- {
2863
- "developer": "upstage",
2864
- "model_count": 4
2865
- },
2866
- {
2867
- "developer": "utkmst",
2868
- "model_count": 1
2869
- },
2870
- {
2871
- "developer": "uukuguy",
2872
- "model_count": 7
2873
- },
2874
- {
2875
- "developer": "v000000",
2876
- "model_count": 6
2877
- },
2878
- {
2879
- "developer": "V3N0M",
2880
- "model_count": 1
2881
- },
2882
- {
2883
- "developer": "VAGOsolutions",
2884
- "model_count": 17
2885
- },
2886
- {
2887
- "developer": "ValiantLabs",
2888
- "model_count": 11
2889
- },
2890
- {
2891
- "developer": "vhab10",
2892
- "model_count": 3
2893
- },
2894
- {
2895
- "developer": "vicgalle",
2896
- "model_count": 12
2897
- },
2898
- {
2899
- "developer": "viettelsecurity-ai",
2900
- "model_count": 1
2901
- },
2902
- {
2903
- "developer": "vihangd",
2904
- "model_count": 1
2905
- },
2906
- {
2907
- "developer": "Vikhrmodels",
2908
- "model_count": 2
2909
- },
2910
- {
2911
- "developer": "VIRNECT",
2912
- "model_count": 2
2913
- },
2914
- {
2915
- "developer": "voidful",
2916
- "model_count": 1
2917
- },
2918
- {
2919
- "developer": "vonjack",
2920
- "model_count": 7
2921
- },
2922
- {
2923
- "developer": "w4r10ck",
2924
- "model_count": 1
2925
- },
2926
- {
2927
- "developer": "wanlige",
2928
- "model_count": 3
2929
- },
2930
- {
2931
- "developer": "wannaphong",
2932
- "model_count": 1
2933
- },
2934
- {
2935
- "developer": "waqasali1707",
2936
- "model_count": 1
2937
- },
2938
- {
2939
- "developer": "wave-on-discord",
2940
- "model_count": 1
2941
- },
2942
- {
2943
- "developer": "weathermanj",
2944
- "model_count": 4
2945
- },
2946
- {
2947
- "developer": "wenbopan",
2948
- "model_count": 1
2949
- },
2950
- {
2951
- "developer": "weqweasdas",
2952
- "model_count": 5
2953
- },
2954
- {
2955
- "developer": "Weyaxi",
2956
- "model_count": 8
2957
- },
2958
- {
2959
- "developer": "win10",
2960
- "model_count": 9
2961
- },
2962
- {
2963
- "developer": "winglian",
2964
- "model_count": 2
2965
- },
2966
- {
2967
- "developer": "WizardLMTeam",
2968
- "model_count": 3
2969
- },
2970
- {
2971
- "developer": "Wladastic",
2972
- "model_count": 1
2973
- },
2974
- {
2975
- "developer": "writer",
2976
- "model_count": 8
2977
- },
2978
- {
2979
- "developer": "wzhouad",
2980
- "model_count": 1
2981
- },
2982
- {
2983
- "developer": "x0000001",
2984
- "model_count": 1
2985
- },
2986
- {
2987
- "developer": "xAI",
2988
- "model_count": 11
2989
- },
2990
- {
2991
- "developer": "Xclbr7",
2992
- "model_count": 4
2993
- },
2994
- {
2995
- "developer": "Xiaojian9992024",
2996
- "model_count": 12
2997
- },
2998
- {
2999
- "developer": "xinchen9",
3000
- "model_count": 5
3001
- },
3002
- {
3003
- "developer": "Xkev",
3004
- "model_count": 1
3005
- },
3006
- {
3007
- "developer": "xkp24",
3008
- "model_count": 8
3009
- },
3010
- {
3011
- "developer": "xMaulana",
3012
- "model_count": 1
3013
- },
3014
- {
3015
- "developer": "xukp20",
3016
- "model_count": 8
3017
- },
3018
- {
3019
- "developer": "xwen-team",
3020
- "model_count": 1
3021
- },
3022
- {
3023
- "developer": "xxx777xxxASD",
3024
- "model_count": 1
3025
- },
3026
- {
3027
- "developer": "yam-peleg",
3028
- "model_count": 3
3029
- },
3030
- {
3031
- "developer": "yandex",
3032
- "model_count": 1
3033
- },
3034
- {
3035
- "developer": "yanng1242",
3036
- "model_count": 1
3037
- },
3038
- {
3039
- "developer": "Yash21",
3040
- "model_count": 1
3041
- },
3042
- {
3043
- "developer": "yasserrmd",
3044
- "model_count": 2
3045
- },
3046
- {
3047
- "developer": "ycros",
3048
- "model_count": 1
3049
- },
3050
- {
3051
- "developer": "yfzp",
3052
- "model_count": 8
3053
- },
3054
- {
3055
- "developer": "yifAI",
3056
- "model_count": 1
3057
- },
3058
- {
3059
- "developer": "ylalain",
3060
- "model_count": 1
3061
- },
3062
- {
3063
- "developer": "ymcki",
3064
- "model_count": 11
3065
- },
3066
- {
3067
- "developer": "Youlln",
3068
- "model_count": 19
3069
- },
3070
- {
3071
- "developer": "YoungPanda",
3072
- "model_count": 1
3073
- },
3074
- {
3075
- "developer": "YOYO-AI",
3076
- "model_count": 21
3077
- },
3078
- {
3079
- "developer": "yuchenxie",
3080
- "model_count": 2
3081
- },
3082
- {
3083
- "developer": "Yuma42",
3084
- "model_count": 3
3085
- },
3086
- {
3087
- "developer": "yuvraj17",
3088
- "model_count": 3
3089
- },
3090
- {
3091
- "developer": "Z-AI",
3092
- "model_count": 2
3093
- },
3094
- {
3095
- "developer": "Z.ai",
3096
- "model_count": 2
3097
- },
3098
- {
3099
- "developer": "Z1-Coder",
3100
- "model_count": 1
3101
- },
3102
- {
3103
- "developer": "zai-org",
3104
- "model_count": 1
3105
- },
3106
- {
3107
- "developer": "zake7749",
3108
- "model_count": 2
3109
- },
3110
- {
3111
- "developer": "zelk12",
3112
- "model_count": 78
3113
- },
3114
- {
3115
- "developer": "ZeroXClem",
3116
- "model_count": 11
3117
- },
3118
- {
3119
- "developer": "zetasepic",
3120
- "model_count": 2
3121
- },
3122
- {
3123
- "developer": "ZeusLabs",
3124
- "model_count": 1
3125
- },
3126
- {
3127
- "developer": "ZhangShenao",
3128
- "model_count": 1
3129
- },
3130
- {
3131
- "developer": "zhengr",
3132
- "model_count": 1
3133
- },
3134
- {
3135
- "developer": "zhipu",
3136
- "model_count": 3
3137
- },
3138
- {
3139
- "developer": "zhipu-ai",
3140
- "model_count": 1
3141
- },
3142
- {
3143
- "developer": "ZHLiu627",
3144
- "model_count": 2
3145
- },
3146
- {
3147
- "developer": "ZiyiYe",
3148
- "model_count": 1
3149
- }
3150
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/0-hero.json DELETED
@@ -1,47 +0,0 @@
1
- {
2
- "developer": "0-hero",
3
- "models": [
4
- {
5
- "id": "0-hero/Matter-0.1-7B-boost-DPO-preview",
6
- "name": "0-hero/Matter-0.1-7B-boost-DPO-preview",
7
- "developer": "0-hero",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "reward-bench/Score": 0.7448,
11
- "reward-bench/Chat": 0.9106,
12
- "reward-bench/Chat Hard": 0.6096,
13
- "reward-bench/Safety": 0.7135,
14
- "reward-bench/Reasoning": 0.8395,
15
- "reward-bench/Prior Sets (0.5 weight)": 0.5566
16
- }
17
- },
18
- {
19
- "id": "0-hero/Matter-0.1-7B-DPO-preview",
20
- "name": "0-hero/Matter-0.1-7B-DPO-preview",
21
- "developer": "0-hero",
22
- "evaluator_relationship": null,
23
- "benchmark_scores": {
24
- "reward-bench/Score": 0.7247,
25
- "reward-bench/Chat": 0.8939,
26
- "reward-bench/Chat Hard": 0.5768,
27
- "reward-bench/Safety": 0.6378,
28
- "reward-bench/Reasoning": 0.8854,
29
- "reward-bench/Prior Sets (0.5 weight)": 0.5348
30
- }
31
- },
32
- {
33
- "id": "0-hero/Matter-0.2-7B-DPO",
34
- "name": "Matter-0.2-7B-DPO",
35
- "developer": "0-hero",
36
- "evaluator_relationship": null,
37
- "benchmark_scores": {
38
- "hfopenllm_v2/IFEval": 0.3303,
39
- "hfopenllm_v2/BBH": 0.3596,
40
- "hfopenllm_v2/MATH Level 5": 0.0144,
41
- "hfopenllm_v2/GPQA": 0.2592,
42
- "hfopenllm_v2/MUSR": 0.3814,
43
- "hfopenllm_v2/MMLU-PRO": 0.1164
44
- }
45
- }
46
- ]
47
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/01-ai.json DELETED
@@ -1,417 +0,0 @@
1
- {
2
- "developer": "01-ai",
3
- "models": [
4
- {
5
- "id": "01-ai/Yi-1.5-34B",
6
- "name": "Yi-1.5-34B",
7
- "developer": "01-ai",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.2841,
11
- "hfopenllm_v2/BBH": 0.5976,
12
- "hfopenllm_v2/MATH Level 5": 0.1533,
13
- "hfopenllm_v2/GPQA": 0.3658,
14
- "hfopenllm_v2/MUSR": 0.4236,
15
- "hfopenllm_v2/MMLU-PRO": 0.4666
16
- }
17
- },
18
- {
19
- "id": "01-ai/Yi-1.5-34B-32K",
20
- "name": "Yi-1.5-34B-32K",
21
- "developer": "01-ai",
22
- "evaluator_relationship": null,
23
- "benchmark_scores": {
24
- "hfopenllm_v2/IFEval": 0.3119,
25
- "hfopenllm_v2/BBH": 0.6016,
26
- "hfopenllm_v2/MATH Level 5": 0.1541,
27
- "hfopenllm_v2/GPQA": 0.3633,
28
- "hfopenllm_v2/MUSR": 0.4398,
29
- "hfopenllm_v2/MMLU-PRO": 0.4709
30
- }
31
- },
32
- {
33
- "id": "01-ai/Yi-1.5-34B-Chat",
34
- "name": "Yi-1.5-34B-Chat",
35
- "developer": "01-ai",
36
- "evaluator_relationship": null,
37
- "benchmark_scores": {
38
- "hfopenllm_v2/IFEval": 0.6067,
39
- "hfopenllm_v2/BBH": 0.6084,
40
- "hfopenllm_v2/MATH Level 5": 0.2772,
41
- "hfopenllm_v2/GPQA": 0.3649,
42
- "hfopenllm_v2/MUSR": 0.4282,
43
- "hfopenllm_v2/MMLU-PRO": 0.452
44
- }
45
- },
46
- {
47
- "id": "01-ai/Yi-1.5-34B-Chat-16K",
48
- "name": "Yi-1.5-34B-Chat-16K",
49
- "developer": "01-ai",
50
- "evaluator_relationship": null,
51
- "benchmark_scores": {
52
- "hfopenllm_v2/IFEval": 0.4564,
53
- "hfopenllm_v2/BBH": 0.61,
54
- "hfopenllm_v2/MATH Level 5": 0.2137,
55
- "hfopenllm_v2/GPQA": 0.3381,
56
- "hfopenllm_v2/MUSR": 0.4398,
57
- "hfopenllm_v2/MMLU-PRO": 0.4545
58
- }
59
- },
60
- {
61
- "id": "01-ai/Yi-1.5-6B",
62
- "name": "Yi-1.5-6B",
63
- "developer": "01-ai",
64
- "evaluator_relationship": null,
65
- "benchmark_scores": {
66
- "hfopenllm_v2/IFEval": 0.2617,
67
- "hfopenllm_v2/BBH": 0.4493,
68
- "hfopenllm_v2/MATH Level 5": 0.0665,
69
- "hfopenllm_v2/GPQA": 0.3138,
70
- "hfopenllm_v2/MUSR": 0.4374,
71
- "hfopenllm_v2/MMLU-PRO": 0.3144
72
- }
73
- },
74
- {
75
- "id": "01-ai/Yi-1.5-6B-Chat",
76
- "name": "Yi-1.5-6B-Chat",
77
- "developer": "01-ai",
78
- "evaluator_relationship": null,
79
- "benchmark_scores": {
80
- "hfopenllm_v2/IFEval": 0.5145,
81
- "hfopenllm_v2/BBH": 0.4571,
82
- "hfopenllm_v2/MATH Level 5": 0.1624,
83
- "hfopenllm_v2/GPQA": 0.302,
84
- "hfopenllm_v2/MUSR": 0.4392,
85
- "hfopenllm_v2/MMLU-PRO": 0.3193
86
- }
87
- },
88
- {
89
- "id": "01-ai/Yi-1.5-9B",
90
- "name": "Yi-1.5-9B",
91
- "developer": "01-ai",
92
- "evaluator_relationship": null,
93
- "benchmark_scores": {
94
- "hfopenllm_v2/IFEval": 0.2936,
95
- "hfopenllm_v2/BBH": 0.5143,
96
- "hfopenllm_v2/MATH Level 5": 0.114,
97
- "hfopenllm_v2/GPQA": 0.3792,
98
- "hfopenllm_v2/MUSR": 0.4328,
99
- "hfopenllm_v2/MMLU-PRO": 0.3916
100
- }
101
- },
102
- {
103
- "id": "01-ai/Yi-1.5-9B-32K",
104
- "name": "Yi-1.5-9B-32K",
105
- "developer": "01-ai",
106
- "evaluator_relationship": null,
107
- "benchmark_scores": {
108
- "hfopenllm_v2/IFEval": 0.2303,
109
- "hfopenllm_v2/BBH": 0.4963,
110
- "hfopenllm_v2/MATH Level 5": 0.108,
111
- "hfopenllm_v2/GPQA": 0.3591,
112
- "hfopenllm_v2/MUSR": 0.4186,
113
- "hfopenllm_v2/MMLU-PRO": 0.3765
114
- }
115
- },
116
- {
117
- "id": "01-ai/Yi-1.5-9B-Chat",
118
- "name": "Yi-1.5-9B-Chat",
119
- "developer": "01-ai",
120
- "evaluator_relationship": null,
121
- "benchmark_scores": {
122
- "hfopenllm_v2/IFEval": 0.6046,
123
- "hfopenllm_v2/BBH": 0.5559,
124
- "hfopenllm_v2/MATH Level 5": 0.2258,
125
- "hfopenllm_v2/GPQA": 0.3347,
126
- "hfopenllm_v2/MUSR": 0.4259,
127
- "hfopenllm_v2/MMLU-PRO": 0.3975
128
- }
129
- },
130
- {
131
- "id": "01-ai/Yi-1.5-9B-Chat-16K",
132
- "name": "Yi-1.5-9B-Chat-16K",
133
- "developer": "01-ai",
134
- "evaluator_relationship": null,
135
- "benchmark_scores": {
136
- "hfopenllm_v2/IFEval": 0.4214,
137
- "hfopenllm_v2/BBH": 0.5153,
138
- "hfopenllm_v2/MATH Level 5": 0.1782,
139
- "hfopenllm_v2/GPQA": 0.3087,
140
- "hfopenllm_v2/MUSR": 0.4099,
141
- "hfopenllm_v2/MMLU-PRO": 0.3994
142
- }
143
- },
144
- {
145
- "id": "01-ai/yi-34b",
146
- "name": "Yi 34B",
147
- "developer": "01-ai",
148
- "evaluator_relationship": null,
149
- "benchmark_scores": {
150
- "helm_lite/Mean win rate": 0.57,
151
- "helm_lite/NarrativeQA": 0.782,
152
- "helm_lite/NaturalQuestions (closed-book)": 0.443,
153
- "helm_lite/OpenbookQA": 0.92,
154
- "helm_lite/MMLU": 0.65,
155
- "helm_lite/MATH": 0.375,
156
- "helm_lite/GSM8K": 0.648,
157
- "helm_lite/LegalBench": 0.618,
158
- "helm_lite/MedQA": 0.656,
159
- "helm_lite/WMT 2014": 0.172,
160
- "helm_mmlu/MMLU All Subjects": 0.762,
161
- "helm_mmlu/Abstract Algebra": 0.4,
162
- "helm_mmlu/Anatomy": 0.748,
163
- "helm_mmlu/College Physics": 0.5,
164
- "helm_mmlu/Computer Security": 0.83,
165
- "helm_mmlu/Econometrics": 0.588,
166
- "helm_mmlu/Global Facts": 0.53,
167
- "helm_mmlu/Jurisprudence": 0.898,
168
- "helm_mmlu/Philosophy": 0.82,
169
- "helm_mmlu/Professional Psychology": 0.835,
170
- "helm_mmlu/Us Foreign Policy": 0.91,
171
- "helm_mmlu/Astronomy": 0.901,
172
- "helm_mmlu/Business Ethics": 0.75,
173
- "helm_mmlu/Clinical Knowledge": 0.8,
174
- "helm_mmlu/Conceptual Physics": 0.77,
175
- "helm_mmlu/Electrical Engineering": 0.779,
176
- "helm_mmlu/Elementary Mathematics": 0.656,
177
- "helm_mmlu/Formal Logic": 0.548,
178
- "helm_mmlu/High School World History": 0.907,
179
- "helm_mmlu/Human Sexuality": 0.87,
180
- "helm_mmlu/International Law": 0.909,
181
- "helm_mmlu/Logical Fallacies": 0.883,
182
- "helm_mmlu/Machine Learning": 0.58,
183
- "helm_mmlu/Management": 0.893,
184
- "helm_mmlu/Marketing": 0.936,
185
- "helm_mmlu/Medical Genetics": 0.87,
186
- "helm_mmlu/Miscellaneous": 0.902,
187
- "helm_mmlu/Moral Scenarios": 0.606,
188
- "helm_mmlu/Nutrition": 0.869,
189
- "helm_mmlu/Prehistory": 0.877,
190
- "helm_mmlu/Public Relations": 0.745,
191
- "helm_mmlu/Security Studies": 0.833,
192
- "helm_mmlu/Sociology": 0.9,
193
- "helm_mmlu/Virology": 0.572,
194
- "helm_mmlu/World Religions": 0.877,
195
- "helm_mmlu/Mean win rate": 0.315,
196
- "hfopenllm_v2/IFEval": 0.3046,
197
- "hfopenllm_v2/BBH": 0.5457,
198
- "hfopenllm_v2/MATH Level 5": 0.0514,
199
- "hfopenllm_v2/GPQA": 0.3666,
200
- "hfopenllm_v2/MUSR": 0.4119,
201
- "hfopenllm_v2/MMLU-PRO": 0.4412
202
- }
203
- },
204
- {
205
- "id": "01-ai/Yi-34B-200K",
206
- "name": "Yi-34B-200K",
207
- "developer": "01-ai",
208
- "evaluator_relationship": null,
209
- "benchmark_scores": {
210
- "hfopenllm_v2/IFEval": 0.1542,
211
- "hfopenllm_v2/BBH": 0.5442,
212
- "hfopenllm_v2/MATH Level 5": 0.0574,
213
- "hfopenllm_v2/GPQA": 0.3565,
214
- "hfopenllm_v2/MUSR": 0.3817,
215
- "hfopenllm_v2/MMLU-PRO": 0.4535
216
- }
217
- },
218
- {
219
- "id": "01-ai/Yi-34B-Chat",
220
- "name": "Yi-34B-Chat",
221
- "developer": "01-ai",
222
- "evaluator_relationship": null,
223
- "benchmark_scores": {
224
- "hfopenllm_v2/IFEval": 0.4699,
225
- "hfopenllm_v2/BBH": 0.5561,
226
- "hfopenllm_v2/MATH Level 5": 0.0627,
227
- "hfopenllm_v2/GPQA": 0.3381,
228
- "hfopenllm_v2/MUSR": 0.3978,
229
- "hfopenllm_v2/MMLU-PRO": 0.4093
230
- }
231
- },
232
- {
233
- "id": "01-ai/yi-6b",
234
- "name": "Yi 6B",
235
- "developer": "01-ai",
236
- "evaluator_relationship": null,
237
- "benchmark_scores": {
238
- "helm_lite/Mean win rate": 0.253,
239
- "helm_lite/NarrativeQA": 0.702,
240
- "helm_lite/NaturalQuestions (closed-book)": 0.31,
241
- "helm_lite/OpenbookQA": 0.8,
242
- "helm_lite/MMLU": 0.53,
243
- "helm_lite/MATH": 0.126,
244
- "helm_lite/GSM8K": 0.375,
245
- "helm_lite/LegalBench": 0.519,
246
- "helm_lite/MedQA": 0.497,
247
- "helm_lite/WMT 2014": 0.117,
248
- "helm_mmlu/MMLU All Subjects": 0.64,
249
- "helm_mmlu/Abstract Algebra": 0.3,
250
- "helm_mmlu/Anatomy": 0.6,
251
- "helm_mmlu/College Physics": 0.422,
252
- "helm_mmlu/Computer Security": 0.73,
253
- "helm_mmlu/Econometrics": 0.351,
254
- "helm_mmlu/Global Facts": 0.43,
255
- "helm_mmlu/Jurisprudence": 0.796,
256
- "helm_mmlu/Philosophy": 0.678,
257
- "helm_mmlu/Professional Psychology": 0.668,
258
- "helm_mmlu/Us Foreign Policy": 0.87,
259
- "helm_mmlu/Astronomy": 0.684,
260
- "helm_mmlu/Business Ethics": 0.67,
261
- "helm_mmlu/Clinical Knowledge": 0.66,
262
- "helm_mmlu/Conceptual Physics": 0.621,
263
- "helm_mmlu/Electrical Engineering": 0.662,
264
- "helm_mmlu/Elementary Mathematics": 0.452,
265
- "helm_mmlu/Formal Logic": 0.452,
266
- "helm_mmlu/High School World History": 0.785,
267
- "helm_mmlu/Human Sexuality": 0.763,
268
- "helm_mmlu/International Law": 0.769,
269
- "helm_mmlu/Logical Fallacies": 0.779,
270
- "helm_mmlu/Machine Learning": 0.411,
271
- "helm_mmlu/Management": 0.806,
272
- "helm_mmlu/Marketing": 0.893,
273
- "helm_mmlu/Medical Genetics": 0.77,
274
- "helm_mmlu/Miscellaneous": 0.796,
275
- "helm_mmlu/Moral Scenarios": 0.335,
276
- "helm_mmlu/Nutrition": 0.739,
277
- "helm_mmlu/Prehistory": 0.713,
278
- "helm_mmlu/Public Relations": 0.718,
279
- "helm_mmlu/Security Studies": 0.735,
280
- "helm_mmlu/Sociology": 0.831,
281
- "helm_mmlu/Virology": 0.452,
282
- "helm_mmlu/World Religions": 0.836,
283
- "helm_mmlu/Mean win rate": 0.651,
284
- "hfopenllm_v2/IFEval": 0.2893,
285
- "hfopenllm_v2/BBH": 0.4309,
286
- "hfopenllm_v2/MATH Level 5": 0.0159,
287
- "hfopenllm_v2/GPQA": 0.2693,
288
- "hfopenllm_v2/MUSR": 0.3937,
289
- "hfopenllm_v2/MMLU-PRO": 0.2991
290
- }
291
- },
292
- {
293
- "id": "01-ai/Yi-6B-200K",
294
- "name": "Yi-6B-200K",
295
- "developer": "01-ai",
296
- "evaluator_relationship": null,
297
- "benchmark_scores": {
298
- "hfopenllm_v2/IFEval": 0.0843,
299
- "hfopenllm_v2/BBH": 0.4289,
300
- "hfopenllm_v2/MATH Level 5": 0.0181,
301
- "hfopenllm_v2/GPQA": 0.2819,
302
- "hfopenllm_v2/MUSR": 0.4587,
303
- "hfopenllm_v2/MMLU-PRO": 0.2844
304
- }
305
- },
306
- {
307
- "id": "01-ai/Yi-6B-Chat",
308
- "name": "Yi-6B-Chat",
309
- "developer": "01-ai",
310
- "evaluator_relationship": null,
311
- "benchmark_scores": {
312
- "hfopenllm_v2/IFEval": 0.3395,
313
- "hfopenllm_v2/BBH": 0.4133,
314
- "hfopenllm_v2/MATH Level 5": 0.0136,
315
- "hfopenllm_v2/GPQA": 0.2945,
316
- "hfopenllm_v2/MUSR": 0.3688,
317
- "hfopenllm_v2/MMLU-PRO": 0.3061
318
- }
319
- },
320
- {
321
- "id": "01-ai/Yi-9B",
322
- "name": "Yi-9B",
323
- "developer": "01-ai",
324
- "evaluator_relationship": null,
325
- "benchmark_scores": {
326
- "hfopenllm_v2/IFEval": 0.2709,
327
- "hfopenllm_v2/BBH": 0.494,
328
- "hfopenllm_v2/MATH Level 5": 0.0559,
329
- "hfopenllm_v2/GPQA": 0.318,
330
- "hfopenllm_v2/MUSR": 0.4054,
331
- "hfopenllm_v2/MMLU-PRO": 0.3574
332
- }
333
- },
334
- {
335
- "id": "01-ai/Yi-9B-200K",
336
- "name": "Yi-9B-200K",
337
- "developer": "01-ai",
338
- "evaluator_relationship": null,
339
- "benchmark_scores": {
340
- "hfopenllm_v2/IFEval": 0.2327,
341
- "hfopenllm_v2/BBH": 0.4793,
342
- "hfopenllm_v2/MATH Level 5": 0.0665,
343
- "hfopenllm_v2/GPQA": 0.3154,
344
- "hfopenllm_v2/MUSR": 0.4294,
345
- "hfopenllm_v2/MMLU-PRO": 0.3622
346
- }
347
- },
348
- {
349
- "id": "01-ai/Yi-Coder-9B-Chat",
350
- "name": "Yi-Coder-9B-Chat",
351
- "developer": "01-ai",
352
- "evaluator_relationship": null,
353
- "benchmark_scores": {
354
- "hfopenllm_v2/IFEval": 0.4817,
355
- "hfopenllm_v2/BBH": 0.4814,
356
- "hfopenllm_v2/MATH Level 5": 0.04,
357
- "hfopenllm_v2/GPQA": 0.2475,
358
- "hfopenllm_v2/MUSR": 0.3992,
359
- "hfopenllm_v2/MMLU-PRO": 0.2425
360
- }
361
- },
362
- {
363
- "id": "01-ai/yi-large-preview",
364
- "name": "Yi Large Preview",
365
- "developer": "01-ai",
366
- "evaluator_relationship": null,
367
- "benchmark_scores": {
368
- "helm_lite/Mean win rate": 0.471,
369
- "helm_lite/NarrativeQA": 0.373,
370
- "helm_lite/NaturalQuestions (closed-book)": 0.428,
371
- "helm_lite/OpenbookQA": 0.946,
372
- "helm_lite/MMLU": 0.712,
373
- "helm_lite/MATH": 0.712,
374
- "helm_lite/GSM8K": 0.69,
375
- "helm_lite/LegalBench": 0.519,
376
- "helm_lite/MedQA": 0.66,
377
- "helm_lite/WMT 2014": 0.176,
378
- "helm_mmlu/MMLU All Subjects": 0.793,
379
- "helm_mmlu/Abstract Algebra": 0.6,
380
- "helm_mmlu/Anatomy": 0.83,
381
- "helm_mmlu/College Physics": 0.569,
382
- "helm_mmlu/Computer Security": 0.86,
383
- "helm_mmlu/Econometrics": 0.728,
384
- "helm_mmlu/Global Facts": 0.52,
385
- "helm_mmlu/Jurisprudence": 0.852,
386
- "helm_mmlu/Philosophy": 0.842,
387
- "helm_mmlu/Professional Psychology": 0.853,
388
- "helm_mmlu/Us Foreign Policy": 0.85,
389
- "helm_mmlu/Astronomy": 0.914,
390
- "helm_mmlu/Business Ethics": 0.8,
391
- "helm_mmlu/Clinical Knowledge": 0.857,
392
- "helm_mmlu/Conceptual Physics": 0.864,
393
- "helm_mmlu/Electrical Engineering": 0.779,
394
- "helm_mmlu/Elementary Mathematics": 0.685,
395
- "helm_mmlu/Formal Logic": 0.603,
396
- "helm_mmlu/High School World History": 0.928,
397
- "helm_mmlu/Human Sexuality": 0.901,
398
- "helm_mmlu/International Law": 0.917,
399
- "helm_mmlu/Logical Fallacies": 0.865,
400
- "helm_mmlu/Machine Learning": 0.616,
401
- "helm_mmlu/Management": 0.903,
402
- "helm_mmlu/Marketing": 0.927,
403
- "helm_mmlu/Medical Genetics": 0.83,
404
- "helm_mmlu/Miscellaneous": 0.916,
405
- "helm_mmlu/Moral Scenarios": 0.831,
406
- "helm_mmlu/Nutrition": 0.846,
407
- "helm_mmlu/Prehistory": 0.892,
408
- "helm_mmlu/Public Relations": 0.827,
409
- "helm_mmlu/Security Studies": 0.82,
410
- "helm_mmlu/Sociology": 0.881,
411
- "helm_mmlu/Virology": 0.59,
412
- "helm_mmlu/World Religions": 0.871,
413
- "helm_mmlu/Mean win rate": 0.258
414
- }
415
- }
416
- ]
417
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/1-800-llms.json DELETED
@@ -1,33 +0,0 @@
1
- {
2
- "developer": "1-800-LLMs",
3
- "models": [
4
- {
5
- "id": "1-800-LLMs/Qwen-2.5-14B-Hindi",
6
- "name": "Qwen-2.5-14B-Hindi",
7
- "developer": "1-800-LLMs",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.5826,
11
- "hfopenllm_v2/BBH": 0.6524,
12
- "hfopenllm_v2/MATH Level 5": 0.3331,
13
- "hfopenllm_v2/GPQA": 0.3624,
14
- "hfopenllm_v2/MUSR": 0.4489,
15
- "hfopenllm_v2/MMLU-PRO": 0.5263
16
- }
17
- },
18
- {
19
- "id": "1-800-LLMs/Qwen-2.5-14B-Hindi-Custom-Instruct",
20
- "name": "Qwen-2.5-14B-Hindi-Custom-Instruct",
21
- "developer": "1-800-LLMs",
22
- "evaluator_relationship": null,
23
- "benchmark_scores": {
24
- "hfopenllm_v2/IFEval": 0.3077,
25
- "hfopenllm_v2/BBH": 0.6284,
26
- "hfopenllm_v2/MATH Level 5": 0.3112,
27
- "hfopenllm_v2/GPQA": 0.37,
28
- "hfopenllm_v2/MUSR": 0.4491,
29
- "hfopenllm_v2/MMLU-PRO": 0.5164
30
- }
31
- }
32
- ]
33
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/1024m.json DELETED
@@ -1,33 +0,0 @@
1
- {
2
- "developer": "1024m",
3
- "models": [
4
- {
5
- "id": "1024m/PHI-4-Hindi",
6
- "name": "PHI-4-Hindi",
7
- "developer": "1024m",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.0082,
11
- "hfopenllm_v2/BBH": 0.671,
12
- "hfopenllm_v2/MATH Level 5": 0.2334,
13
- "hfopenllm_v2/GPQA": 0.3977,
14
- "hfopenllm_v2/MUSR": 0.4914,
15
- "hfopenllm_v2/MMLU-PRO": 0.5239
16
- }
17
- },
18
- {
19
- "id": "1024m/QWEN-14B-B100",
20
- "name": "QWEN-14B-B100",
21
- "developer": "1024m",
22
- "evaluator_relationship": null,
23
- "benchmark_scores": {
24
- "hfopenllm_v2/IFEval": 0.7762,
25
- "hfopenllm_v2/BBH": 0.6533,
26
- "hfopenllm_v2/MATH Level 5": 0.5438,
27
- "hfopenllm_v2/GPQA": 0.3507,
28
- "hfopenllm_v2/MUSR": 0.41,
29
- "hfopenllm_v2/MMLU-PRO": 0.5179
30
- }
31
- }
32
- ]
33
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/152334h.json DELETED
@@ -1,19 +0,0 @@
1
- {
2
- "developer": "152334H",
3
- "models": [
4
- {
5
- "id": "152334H/miqu-1-70b-sf",
6
- "name": "miqu-1-70b-sf",
7
- "developer": "152334H",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.5182,
11
- "hfopenllm_v2/BBH": 0.6102,
12
- "hfopenllm_v2/MATH Level 5": 0.1246,
13
- "hfopenllm_v2/GPQA": 0.3507,
14
- "hfopenllm_v2/MUSR": 0.4582,
15
- "hfopenllm_v2/MMLU-PRO": 0.4228
16
- }
17
- }
18
- ]
19
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/1tuanpham.json DELETED
@@ -1,33 +0,0 @@
1
- {
2
- "developer": "1TuanPham",
3
- "models": [
4
- {
5
- "id": "1TuanPham/T-VisStar-7B-v0.1",
6
- "name": "T-VisStar-7B-v0.1",
7
- "developer": "1TuanPham",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.3607,
11
- "hfopenllm_v2/BBH": 0.5052,
12
- "hfopenllm_v2/MATH Level 5": 0.0574,
13
- "hfopenllm_v2/GPQA": 0.2852,
14
- "hfopenllm_v2/MUSR": 0.4375,
15
- "hfopenllm_v2/MMLU-PRO": 0.3211
16
- }
17
- },
18
- {
19
- "id": "1TuanPham/T-VisStar-v0.1",
20
- "name": "T-VisStar-v0.1",
21
- "developer": "1TuanPham",
22
- "evaluator_relationship": null,
23
- "benchmark_scores": {
24
- "hfopenllm_v2/IFEval": 0.3607,
25
- "hfopenllm_v2/BBH": 0.5052,
26
- "hfopenllm_v2/MATH Level 5": 0.0574,
27
- "hfopenllm_v2/GPQA": 0.2852,
28
- "hfopenllm_v2/MUSR": 0.4375,
29
- "hfopenllm_v2/MMLU-PRO": 0.3211
30
- }
31
- }
32
- ]
33
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/3rd-degree-burn.json DELETED
@@ -1,61 +0,0 @@
1
- {
2
- "developer": "3rd-Degree-Burn",
3
- "models": [
4
- {
5
- "id": "3rd-Degree-Burn/L-3.1-Science-Writer-8B",
6
- "name": "L-3.1-Science-Writer-8B",
7
- "developer": "3rd-Degree-Burn",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.4263,
11
- "hfopenllm_v2/BBH": 0.5041,
12
- "hfopenllm_v2/MATH Level 5": 0.1035,
13
- "hfopenllm_v2/GPQA": 0.2743,
14
- "hfopenllm_v2/MUSR": 0.3959,
15
- "hfopenllm_v2/MMLU-PRO": 0.3649
16
- }
17
- },
18
- {
19
- "id": "3rd-Degree-Burn/Llama-3.1-8B-Squareroot",
20
- "name": "Llama-3.1-8B-Squareroot",
21
- "developer": "3rd-Degree-Burn",
22
- "evaluator_relationship": null,
23
- "benchmark_scores": {
24
- "hfopenllm_v2/IFEval": 0.2213,
25
- "hfopenllm_v2/BBH": 0.3461,
26
- "hfopenllm_v2/MATH Level 5": 0.2659,
27
- "hfopenllm_v2/GPQA": 0.2567,
28
- "hfopenllm_v2/MUSR": 0.3089,
29
- "hfopenllm_v2/MMLU-PRO": 0.175
30
- }
31
- },
32
- {
33
- "id": "3rd-Degree-Burn/Llama-3.1-8B-Squareroot-v1",
34
- "name": "Llama-3.1-8B-Squareroot-v1",
35
- "developer": "3rd-Degree-Burn",
36
- "evaluator_relationship": null,
37
- "benchmark_scores": {
38
- "hfopenllm_v2/IFEval": 0.2892,
39
- "hfopenllm_v2/BBH": 0.3343,
40
- "hfopenllm_v2/MATH Level 5": 0.0884,
41
- "hfopenllm_v2/GPQA": 0.2559,
42
- "hfopenllm_v2/MUSR": 0.3341,
43
- "hfopenllm_v2/MMLU-PRO": 0.1127
44
- }
45
- },
46
- {
47
- "id": "3rd-Degree-Burn/Llama-Squared-8B",
48
- "name": "Llama-Squared-8B",
49
- "developer": "3rd-Degree-Burn",
50
- "evaluator_relationship": null,
51
- "benchmark_scores": {
52
- "hfopenllm_v2/IFEval": 0.2755,
53
- "hfopenllm_v2/BBH": 0.4431,
54
- "hfopenllm_v2/MATH Level 5": 0.0574,
55
- "hfopenllm_v2/GPQA": 0.2718,
56
- "hfopenllm_v2/MUSR": 0.3089,
57
- "hfopenllm_v2/MMLU-PRO": 0.2366
58
- }
59
- }
60
- ]
61
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/4season.json DELETED
@@ -1,19 +0,0 @@
1
- {
2
- "developer": "4season",
3
- "models": [
4
- {
5
- "id": "4season/final_model_test_v2",
6
- "name": "final_model_test_v2",
7
- "developer": "4season",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.3191,
11
- "hfopenllm_v2/BBH": 0.6342,
12
- "hfopenllm_v2/MATH Level 5": 0.0838,
13
- "hfopenllm_v2/GPQA": 0.3272,
14
- "hfopenllm_v2/MUSR": 0.4314,
15
- "hfopenllm_v2/MMLU-PRO": 0.3528
16
- }
17
- }
18
- ]
19
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/aaditya.json DELETED
@@ -1,19 +0,0 @@
1
- {
2
- "developer": "aaditya",
3
- "models": [
4
- {
5
- "id": "aaditya/Llama3-OpenBioLLM-70B",
6
- "name": "Llama3-OpenBioLLM-70B",
7
- "developer": "aaditya",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.7597,
11
- "hfopenllm_v2/BBH": 0.6399,
12
- "hfopenllm_v2/MATH Level 5": 0.1971,
13
- "hfopenllm_v2/GPQA": 0.323,
14
- "hfopenllm_v2/MUSR": 0.4417,
15
- "hfopenllm_v2/MMLU-PRO": 0.4867
16
- }
17
- }
18
- ]
19
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/aalf.json DELETED
@@ -1,61 +0,0 @@
1
- {
2
- "developer": "AALF",
3
- "models": [
4
- {
5
- "id": "AALF/FuseChat-Llama-3.1-8B-Instruct-preview",
6
- "name": "FuseChat-Llama-3.1-8B-Instruct-preview",
7
- "developer": "AALF",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.719,
11
- "hfopenllm_v2/BBH": 0.512,
12
- "hfopenllm_v2/MATH Level 5": 0.2477,
13
- "hfopenllm_v2/GPQA": 0.3054,
14
- "hfopenllm_v2/MUSR": 0.382,
15
- "hfopenllm_v2/MMLU-PRO": 0.3733
16
- }
17
- },
18
- {
19
- "id": "AALF/FuseChat-Llama-3.1-8B-SFT-preview",
20
- "name": "FuseChat-Llama-3.1-8B-SFT-preview",
21
- "developer": "AALF",
22
- "evaluator_relationship": null,
23
- "benchmark_scores": {
24
- "hfopenllm_v2/IFEval": 0.7281,
25
- "hfopenllm_v2/BBH": 0.524,
26
- "hfopenllm_v2/MATH Level 5": 0.2251,
27
- "hfopenllm_v2/GPQA": 0.3045,
28
- "hfopenllm_v2/MUSR": 0.402,
29
- "hfopenllm_v2/MMLU-PRO": 0.3743
30
- }
31
- },
32
- {
33
- "id": "AALF/gemma-2-27b-it-SimPO-37K",
34
- "name": "gemma-2-27b-it-SimPO-37K",
35
- "developer": "AALF",
36
- "evaluator_relationship": null,
37
- "benchmark_scores": {
38
- "hfopenllm_v2/IFEval": 0.2407,
39
- "hfopenllm_v2/BBH": 0.3911,
40
- "hfopenllm_v2/MATH Level 5": 0.0128,
41
- "hfopenllm_v2/GPQA": 0.2802,
42
- "hfopenllm_v2/MUSR": 0.3488,
43
- "hfopenllm_v2/MMLU-PRO": 0.1971
44
- }
45
- },
46
- {
47
- "id": "AALF/gemma-2-27b-it-SimPO-37K-100steps",
48
- "name": "gemma-2-27b-it-SimPO-37K-100steps",
49
- "developer": "AALF",
50
- "evaluator_relationship": null,
51
- "benchmark_scores": {
52
- "hfopenllm_v2/IFEval": 0.2568,
53
- "hfopenllm_v2/BBH": 0.3931,
54
- "hfopenllm_v2/MATH Level 5": 0.0211,
55
- "hfopenllm_v2/GPQA": 0.2886,
56
- "hfopenllm_v2/MUSR": 0.3329,
57
- "hfopenllm_v2/MMLU-PRO": 0.2125
58
- }
59
- }
60
- ]
61
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/aashraf995.json DELETED
@@ -1,61 +0,0 @@
1
- {
2
- "developer": "Aashraf995",
3
- "models": [
4
- {
5
- "id": "Aashraf995/Creative-7B-nerd",
6
- "name": "Creative-7B-nerd",
7
- "developer": "Aashraf995",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.4722,
11
- "hfopenllm_v2/BBH": 0.5607,
12
- "hfopenllm_v2/MATH Level 5": 0.3165,
13
- "hfopenllm_v2/GPQA": 0.3263,
14
- "hfopenllm_v2/MUSR": 0.4515,
15
- "hfopenllm_v2/MMLU-PRO": 0.4492
16
- }
17
- },
18
- {
19
- "id": "Aashraf995/Gemma-Evo-10B",
20
- "name": "Gemma-Evo-10B",
21
- "developer": "Aashraf995",
22
- "evaluator_relationship": null,
23
- "benchmark_scores": {
24
- "hfopenllm_v2/IFEval": 0.7332,
25
- "hfopenllm_v2/BBH": 0.6044,
26
- "hfopenllm_v2/MATH Level 5": 0.2228,
27
- "hfopenllm_v2/GPQA": 0.354,
28
- "hfopenllm_v2/MUSR": 0.4595,
29
- "hfopenllm_v2/MMLU-PRO": 0.4275
30
- }
31
- },
32
- {
33
- "id": "Aashraf995/Qwen-Evo-7B",
34
- "name": "Qwen-Evo-7B",
35
- "developer": "Aashraf995",
36
- "evaluator_relationship": null,
37
- "benchmark_scores": {
38
- "hfopenllm_v2/IFEval": 0.4757,
39
- "hfopenllm_v2/BBH": 0.5709,
40
- "hfopenllm_v2/MATH Level 5": 0.3142,
41
- "hfopenllm_v2/GPQA": 0.3255,
42
- "hfopenllm_v2/MUSR": 0.4541,
43
- "hfopenllm_v2/MMLU-PRO": 0.4462
44
- }
45
- },
46
- {
47
- "id": "Aashraf995/QwenStock-14B",
48
- "name": "QwenStock-14B",
49
- "developer": "Aashraf995",
50
- "evaluator_relationship": null,
51
- "benchmark_scores": {
52
- "hfopenllm_v2/IFEval": 0.5009,
53
- "hfopenllm_v2/BBH": 0.655,
54
- "hfopenllm_v2/MATH Level 5": 0.3573,
55
- "hfopenllm_v2/GPQA": 0.3893,
56
- "hfopenllm_v2/MUSR": 0.4793,
57
- "hfopenllm_v2/MMLU-PRO": 0.5382
58
- }
59
- }
60
- ]
61
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/abacusai.json DELETED
@@ -1,145 +0,0 @@
1
- {
2
- "developer": "abacusai",
3
- "models": [
4
- {
5
- "id": "abacusai/bigstral-12b-32k",
6
- "name": "bigstral-12b-32k",
7
- "developer": "abacusai",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.4194,
11
- "hfopenllm_v2/BBH": 0.47,
12
- "hfopenllm_v2/MATH Level 5": 0.0151,
13
- "hfopenllm_v2/GPQA": 0.2928,
14
- "hfopenllm_v2/MUSR": 0.456,
15
- "hfopenllm_v2/MMLU-PRO": 0.2641
16
- }
17
- },
18
- {
19
- "id": "abacusai/bigyi-15b",
20
- "name": "bigyi-15b",
21
- "developer": "abacusai",
22
- "evaluator_relationship": null,
23
- "benchmark_scores": {
24
- "hfopenllm_v2/IFEval": 0.2094,
25
- "hfopenllm_v2/BBH": 0.4345,
26
- "hfopenllm_v2/MATH Level 5": 0.0295,
27
- "hfopenllm_v2/GPQA": 0.3096,
28
- "hfopenllm_v2/MUSR": 0.3538,
29
- "hfopenllm_v2/MMLU-PRO": 0.3003
30
- }
31
- },
32
- {
33
- "id": "abacusai/Dracarys-72B-Instruct",
34
- "name": "Dracarys-72B-Instruct",
35
- "developer": "abacusai",
36
- "evaluator_relationship": null,
37
- "benchmark_scores": {
38
- "hfopenllm_v2/IFEval": 0.7856,
39
- "hfopenllm_v2/BBH": 0.6944,
40
- "hfopenllm_v2/MATH Level 5": 0.3965,
41
- "hfopenllm_v2/GPQA": 0.3909,
42
- "hfopenllm_v2/MUSR": 0.4558,
43
- "hfopenllm_v2/MMLU-PRO": 0.5456
44
- }
45
- },
46
- {
47
- "id": "abacusai/Liberated-Qwen1.5-14B",
48
- "name": "Liberated-Qwen1.5-14B",
49
- "developer": "abacusai",
50
- "evaluator_relationship": null,
51
- "benchmark_scores": {
52
- "hfopenllm_v2/IFEval": 0.3631,
53
- "hfopenllm_v2/BBH": 0.4948,
54
- "hfopenllm_v2/MATH Level 5": 0.1601,
55
- "hfopenllm_v2/GPQA": 0.2836,
56
- "hfopenllm_v2/MUSR": 0.4175,
57
- "hfopenllm_v2/MMLU-PRO": 0.3512
58
- }
59
- },
60
- {
61
- "id": "abacusai/Llama-3-Smaug-8B",
62
- "name": "Llama-3-Smaug-8B",
63
- "developer": "abacusai",
64
- "evaluator_relationship": null,
65
- "benchmark_scores": {
66
- "hfopenllm_v2/IFEval": 0.4867,
67
- "hfopenllm_v2/BBH": 0.4931,
68
- "hfopenllm_v2/MATH Level 5": 0.0853,
69
- "hfopenllm_v2/GPQA": 0.2483,
70
- "hfopenllm_v2/MUSR": 0.3622,
71
- "hfopenllm_v2/MMLU-PRO": 0.3185
72
- }
73
- },
74
- {
75
- "id": "abacusai/Smaug-34B-v0.1",
76
- "name": "Smaug-34B-v0.1",
77
- "developer": "abacusai",
78
- "evaluator_relationship": null,
79
- "benchmark_scores": {
80
- "hfopenllm_v2/IFEval": 0.5016,
81
- "hfopenllm_v2/BBH": 0.5358,
82
- "hfopenllm_v2/MATH Level 5": 0.0718,
83
- "hfopenllm_v2/GPQA": 0.3297,
84
- "hfopenllm_v2/MUSR": 0.3979,
85
- "hfopenllm_v2/MMLU-PRO": 0.4543
86
- }
87
- },
88
- {
89
- "id": "abacusai/Smaug-72B-v0.1",
90
- "name": "Smaug-72B-v0.1",
91
- "developer": "abacusai",
92
- "evaluator_relationship": null,
93
- "benchmark_scores": {
94
- "hfopenllm_v2/IFEval": 0.5167,
95
- "hfopenllm_v2/BBH": 0.5996,
96
- "hfopenllm_v2/MATH Level 5": 0.1911,
97
- "hfopenllm_v2/GPQA": 0.3238,
98
- "hfopenllm_v2/MUSR": 0.4473,
99
- "hfopenllm_v2/MMLU-PRO": 0.4624
100
- }
101
- },
102
- {
103
- "id": "abacusai/Smaug-Llama-3-70B-Instruct-32K",
104
- "name": "Smaug-Llama-3-70B-Instruct-32K",
105
- "developer": "abacusai",
106
- "evaluator_relationship": null,
107
- "benchmark_scores": {
108
- "hfopenllm_v2/IFEval": 0.7761,
109
- "hfopenllm_v2/BBH": 0.6493,
110
- "hfopenllm_v2/MATH Level 5": 0.2749,
111
- "hfopenllm_v2/GPQA": 0.2961,
112
- "hfopenllm_v2/MUSR": 0.4208,
113
- "hfopenllm_v2/MMLU-PRO": 0.4765
114
- }
115
- },
116
- {
117
- "id": "abacusai/Smaug-Mixtral-v0.1",
118
- "name": "Smaug-Mixtral-v0.1",
119
- "developer": "abacusai",
120
- "evaluator_relationship": null,
121
- "benchmark_scores": {
122
- "hfopenllm_v2/IFEval": 0.5554,
123
- "hfopenllm_v2/BBH": 0.5162,
124
- "hfopenllm_v2/MATH Level 5": 0.0952,
125
- "hfopenllm_v2/GPQA": 0.3012,
126
- "hfopenllm_v2/MUSR": 0.4298,
127
- "hfopenllm_v2/MMLU-PRO": 0.3352
128
- }
129
- },
130
- {
131
- "id": "abacusai/Smaug-Qwen2-72B-Instruct",
132
- "name": "Smaug-Qwen2-72B-Instruct",
133
- "developer": "abacusai",
134
- "evaluator_relationship": null,
135
- "benchmark_scores": {
136
- "hfopenllm_v2/IFEval": 0.7825,
137
- "hfopenllm_v2/BBH": 0.691,
138
- "hfopenllm_v2/MATH Level 5": 0.4131,
139
- "hfopenllm_v2/GPQA": 0.3616,
140
- "hfopenllm_v2/MUSR": 0.4401,
141
- "hfopenllm_v2/MMLU-PRO": 0.519
142
- }
143
- }
144
- ]
145
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/abacusresearch.json DELETED
@@ -1,19 +0,0 @@
1
- {
2
- "developer": "AbacusResearch",
3
- "models": [
4
- {
5
- "id": "AbacusResearch/Jallabi-34B",
6
- "name": "Jallabi-34B",
7
- "developer": "AbacusResearch",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.3529,
11
- "hfopenllm_v2/BBH": 0.6023,
12
- "hfopenllm_v2/MATH Level 5": 0.0521,
13
- "hfopenllm_v2/GPQA": 0.3389,
14
- "hfopenllm_v2/MUSR": 0.4822,
15
- "hfopenllm_v2/MMLU-PRO": 0.4682
16
- }
17
- }
18
- ]
19
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/abhishek.json DELETED
@@ -1,75 +0,0 @@
1
- {
2
- "developer": "abhishek",
3
- "models": [
4
- {
5
- "id": "abhishek/autotrain-0tmgq-5tpbg",
6
- "name": "autotrain-0tmgq-5tpbg",
7
- "developer": "abhishek",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.1952,
11
- "hfopenllm_v2/BBH": 0.3127,
12
- "hfopenllm_v2/MATH Level 5": 0.0128,
13
- "hfopenllm_v2/GPQA": 0.2592,
14
- "hfopenllm_v2/MUSR": 0.3584,
15
- "hfopenllm_v2/MMLU-PRO": 0.1144
16
- }
17
- },
18
- {
19
- "id": "abhishek/autotrain-llama3-70b-orpo-v1",
20
- "name": "autotrain-llama3-70b-orpo-v1",
21
- "developer": "abhishek",
22
- "evaluator_relationship": null,
23
- "benchmark_scores": {
24
- "hfopenllm_v2/IFEval": 0.4233,
25
- "hfopenllm_v2/BBH": 0.5998,
26
- "hfopenllm_v2/MATH Level 5": 0.0106,
27
- "hfopenllm_v2/GPQA": 0.2441,
28
- "hfopenllm_v2/MUSR": 0.3579,
29
- "hfopenllm_v2/MMLU-PRO": 0.1122
30
- }
31
- },
32
- {
33
- "id": "abhishek/autotrain-llama3-70b-orpo-v2",
34
- "name": "autotrain-llama3-70b-orpo-v2",
35
- "developer": "abhishek",
36
- "evaluator_relationship": null,
37
- "benchmark_scores": {
38
- "hfopenllm_v2/IFEval": 0.5406,
39
- "hfopenllm_v2/BBH": 0.5899,
40
- "hfopenllm_v2/MATH Level 5": 0.2107,
41
- "hfopenllm_v2/GPQA": 0.2936,
42
- "hfopenllm_v2/MUSR": 0.4113,
43
- "hfopenllm_v2/MMLU-PRO": 0.4818
44
- }
45
- },
46
- {
47
- "id": "abhishek/autotrain-llama3-orpo-v2",
48
- "name": "autotrain-llama3-orpo-v2",
49
- "developer": "abhishek",
50
- "evaluator_relationship": null,
51
- "benchmark_scores": {
52
- "hfopenllm_v2/IFEval": 0.4372,
53
- "hfopenllm_v2/BBH": 0.3159,
54
- "hfopenllm_v2/MATH Level 5": 0.0468,
55
- "hfopenllm_v2/GPQA": 0.2668,
56
- "hfopenllm_v2/MUSR": 0.3792,
57
- "hfopenllm_v2/MMLU-PRO": 0.2218
58
- }
59
- },
60
- {
61
- "id": "abhishek/autotrain-vr4a1-e5mms",
62
- "name": "autotrain-vr4a1-e5mms",
63
- "developer": "abhishek",
64
- "evaluator_relationship": null,
65
- "benchmark_scores": {
66
- "hfopenllm_v2/IFEval": 0.2142,
67
- "hfopenllm_v2/BBH": 0.5001,
68
- "hfopenllm_v2/MATH Level 5": 0.1412,
69
- "hfopenllm_v2/GPQA": 0.3196,
70
- "hfopenllm_v2/MUSR": 0.3891,
71
- "hfopenllm_v2/MMLU-PRO": 0.3667
72
- }
73
- }
74
- ]
75
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/abideen.json DELETED
@@ -1,19 +0,0 @@
1
- {
2
- "developer": "abideen",
3
- "models": [
4
- {
5
- "id": "abideen/MedPhi-4-14B-v1",
6
- "name": "MedPhi-4-14B-v1",
7
- "developer": "abideen",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.6277,
11
- "hfopenllm_v2/BBH": 0.6897,
12
- "hfopenllm_v2/MATH Level 5": 0.2931,
13
- "hfopenllm_v2/GPQA": 0.344,
14
- "hfopenllm_v2/MUSR": 0.4155,
15
- "hfopenllm_v2/MMLU-PRO": 0.5338
16
- }
17
- }
18
- ]
19
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/adamo1139.json DELETED
@@ -1,19 +0,0 @@
1
- {
2
- "developer": "adamo1139",
3
- "models": [
4
- {
5
- "id": "adamo1139/Yi-34B-200K-AEZAKMI-v2",
6
- "name": "Yi-34B-200K-AEZAKMI-v2",
7
- "developer": "adamo1139",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.4555,
11
- "hfopenllm_v2/BBH": 0.5384,
12
- "hfopenllm_v2/MATH Level 5": 0.0566,
13
- "hfopenllm_v2/GPQA": 0.3322,
14
- "hfopenllm_v2/MUSR": 0.3886,
15
- "hfopenllm_v2/MMLU-PRO": 0.4513
16
- }
17
- }
18
- ]
19
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/adriszmar.json DELETED
@@ -1,19 +0,0 @@
1
- {
2
- "developer": "adriszmar",
3
- "models": [
4
- {
5
- "id": "adriszmar/QAIMath-Qwen2.5-7B-TIES",
6
- "name": "QAIMath-Qwen2.5-7B-TIES",
7
- "developer": "adriszmar",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.1746,
11
- "hfopenllm_v2/BBH": 0.3126,
12
- "hfopenllm_v2/MATH Level 5": 0.0,
13
- "hfopenllm_v2/GPQA": 0.245,
14
- "hfopenllm_v2/MUSR": 0.4096,
15
- "hfopenllm_v2/MMLU-PRO": 0.1087
16
- }
17
- }
18
- ]
19
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/aellm.json DELETED
@@ -1,33 +0,0 @@
1
- {
2
- "developer": "AELLM",
3
- "models": [
4
- {
5
- "id": "AELLM/gemma-2-aeria-infinity-9b",
6
- "name": "gemma-2-aeria-infinity-9b",
7
- "developer": "AELLM",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.7594,
11
- "hfopenllm_v2/BBH": 0.5983,
12
- "hfopenllm_v2/MATH Level 5": 0.2145,
13
- "hfopenllm_v2/GPQA": 0.3339,
14
- "hfopenllm_v2/MUSR": 0.402,
15
- "hfopenllm_v2/MMLU-PRO": 0.3862
16
- }
17
- },
18
- {
19
- "id": "AELLM/gemma-2-lyco-infinity-9b",
20
- "name": "gemma-2-lyco-infinity-9b",
21
- "developer": "AELLM",
22
- "evaluator_relationship": null,
23
- "benchmark_scores": {
24
- "hfopenllm_v2/IFEval": 0.7316,
25
- "hfopenllm_v2/BBH": 0.584,
26
- "hfopenllm_v2/MATH Level 5": 0.1707,
27
- "hfopenllm_v2/GPQA": 0.328,
28
- "hfopenllm_v2/MUSR": 0.4006,
29
- "hfopenllm_v2/MMLU-PRO": 0.3787
30
- }
31
- }
32
- ]
33
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/aevalone.json DELETED
@@ -1,19 +0,0 @@
1
- {
2
- "developer": "aevalone",
3
- "models": [
4
- {
5
- "id": "aevalone/distill_qw_test",
6
- "name": "distill_qw_test",
7
- "developer": "aevalone",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.7409,
11
- "hfopenllm_v2/BBH": 0.5246,
12
- "hfopenllm_v2/MATH Level 5": 0.4781,
13
- "hfopenllm_v2/GPQA": 0.3003,
14
- "hfopenllm_v2/MUSR": 0.386,
15
- "hfopenllm_v2/MMLU-PRO": 0.4092
16
- }
17
- }
18
- ]
19
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/agentlans.json DELETED
@@ -1,131 +0,0 @@
1
- {
2
- "developer": "agentlans",
3
- "models": [
4
- {
5
- "id": "agentlans/Gemma2-9B-AdvancedFuse",
6
- "name": "Gemma2-9B-AdvancedFuse",
7
- "developer": "agentlans",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.1543,
11
- "hfopenllm_v2/BBH": 0.5859,
12
- "hfopenllm_v2/MATH Level 5": 0.1005,
13
- "hfopenllm_v2/GPQA": 0.3347,
14
- "hfopenllm_v2/MUSR": 0.4231,
15
- "hfopenllm_v2/MMLU-PRO": 0.4
16
- }
17
- },
18
- {
19
- "id": "agentlans/Llama-3.2-1B-Instruct-CrashCourse12K",
20
- "name": "Llama-3.2-1B-Instruct-CrashCourse12K",
21
- "developer": "agentlans",
22
- "evaluator_relationship": null,
23
- "benchmark_scores": {
24
- "hfopenllm_v2/IFEval": 0.5395,
25
- "hfopenllm_v2/BBH": 0.3548,
26
- "hfopenllm_v2/MATH Level 5": 0.071,
27
- "hfopenllm_v2/GPQA": 0.2408,
28
- "hfopenllm_v2/MUSR": 0.321,
29
- "hfopenllm_v2/MMLU-PRO": 0.1809
30
- }
31
- },
32
- {
33
- "id": "agentlans/Llama3.1-8B-drill",
34
- "name": "Llama3.1-8B-drill",
35
- "developer": "agentlans",
36
- "evaluator_relationship": null,
37
- "benchmark_scores": {
38
- "hfopenllm_v2/IFEval": 0.7652,
39
- "hfopenllm_v2/BBH": 0.5016,
40
- "hfopenllm_v2/MATH Level 5": 0.1715,
41
- "hfopenllm_v2/GPQA": 0.2676,
42
- "hfopenllm_v2/MUSR": 0.3672,
43
- "hfopenllm_v2/MMLU-PRO": 0.3776
44
- }
45
- },
46
- {
47
- "id": "agentlans/Llama3.1-Daredevilish",
48
- "name": "Llama3.1-Daredevilish",
49
- "developer": "agentlans",
50
- "evaluator_relationship": null,
51
- "benchmark_scores": {
52
- "hfopenllm_v2/IFEval": 0.6292,
53
- "hfopenllm_v2/BBH": 0.5013,
54
- "hfopenllm_v2/MATH Level 5": 0.1292,
55
- "hfopenllm_v2/GPQA": 0.3012,
56
- "hfopenllm_v2/MUSR": 0.4091,
57
- "hfopenllm_v2/MMLU-PRO": 0.3697
58
- }
59
- },
60
- {
61
- "id": "agentlans/Llama3.1-Daredevilish-Instruct",
62
- "name": "Llama3.1-Daredevilish-Instruct",
63
- "developer": "agentlans",
64
- "evaluator_relationship": null,
65
- "benchmark_scores": {
66
- "hfopenllm_v2/IFEval": 0.7926,
67
- "hfopenllm_v2/BBH": 0.5235,
68
- "hfopenllm_v2/MATH Level 5": 0.1722,
69
- "hfopenllm_v2/GPQA": 0.307,
70
- "hfopenllm_v2/MUSR": 0.3911,
71
- "hfopenllm_v2/MMLU-PRO": 0.3877
72
- }
73
- },
74
- {
75
- "id": "agentlans/Llama3.1-LexiHermes-SuperStorm",
76
- "name": "Llama3.1-LexiHermes-SuperStorm",
77
- "developer": "agentlans",
78
- "evaluator_relationship": null,
79
- "benchmark_scores": {
80
- "hfopenllm_v2/IFEval": 0.7835,
81
- "hfopenllm_v2/BBH": 0.5266,
82
- "hfopenllm_v2/MATH Level 5": 0.1616,
83
- "hfopenllm_v2/GPQA": 0.323,
84
- "hfopenllm_v2/MUSR": 0.3963,
85
- "hfopenllm_v2/MMLU-PRO": 0.3844
86
- }
87
- },
88
- {
89
- "id": "agentlans/Llama3.1-SuperDeepFuse",
90
- "name": "Llama3.1-SuperDeepFuse",
91
- "developer": "agentlans",
92
- "evaluator_relationship": null,
93
- "benchmark_scores": {
94
- "hfopenllm_v2/IFEval": 0.7762,
95
- "hfopenllm_v2/BBH": 0.5049,
96
- "hfopenllm_v2/MATH Level 5": 0.1828,
97
- "hfopenllm_v2/GPQA": 0.2743,
98
- "hfopenllm_v2/MUSR": 0.3699,
99
- "hfopenllm_v2/MMLU-PRO": 0.3775
100
- }
101
- },
102
- {
103
- "id": "agentlans/Llama3.1-SuperDeepFuse-CrashCourse12K",
104
- "name": "Llama3.1-SuperDeepFuse-CrashCourse12K",
105
- "developer": "agentlans",
106
- "evaluator_relationship": null,
107
- "benchmark_scores": {
108
- "hfopenllm_v2/IFEval": 0.7187,
109
- "hfopenllm_v2/BBH": 0.5216,
110
- "hfopenllm_v2/MATH Level 5": 0.1805,
111
- "hfopenllm_v2/GPQA": 0.3129,
112
- "hfopenllm_v2/MUSR": 0.4026,
113
- "hfopenllm_v2/MMLU-PRO": 0.3631
114
- }
115
- },
116
- {
117
- "id": "agentlans/Qwen2.5-0.5B-Instruct-CrashCourse-dropout",
118
- "name": "Qwen2.5-0.5B-Instruct-CrashCourse-dropout",
119
- "developer": "agentlans",
120
- "evaluator_relationship": null,
121
- "benchmark_scores": {
122
- "hfopenllm_v2/IFEval": 0.2949,
123
- "hfopenllm_v2/BBH": 0.3312,
124
- "hfopenllm_v2/MATH Level 5": 0.0423,
125
- "hfopenllm_v2/GPQA": 0.2634,
126
- "hfopenllm_v2/MUSR": 0.3342,
127
- "hfopenllm_v2/MMLU-PRO": 0.1608
128
- }
129
- }
130
- ]
131
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/agi-0.json DELETED
@@ -1,47 +0,0 @@
1
- {
2
- "developer": "AGI-0",
3
- "models": [
4
- {
5
- "id": "AGI-0/Art-v0-3B",
6
- "name": "Art-v0-3B",
7
- "developer": "AGI-0",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.3192,
11
- "hfopenllm_v2/BBH": 0.3401,
12
- "hfopenllm_v2/MATH Level 5": 0.2462,
13
- "hfopenllm_v2/GPQA": 0.2592,
14
- "hfopenllm_v2/MUSR": 0.3768,
15
- "hfopenllm_v2/MMLU-PRO": 0.1179
16
- }
17
- },
18
- {
19
- "id": "AGI-0/Artificium-llama3.1-8B-001",
20
- "name": "Artificium-llama3.1-8B-001",
21
- "developer": "AGI-0",
22
- "evaluator_relationship": null,
23
- "benchmark_scores": {
24
- "hfopenllm_v2/IFEval": 0.5248,
25
- "hfopenllm_v2/BBH": 0.4256,
26
- "hfopenllm_v2/MATH Level 5": 0.136,
27
- "hfopenllm_v2/GPQA": 0.2659,
28
- "hfopenllm_v2/MUSR": 0.3795,
29
- "hfopenllm_v2/MMLU-PRO": 0.3182
30
- }
31
- },
32
- {
33
- "id": "AGI-0/smartllama3.1-8B-001",
34
- "name": "smartllama3.1-8B-001",
35
- "developer": "AGI-0",
36
- "evaluator_relationship": null,
37
- "benchmark_scores": {
38
- "hfopenllm_v2/IFEval": 0.3518,
39
- "hfopenllm_v2/BBH": 0.467,
40
- "hfopenllm_v2/MATH Level 5": 0.1299,
41
- "hfopenllm_v2/GPQA": 0.3062,
42
- "hfopenllm_v2/MUSR": 0.4386,
43
- "hfopenllm_v2/MMLU-PRO": 0.3487
44
- }
45
- }
46
- ]
47
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/ahdoot.json DELETED
@@ -1,33 +0,0 @@
1
- {
2
- "developer": "Ahdoot",
3
- "models": [
4
- {
5
- "id": "Ahdoot/StructuredThinker-v0.3-MoreStructure",
6
- "name": "StructuredThinker-v0.3-MoreStructure",
7
- "developer": "Ahdoot",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.4193,
11
- "hfopenllm_v2/BBH": 0.4838,
12
- "hfopenllm_v2/MATH Level 5": 0.2908,
13
- "hfopenllm_v2/GPQA": 0.297,
14
- "hfopenllm_v2/MUSR": 0.4158,
15
- "hfopenllm_v2/MMLU-PRO": 0.361
16
- }
17
- },
18
- {
19
- "id": "Ahdoot/Test_StealthThinker",
20
- "name": "Test_StealthThinker",
21
- "developer": "Ahdoot",
22
- "evaluator_relationship": null,
23
- "benchmark_scores": {
24
- "hfopenllm_v2/IFEval": 0.422,
25
- "hfopenllm_v2/BBH": 0.4647,
26
- "hfopenllm_v2/MATH Level 5": 0.179,
27
- "hfopenllm_v2/GPQA": 0.2961,
28
- "hfopenllm_v2/MUSR": 0.428,
29
- "hfopenllm_v2/MMLU-PRO": 0.3597
30
- }
31
- }
32
- ]
33
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/ahjeong.json DELETED
@@ -1,33 +0,0 @@
1
- {
2
- "developer": "Ahjeong",
3
- "models": [
4
- {
5
- "id": "Ahjeong/MMPO_Gemma_7b",
6
- "name": "Ahjeong/MMPO_Gemma_7b",
7
- "developer": "Ahjeong",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "reward-bench/Score": 0.7587,
11
- "reward-bench/Chat": 0.9693,
12
- "reward-bench/Chat Hard": 0.614,
13
- "reward-bench/Safety": 0.7135,
14
- "reward-bench/Reasoning": 0.7756,
15
- "reward-bench/Prior Sets (0.5 weight)": 0.6831
16
- }
17
- },
18
- {
19
- "id": "Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3",
20
- "name": "Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3",
21
- "developer": "Ahjeong",
22
- "evaluator_relationship": null,
23
- "benchmark_scores": {
24
- "reward-bench/Score": 0.7652,
25
- "reward-bench/Chat": 0.9721,
26
- "reward-bench/Chat Hard": 0.6338,
27
- "reward-bench/Safety": 0.7635,
28
- "reward-bench/Reasoning": 0.7284,
29
- "reward-bench/Prior Sets (0.5 weight)": 0.6913
30
- }
31
- }
32
- ]
33
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/ahmeda335.json DELETED
@@ -1,19 +0,0 @@
1
- {
2
- "developer": "ahmeda335",
3
- "models": [
4
- {
5
- "id": "ahmeda335/13_outOf_32_pruned_layers_llama3.1-8b",
6
- "name": "13_outOf_32_pruned_layers_llama3.1-8b",
7
- "developer": "ahmeda335",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.1748,
11
- "hfopenllm_v2/BBH": 0.2883,
12
- "hfopenllm_v2/MATH Level 5": 0.0,
13
- "hfopenllm_v2/GPQA": 0.2592,
14
- "hfopenllm_v2/MUSR": 0.3803,
15
- "hfopenllm_v2/MMLU-PRO": 0.1129
16
- }
17
- }
18
- ]
19
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/ai-mo.json DELETED
@@ -1,33 +0,0 @@
1
- {
2
- "developer": "AI-MO",
3
- "models": [
4
- {
5
- "id": "AI-MO/NuminaMath-7B-CoT",
6
- "name": "NuminaMath-7B-CoT",
7
- "developer": "AI-MO",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.2689,
11
- "hfopenllm_v2/BBH": 0.4314,
12
- "hfopenllm_v2/MATH Level 5": 0.2696,
13
- "hfopenllm_v2/GPQA": 0.2659,
14
- "hfopenllm_v2/MUSR": 0.3303,
15
- "hfopenllm_v2/MMLU-PRO": 0.2868
16
- }
17
- },
18
- {
19
- "id": "AI-MO/NuminaMath-7B-TIR",
20
- "name": "NuminaMath-7B-TIR",
21
- "developer": "AI-MO",
22
- "evaluator_relationship": null,
23
- "benchmark_scores": {
24
- "hfopenllm_v2/IFEval": 0.2756,
25
- "hfopenllm_v2/BBH": 0.4144,
26
- "hfopenllm_v2/MATH Level 5": 0.1609,
27
- "hfopenllm_v2/GPQA": 0.2584,
28
- "hfopenllm_v2/MUSR": 0.3509,
29
- "hfopenllm_v2/MMLU-PRO": 0.2733
30
- }
31
- }
32
- ]
33
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/ai-sweden-models.json DELETED
@@ -1,33 +0,0 @@
1
- {
2
- "developer": "AI-Sweden-Models",
3
- "models": [
4
- {
5
- "id": "AI-Sweden-Models/gpt-sw3-40b",
6
- "name": "gpt-sw3-40b",
7
- "developer": "AI-Sweden-Models",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.147,
11
- "hfopenllm_v2/BBH": 0.3268,
12
- "hfopenllm_v2/MATH Level 5": 0.0174,
13
- "hfopenllm_v2/GPQA": 0.2349,
14
- "hfopenllm_v2/MUSR": 0.3632,
15
- "hfopenllm_v2/MMLU-PRO": 0.1276
16
- }
17
- },
18
- {
19
- "id": "AI-Sweden-Models/Llama-3-8B-instruct",
20
- "name": "Llama-3-8B-instruct",
21
- "developer": "AI-Sweden-Models",
22
- "evaluator_relationship": null,
23
- "benchmark_scores": {
24
- "hfopenllm_v2/IFEval": 0.2401,
25
- "hfopenllm_v2/BBH": 0.4173,
26
- "hfopenllm_v2/MATH Level 5": 0.0385,
27
- "hfopenllm_v2/GPQA": 0.2659,
28
- "hfopenllm_v2/MUSR": 0.4771,
29
- "hfopenllm_v2/MMLU-PRO": 0.2597
30
- }
31
- }
32
- ]
33
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/ai2.json DELETED
@@ -1,89 +0,0 @@
1
- {
2
- "developer": "AI2",
3
- "models": [
4
- {
5
- "id": "ai2/llama-2-chat-7b-nectar-3.8m.json",
6
- "name": "ai2/llama-2-chat-7b-nectar-3.8m.json",
7
- "developer": "AI2",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "reward-bench/Score": 0.5843,
11
- "reward-bench/Chat": 0.8631,
12
- "reward-bench/Chat Hard": 0.2654,
13
- "reward-bench/Safety": 0.6243
14
- }
15
- },
16
- {
17
- "id": "ai2/llama-2-chat-nectar-180k.json",
18
- "name": "ai2/llama-2-chat-nectar-180k.json",
19
- "developer": "AI2",
20
- "evaluator_relationship": null,
21
- "benchmark_scores": {
22
- "reward-bench/Score": 0.5235,
23
- "reward-bench/Chat": 0.8827,
24
- "reward-bench/Chat Hard": 0.2851,
25
- "reward-bench/Safety": 0.4027
26
- }
27
- },
28
- {
29
- "id": "ai2/llama-2-chat-ultrafeedback-60k.jsonl",
30
- "name": "ai2/llama-2-chat-ultrafeedback-60k.jsonl",
31
- "developer": "AI2",
32
- "evaluator_relationship": null,
33
- "benchmark_scores": {
34
- "reward-bench/Score": 0.644,
35
- "reward-bench/Chat": 0.9441,
36
- "reward-bench/Chat Hard": 0.4539,
37
- "reward-bench/Safety": 0.5338
38
- }
39
- },
40
- {
41
- "id": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...",
42
- "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...",
43
- "developer": "AI2",
44
- "evaluator_relationship": null,
45
- "benchmark_scores": {
46
- "reward-bench/Score": 0.7008,
47
- "reward-bench/Chat": 0.9385,
48
- "reward-bench/Chat Hard": 0.3882,
49
- "reward-bench/Safety": 0.7757
50
- }
51
- },
52
- {
53
- "id": "ai2/tulu-2-7b-rm-v0-nectar-binarized-700k.json",
54
- "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-700k.json",
55
- "developer": "AI2",
56
- "evaluator_relationship": null,
57
- "benchmark_scores": {
58
- "reward-bench/Score": 0.7127,
59
- "reward-bench/Chat": 0.9358,
60
- "reward-bench/Chat Hard": 0.4079,
61
- "reward-bench/Safety": 0.7946
62
- }
63
- },
64
- {
65
- "id": "ai2/tulu-2-7b-rm-v0-nectar-binarized.json",
66
- "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized.json",
67
- "developer": "AI2",
68
- "evaluator_relationship": null,
69
- "benchmark_scores": {
70
- "reward-bench/Score": 0.6756,
71
- "reward-bench/Chat": 0.9134,
72
- "reward-bench/Chat Hard": 0.3904,
73
- "reward-bench/Safety": 0.723
74
- }
75
- },
76
- {
77
- "id": "ai2/tulu-2-7b-rm-v0.json",
78
- "name": "ai2/tulu-2-7b-rm-v0.json",
79
- "developer": "AI2",
80
- "evaluator_relationship": null,
81
- "benchmark_scores": {
82
- "reward-bench/Score": 0.6655,
83
- "reward-bench/Chat": 0.933,
84
- "reward-bench/Chat Hard": 0.4539,
85
- "reward-bench/Safety": 0.6095
86
- }
87
- }
88
- ]
89
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/ai21.json DELETED
@@ -1,364 +0,0 @@
1
- {
2
- "developer": "ai21",
3
- "models": [
4
- {
5
- "id": "ai21/J1-Grande-v1-17B",
6
- "name": "J1-Grande v1 17B",
7
- "developer": "ai21",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "helm_classic/Mean win rate": 0.433,
11
- "helm_classic/MMLU": 0.27,
12
- "helm_classic/BoolQ": 0.722,
13
- "helm_classic/NarrativeQA": 0.672,
14
- "helm_classic/NaturalQuestions (open-book)": 0.578,
15
- "helm_classic/QuAC": 0.362,
16
- "helm_classic/HellaSwag": 0.739,
17
- "helm_classic/OpenbookQA": 0.52,
18
- "helm_classic/TruthfulQA": 0.193,
19
- "helm_classic/MS MARCO (TREC)": 0.341,
20
- "helm_classic/CNN/DailyMail": 0.143,
21
- "helm_classic/XSUM": 0.122,
22
- "helm_classic/IMDB": 0.953,
23
- "helm_classic/CivilComments": 0.529,
24
- "helm_classic/RAFT": 0.658
25
- }
26
- },
27
- {
28
- "id": "ai21/J1-Grande-v2-beta-17B",
29
- "name": "J1-Grande v2 beta 17B",
30
- "developer": "ai21",
31
- "evaluator_relationship": null,
32
- "benchmark_scores": {
33
- "helm_classic/Mean win rate": 0.706,
34
- "helm_classic/MMLU": 0.445,
35
- "helm_classic/BoolQ": 0.812,
36
- "helm_classic/NarrativeQA": 0.725,
37
- "helm_classic/NaturalQuestions (open-book)": 0.625,
38
- "helm_classic/QuAC": 0.392,
39
- "helm_classic/HellaSwag": 0.764,
40
- "helm_classic/OpenbookQA": 0.56,
41
- "helm_classic/TruthfulQA": 0.306,
42
- "helm_classic/MS MARCO (TREC)": 0.46,
43
- "helm_classic/CNN/DailyMail": 0.146,
44
- "helm_classic/XSUM": 0.152,
45
- "helm_classic/IMDB": 0.957,
46
- "helm_classic/CivilComments": 0.546,
47
- "helm_classic/RAFT": 0.679
48
- }
49
- },
50
- {
51
- "id": "ai21/J1-Jumbo-v1-178B",
52
- "name": "J1-Jumbo v1 178B",
53
- "developer": "ai21",
54
- "evaluator_relationship": null,
55
- "benchmark_scores": {
56
- "helm_classic/Mean win rate": 0.517,
57
- "helm_classic/MMLU": 0.259,
58
- "helm_classic/BoolQ": 0.776,
59
- "helm_classic/NarrativeQA": 0.695,
60
- "helm_classic/NaturalQuestions (open-book)": 0.595,
61
- "helm_classic/QuAC": 0.358,
62
- "helm_classic/HellaSwag": 0.765,
63
- "helm_classic/OpenbookQA": 0.534,
64
- "helm_classic/TruthfulQA": 0.175,
65
- "helm_classic/MS MARCO (TREC)": 0.363,
66
- "helm_classic/CNN/DailyMail": 0.144,
67
- "helm_classic/XSUM": 0.129,
68
- "helm_classic/IMDB": 0.943,
69
- "helm_classic/CivilComments": 0.553,
70
- "helm_classic/RAFT": 0.681
71
- }
72
- },
73
- {
74
- "id": "ai21/J1-Large-v1-7.5B",
75
- "name": "J1-Large v1 7.5B",
76
- "developer": "ai21",
77
- "evaluator_relationship": null,
78
- "benchmark_scores": {
79
- "helm_classic/Mean win rate": 0.285,
80
- "helm_classic/MMLU": 0.241,
81
- "helm_classic/BoolQ": 0.683,
82
- "helm_classic/NarrativeQA": 0.623,
83
- "helm_classic/NaturalQuestions (open-book)": 0.532,
84
- "helm_classic/QuAC": 0.328,
85
- "helm_classic/HellaSwag": 0.7,
86
- "helm_classic/OpenbookQA": 0.514,
87
- "helm_classic/TruthfulQA": 0.197,
88
- "helm_classic/MS MARCO (TREC)": 0.292,
89
- "helm_classic/CNN/DailyMail": 0.134,
90
- "helm_classic/XSUM": 0.102,
91
- "helm_classic/IMDB": 0.956,
92
- "helm_classic/CivilComments": 0.532,
93
- "helm_classic/RAFT": 0.545
94
- }
95
- },
96
- {
97
- "id": "ai21/j2-grande",
98
- "name": "Jurassic-2 Grande 17B",
99
- "developer": "ai21",
100
- "evaluator_relationship": null,
101
- "benchmark_scores": {
102
- "helm_lite/Mean win rate": 0.172,
103
- "helm_lite/NarrativeQA": 0.744,
104
- "helm_lite/NaturalQuestions (closed-book)": 0.35,
105
- "helm_lite/OpenbookQA": 0.614,
106
- "helm_lite/MMLU": 0.471,
107
- "helm_lite/MATH": 0.064,
108
- "helm_lite/GSM8K": 0.159,
109
- "helm_lite/LegalBench": 0.468,
110
- "helm_lite/MedQA": 0.39,
111
- "helm_lite/WMT 2014": 0.102
112
- }
113
- },
114
- {
115
- "id": "ai21/j2-jumbo",
116
- "name": "Jurassic-2 Jumbo 178B",
117
- "developer": "ai21",
118
- "evaluator_relationship": null,
119
- "benchmark_scores": {
120
- "helm_lite/Mean win rate": 0.215,
121
- "helm_lite/NarrativeQA": 0.728,
122
- "helm_lite/NaturalQuestions (closed-book)": 0.385,
123
- "helm_lite/OpenbookQA": 0.688,
124
- "helm_lite/MMLU": 0.483,
125
- "helm_lite/MATH": 0.103,
126
- "helm_lite/GSM8K": 0.239,
127
- "helm_lite/LegalBench": 0.533,
128
- "helm_lite/MedQA": 0.431,
129
- "helm_lite/WMT 2014": 0.114
130
- }
131
- },
132
- {
133
- "id": "ai21/jamba-1.5-large",
134
- "name": "Jamba 1.5 Large",
135
- "developer": "ai21",
136
- "evaluator_relationship": null,
137
- "benchmark_scores": {
138
- "helm_lite/Mean win rate": 0.637,
139
- "helm_lite/NarrativeQA": 0.664,
140
- "helm_lite/NaturalQuestions (closed-book)": 0.394,
141
- "helm_lite/OpenbookQA": 0.948,
142
- "helm_lite/MMLU": 0.683,
143
- "helm_lite/MATH": 0.692,
144
- "helm_lite/GSM8K": 0.846,
145
- "helm_lite/LegalBench": 0.675,
146
- "helm_lite/MedQA": 0.698,
147
- "helm_lite/WMT 2014": 0.203,
148
- "helm_mmlu/MMLU All Subjects": 0.782,
149
- "helm_mmlu/Abstract Algebra": 0.53,
150
- "helm_mmlu/Anatomy": 0.793,
151
- "helm_mmlu/College Physics": 0.51,
152
- "helm_mmlu/Computer Security": 0.8,
153
- "helm_mmlu/Econometrics": 0.614,
154
- "helm_mmlu/Global Facts": 0.54,
155
- "helm_mmlu/Jurisprudence": 0.87,
156
- "helm_mmlu/Philosophy": 0.849,
157
- "helm_mmlu/Professional Psychology": 0.842,
158
- "helm_mmlu/Us Foreign Policy": 0.92,
159
- "helm_mmlu/Astronomy": 0.882,
160
- "helm_mmlu/Business Ethics": 0.77,
161
- "helm_mmlu/Clinical Knowledge": 0.849,
162
- "helm_mmlu/Conceptual Physics": 0.779,
163
- "helm_mmlu/Electrical Engineering": 0.793,
164
- "helm_mmlu/Elementary Mathematics": 0.656,
165
- "helm_mmlu/Formal Logic": 0.619,
166
- "helm_mmlu/High School World History": 0.911,
167
- "helm_mmlu/Human Sexuality": 0.832,
168
- "helm_mmlu/International Law": 0.884,
169
- "helm_mmlu/Logical Fallacies": 0.859,
170
- "helm_mmlu/Machine Learning": 0.688,
171
- "helm_mmlu/Management": 0.864,
172
- "helm_mmlu/Marketing": 0.94,
173
- "helm_mmlu/Medical Genetics": 0.89,
174
- "helm_mmlu/Miscellaneous": 0.931,
175
- "helm_mmlu/Moral Scenarios": 0.686,
176
- "helm_mmlu/Nutrition": 0.869,
177
- "helm_mmlu/Prehistory": 0.892,
178
- "helm_mmlu/Public Relations": 0.755,
179
- "helm_mmlu/Security Studies": 0.771,
180
- "helm_mmlu/Sociology": 0.93,
181
- "helm_mmlu/Virology": 0.554,
182
- "helm_mmlu/World Religions": 0.865,
183
- "helm_mmlu/Mean win rate": 0.147
184
- }
185
- },
186
- {
187
- "id": "ai21/jamba-1.5-mini",
188
- "name": "Jamba 1.5 Mini",
189
- "developer": "ai21",
190
- "evaluator_relationship": null,
191
- "benchmark_scores": {
192
- "helm_lite/Mean win rate": 0.414,
193
- "helm_lite/NarrativeQA": 0.746,
194
- "helm_lite/NaturalQuestions (closed-book)": 0.388,
195
- "helm_lite/OpenbookQA": 0.89,
196
- "helm_lite/MMLU": 0.582,
197
- "helm_lite/MATH": 0.318,
198
- "helm_lite/GSM8K": 0.691,
199
- "helm_lite/LegalBench": 0.503,
200
- "helm_lite/MedQA": 0.632,
201
- "helm_lite/WMT 2014": 0.179,
202
- "helm_mmlu/MMLU All Subjects": 0.699,
203
- "helm_mmlu/Abstract Algebra": 0.33,
204
- "helm_mmlu/Anatomy": 0.711,
205
- "helm_mmlu/College Physics": 0.48,
206
- "helm_mmlu/Computer Security": 0.73,
207
- "helm_mmlu/Econometrics": 0.491,
208
- "helm_mmlu/Global Facts": 0.43,
209
- "helm_mmlu/Jurisprudence": 0.88,
210
- "helm_mmlu/Philosophy": 0.752,
211
- "helm_mmlu/Professional Psychology": 0.76,
212
- "helm_mmlu/Us Foreign Policy": 0.9,
213
- "helm_mmlu/Astronomy": 0.822,
214
- "helm_mmlu/Business Ethics": 0.76,
215
- "helm_mmlu/Clinical Knowledge": 0.74,
216
- "helm_mmlu/Conceptual Physics": 0.677,
217
- "helm_mmlu/Electrical Engineering": 0.683,
218
- "helm_mmlu/Elementary Mathematics": 0.553,
219
- "helm_mmlu/Formal Logic": 0.452,
220
- "helm_mmlu/High School World History": 0.84,
221
- "helm_mmlu/Human Sexuality": 0.809,
222
- "helm_mmlu/International Law": 0.893,
223
- "helm_mmlu/Logical Fallacies": 0.81,
224
- "helm_mmlu/Machine Learning": 0.509,
225
- "helm_mmlu/Management": 0.825,
226
- "helm_mmlu/Marketing": 0.915,
227
- "helm_mmlu/Medical Genetics": 0.69,
228
- "helm_mmlu/Miscellaneous": 0.902,
229
- "helm_mmlu/Moral Scenarios": 0.269,
230
- "helm_mmlu/Nutrition": 0.801,
231
- "helm_mmlu/Prehistory": 0.824,
232
- "helm_mmlu/Public Relations": 0.727,
233
- "helm_mmlu/Security Studies": 0.755,
234
- "helm_mmlu/Sociology": 0.876,
235
- "helm_mmlu/Virology": 0.578,
236
- "helm_mmlu/World Religions": 0.842,
237
- "helm_mmlu/Mean win rate": 0.206
238
- }
239
- },
240
- {
241
- "id": "ai21/jamba-instruct",
242
- "name": "Jamba Instruct",
243
- "developer": "ai21",
244
- "evaluator_relationship": null,
245
- "benchmark_scores": {
246
- "helm_lite/Mean win rate": 0.287,
247
- "helm_lite/NarrativeQA": 0.658,
248
- "helm_lite/NaturalQuestions (closed-book)": 0.384,
249
- "helm_lite/OpenbookQA": 0.796,
250
- "helm_lite/MMLU": 0.582,
251
- "helm_lite/MATH": 0.38,
252
- "helm_lite/GSM8K": 0.67,
253
- "helm_lite/LegalBench": 0.54,
254
- "helm_lite/MedQA": 0.519,
255
- "helm_lite/WMT 2014": 0.164,
256
- "helm_mmlu/MMLU All Subjects": 0.659,
257
- "helm_mmlu/Abstract Algebra": 0.36,
258
- "helm_mmlu/Anatomy": 0.615,
259
- "helm_mmlu/College Physics": 0.422,
260
- "helm_mmlu/Computer Security": 0.76,
261
- "helm_mmlu/Econometrics": 0.439,
262
- "helm_mmlu/Global Facts": 0.4,
263
- "helm_mmlu/Jurisprudence": 0.796,
264
- "helm_mmlu/Philosophy": 0.749,
265
- "helm_mmlu/Professional Psychology": 0.716,
266
- "helm_mmlu/Us Foreign Policy": 0.91,
267
- "helm_mmlu/Astronomy": 0.73,
268
- "helm_mmlu/Business Ethics": 0.6,
269
- "helm_mmlu/Clinical Knowledge": 0.702,
270
- "helm_mmlu/Conceptual Physics": 0.677,
271
- "helm_mmlu/Electrical Engineering": 0.621,
272
- "helm_mmlu/Elementary Mathematics": 0.497,
273
- "helm_mmlu/Formal Logic": 0.444,
274
- "helm_mmlu/High School World History": 0.797,
275
- "helm_mmlu/Human Sexuality": 0.794,
276
- "helm_mmlu/International Law": 0.835,
277
- "helm_mmlu/Logical Fallacies": 0.706,
278
- "helm_mmlu/Machine Learning": 0.536,
279
- "helm_mmlu/Management": 0.786,
280
- "helm_mmlu/Marketing": 0.885,
281
- "helm_mmlu/Medical Genetics": 0.67,
282
- "helm_mmlu/Miscellaneous": 0.865,
283
- "helm_mmlu/Moral Scenarios": 0.465,
284
- "helm_mmlu/Nutrition": 0.745,
285
- "helm_mmlu/Prehistory": 0.796,
286
- "helm_mmlu/Public Relations": 0.682,
287
- "helm_mmlu/Security Studies": 0.743,
288
- "helm_mmlu/Sociology": 0.891,
289
- "helm_mmlu/Virology": 0.53,
290
- "helm_mmlu/World Religions": 0.813,
291
- "helm_mmlu/Mean win rate": 0.887
292
- }
293
- },
294
- {
295
- "id": "ai21/Jurassic-2-Grande-17B",
296
- "name": "Jurassic-2 Grande 17B",
297
- "developer": "ai21",
298
- "evaluator_relationship": null,
299
- "benchmark_scores": {
300
- "helm_classic/Mean win rate": 0.743,
301
- "helm_classic/MMLU": 0.475,
302
- "helm_classic/BoolQ": 0.826,
303
- "helm_classic/NarrativeQA": 0.737,
304
- "helm_classic/NaturalQuestions (open-book)": 0.639,
305
- "helm_classic/QuAC": 0.418,
306
- "helm_classic/HellaSwag": 0.781,
307
- "helm_classic/OpenbookQA": 0.542,
308
- "helm_classic/TruthfulQA": 0.348,
309
- "helm_classic/MS MARCO (TREC)": 0.514,
310
- "helm_classic/CNN/DailyMail": 0.144,
311
- "helm_classic/XSUM": 0.167,
312
- "helm_classic/IMDB": 0.938,
313
- "helm_classic/CivilComments": 0.547,
314
- "helm_classic/RAFT": 0.712
315
- }
316
- },
317
- {
318
- "id": "ai21/Jurassic-2-Jumbo-178B",
319
- "name": "Jurassic-2 Jumbo 178B",
320
- "developer": "ai21",
321
- "evaluator_relationship": null,
322
- "benchmark_scores": {
323
- "helm_classic/Mean win rate": 0.824,
324
- "helm_classic/MMLU": 0.48,
325
- "helm_classic/BoolQ": 0.829,
326
- "helm_classic/NarrativeQA": 0.733,
327
- "helm_classic/NaturalQuestions (open-book)": 0.669,
328
- "helm_classic/QuAC": 0.435,
329
- "helm_classic/HellaSwag": 0.788,
330
- "helm_classic/OpenbookQA": 0.558,
331
- "helm_classic/TruthfulQA": 0.437,
332
- "helm_classic/MS MARCO (TREC)": 0.661,
333
- "helm_classic/CNN/DailyMail": 0.149,
334
- "helm_classic/XSUM": 0.182,
335
- "helm_classic/IMDB": 0.938,
336
- "helm_classic/CivilComments": 0.57,
337
- "helm_classic/RAFT": 0.746
338
- }
339
- },
340
- {
341
- "id": "ai21/Jurassic-2-Large-7.5B",
342
- "name": "Jurassic-2 Large 7.5B",
343
- "developer": "ai21",
344
- "evaluator_relationship": null,
345
- "benchmark_scores": {
346
- "helm_classic/Mean win rate": 0.553,
347
- "helm_classic/MMLU": 0.339,
348
- "helm_classic/BoolQ": 0.742,
349
- "helm_classic/NarrativeQA": -1.0,
350
- "helm_classic/NaturalQuestions (open-book)": 0.589,
351
- "helm_classic/QuAC": -1.0,
352
- "helm_classic/HellaSwag": 0.729,
353
- "helm_classic/OpenbookQA": 0.53,
354
- "helm_classic/TruthfulQA": 0.245,
355
- "helm_classic/MS MARCO (TREC)": 0.464,
356
- "helm_classic/CNN/DailyMail": 0.136,
357
- "helm_classic/XSUM": 0.142,
358
- "helm_classic/IMDB": 0.956,
359
- "helm_classic/CivilComments": 0.57,
360
- "helm_classic/RAFT": 0.622
361
- }
362
- }
363
- ]
364
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/ai21labs.json DELETED
@@ -1,19 +0,0 @@
1
- {
2
- "developer": "ai21labs",
3
- "models": [
4
- {
5
- "id": "ai21labs/Jamba-v0.1",
6
- "name": "Jamba-v0.1",
7
- "developer": "ai21labs",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.2026,
11
- "hfopenllm_v2/BBH": 0.3602,
12
- "hfopenllm_v2/MATH Level 5": 0.0159,
13
- "hfopenllm_v2/GPQA": 0.2685,
14
- "hfopenllm_v2/MUSR": 0.359,
15
- "hfopenllm_v2/MMLU-PRO": 0.2492
16
- }
17
- }
18
- ]
19
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/ai4bharat.json DELETED
@@ -1,19 +0,0 @@
1
- {
2
- "developer": "ai4bharat",
3
- "models": [
4
- {
5
- "id": "ai4bharat/Airavata",
6
- "name": "Airavata",
7
- "developer": "ai4bharat",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.0559,
11
- "hfopenllm_v2/BBH": 0.3628,
12
- "hfopenllm_v2/MATH Level 5": 0.0181,
13
- "hfopenllm_v2/GPQA": 0.2743,
14
- "hfopenllm_v2/MUSR": 0.3763,
15
- "hfopenllm_v2/MMLU-PRO": 0.1635
16
- }
17
- }
18
- ]
19
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/ai4free.json DELETED
@@ -1,33 +0,0 @@
1
- {
2
- "developer": "AI4free",
3
- "models": [
4
- {
5
- "id": "AI4free/Dhanishtha",
6
- "name": "Dhanishtha",
7
- "developer": "AI4free",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.2451,
11
- "hfopenllm_v2/BBH": 0.3404,
12
- "hfopenllm_v2/MATH Level 5": 0.256,
13
- "hfopenllm_v2/GPQA": 0.2525,
14
- "hfopenllm_v2/MUSR": 0.3569,
15
- "hfopenllm_v2/MMLU-PRO": 0.1643
16
- }
17
- },
18
- {
19
- "id": "AI4free/t2",
20
- "name": "t2",
21
- "developer": "AI4free",
22
- "evaluator_relationship": null,
23
- "benchmark_scores": {
24
- "hfopenllm_v2/IFEval": 0.3867,
25
- "hfopenllm_v2/BBH": 0.291,
26
- "hfopenllm_v2/MATH Level 5": 0.1896,
27
- "hfopenllm_v2/GPQA": 0.2576,
28
- "hfopenllm_v2/MUSR": 0.3846,
29
- "hfopenllm_v2/MMLU-PRO": 0.1144
30
- }
31
- }
32
- ]
33
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/aicoressecurity.json DELETED
@@ -1,61 +0,0 @@
1
- {
2
- "developer": "AicoresSecurity",
3
- "models": [
4
- {
5
- "id": "AicoresSecurity/Cybernet-Sec-3B-R1-V0",
6
- "name": "Cybernet-Sec-3B-R1-V0",
7
- "developer": "AicoresSecurity",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.6358,
11
- "hfopenllm_v2/BBH": 0.4497,
12
- "hfopenllm_v2/MATH Level 5": 0.1156,
13
- "hfopenllm_v2/GPQA": 0.2634,
14
- "hfopenllm_v2/MUSR": 0.3314,
15
- "hfopenllm_v2/MMLU-PRO": 0.301
16
- }
17
- },
18
- {
19
- "id": "AicoresSecurity/Cybernet-Sec-3B-R1-V0-Coder",
20
- "name": "Cybernet-Sec-3B-R1-V0-Coder",
21
- "developer": "AicoresSecurity",
22
- "evaluator_relationship": null,
23
- "benchmark_scores": {
24
- "hfopenllm_v2/IFEval": 0.7098,
25
- "hfopenllm_v2/BBH": 0.4478,
26
- "hfopenllm_v2/MATH Level 5": 0.1488,
27
- "hfopenllm_v2/GPQA": 0.2718,
28
- "hfopenllm_v2/MUSR": 0.3408,
29
- "hfopenllm_v2/MMLU-PRO": 0.3178
30
- }
31
- },
32
- {
33
- "id": "AicoresSecurity/Cybernet-Sec-3B-R1-V1",
34
- "name": "Cybernet-Sec-3B-R1-V1",
35
- "developer": "AicoresSecurity",
36
- "evaluator_relationship": null,
37
- "benchmark_scores": {
38
- "hfopenllm_v2/IFEval": 0.6146,
39
- "hfopenllm_v2/BBH": 0.4282,
40
- "hfopenllm_v2/MATH Level 5": 0.1518,
41
- "hfopenllm_v2/GPQA": 0.2609,
42
- "hfopenllm_v2/MUSR": 0.3287,
43
- "hfopenllm_v2/MMLU-PRO": 0.2876
44
- }
45
- },
46
- {
47
- "id": "AicoresSecurity/Cybernet-Sec-3B-R1-V1.1",
48
- "name": "Cybernet-Sec-3B-R1-V1.1",
49
- "developer": "AicoresSecurity",
50
- "evaluator_relationship": null,
51
- "benchmark_scores": {
52
- "hfopenllm_v2/IFEval": 0.673,
53
- "hfopenllm_v2/BBH": 0.4392,
54
- "hfopenllm_v2/MATH Level 5": 0.176,
55
- "hfopenllm_v2/GPQA": 0.271,
56
- "hfopenllm_v2/MUSR": 0.3541,
57
- "hfopenllm_v2/MMLU-PRO": 0.3088
58
- }
59
- }
60
- ]
61
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
data/developers/aidc-ai.json DELETED
@@ -1,19 +0,0 @@
1
- {
2
- "developer": "AIDC-AI",
3
- "models": [
4
- {
5
- "id": "AIDC-AI/Marco-o1",
6
- "name": "Marco-o1",
7
- "developer": "AIDC-AI",
8
- "evaluator_relationship": null,
9
- "benchmark_scores": {
10
- "hfopenllm_v2/IFEval": 0.4771,
11
- "hfopenllm_v2/BBH": 0.5364,
12
- "hfopenllm_v2/MATH Level 5": 0.3746,
13
- "hfopenllm_v2/GPQA": 0.2592,
14
- "hfopenllm_v2/MUSR": 0.4138,
15
- "hfopenllm_v2/MMLU-PRO": 0.4117
16
- }
17
- }
18
- ]
19
- }