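# NOTE: the next four lines are web page residue captured with this file; they are not part of the module.
# bbkdevops's picture
# download
# raw
# 5.78 kB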
"""Universal intelligence benchmark dossier for TinyMind."""
from __future__ import annotations
from datetime import datetime, timezone
import json
from pathlib import Path
from typing import Iterable
# Benchmark catalogue used to build the dossier. Every entry carries the same
# five string fields, in this order:
#   name      - human-readable benchmark title
#   dimension - capability axis the benchmark is filed under
#   mode      - how the benchmark is expected to be run
#   status    - readiness tag; its prefix ("requires", "local", "ready") is
#               what the dossier builder inspects
#   source    - URL or repository-relative path associated with the benchmark
BENCHMARKS = [
    dict(
        name="GPQA Diamond",
        dimension="graduate_science_reasoning",
        mode="official_or_harness",
        status="requires_dataset_or_provider_eval",
        source="https://arxiv.org/abs/2311.12022",
    ),
    dict(
        name="MMLU-Pro",
        dimension="broad_reasoning_knowledge",
        mode="official_or_lighteval",
        status="requires_dataset_or_provider_eval",
        source="https://arxiv.org/abs/2406.01574",
    ),
    dict(
        name="FrontierMath",
        dimension="frontier_math",
        mode="restricted_official",
        status="requires_official_access",
        source="https://epoch.ai/benchmarks/frontiermath/",
    ),
    dict(
        name="Humanity's Last Exam",
        dimension="frontier_multidomain",
        mode="official_or_public_subset",
        status="requires_dataset_or_provider_eval",
        source="https://huggingface.co/datasets/cais/hle",
    ),
    dict(
        name="SWE-bench Pro",
        dimension="real_world_software_engineering",
        mode="docker_agent_eval",
        status="requires_linux_docker_harness",
        source="https://openai.com/index/why-we-no-longer-evaluate-swe-bench-verified/",
    ),
    dict(
        name="OpenAI-compatible tool/agent eval",
        dimension="agent_tool_use",
        mode="local_and_external_endpoint",
        status="ready_with_v1_chat_completions",
        source="serve/api.py:/v1/chat/completions",
    ),
    dict(
        name="TinyMind ReGenesis 10M exact memory",
        dimension="long_context_exact_recall",
        mode="local_evidence",
        status="local_evidence_available",
        source="reports/regenesis_ledger_10m/regenesis_ledger_report.json",
    ),
    dict(
        name="Thai-English expert CEV holdout",
        dimension="multilingual_th_en",
        mode="local_ceved_holdout",
        status="local_dataset_available",
        source="reports/expert_curriculum_open_pure/expert_curriculum_eval.jsonl",
    ),
    dict(
        name="AXON intelligence per bit",
        dimension="efficiency_size_latency",
        mode="local_artifact_measurement",
        status="local_evidence_available",
        source="reports/tinymind_axon_open_pure/tinymind_open_pure_knowledge.report.json",
    ),
    dict(
        name="Open Pure provenance and junk gate",
        dimension="purity_provenance",
        mode="local_manifest_hashes",
        status="local_evidence_available",
        source="reports/internet_update_official/internet_evidence_manifest.json",
    ),
]
def _path_exists(value: str) -> bool:
if value.startswith(("http://", "https://")):
return False
return Path(value).exists()
def build_universal_intelligence_dossier(
    out_dir: str | Path,
    model_id: str,
    evidence_paths: Iterable[str | Path] = (),
) -> dict:
    """Assemble the benchmark dossier and write it as JSON and Markdown.

    Args:
        out_dir: Directory that receives ``universal_intelligence_dossier.json``
            and ``universal_intelligence_dossier.md``; created (with parents)
            when missing.
        model_id: Identifier recorded in the dossier and the Markdown header.
        evidence_paths: Candidate evidence files. Only paths that exist on
            disk at call time are recorded.

    Returns:
        The dossier dictionary that was serialised, including the
        ``json_path`` and ``markdown_path`` of the two files written.
    """
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)

    # Copy every entry so the returned dossier never aliases the module-level
    # table: a caller annotating dossier["benchmarks"] must not alter
    # BENCHMARKS for later calls.
    benchmarks = [dict(item) for item in BENCHMARKS]

    evidence = [str(path) for path in evidence_paths if Path(path).exists()]
    dimensions = sorted({item["dimension"] for item in benchmarks})

    # Ready = status marks it local/ready, or its source is a path present on disk.
    local_ready = [
        item
        for item in benchmarks
        if item["status"].startswith(("local", "ready")) or _path_exists(item["source"])
    ]
    official_missing = [
        item["name"]
        for item in benchmarks
        if item["status"].startswith("requires")
    ]

    dossier = {
        "schema_version": "tinymind-universal-intelligence-dossier-v1",
        "model_id": model_id,
        "created_at": datetime.now(timezone.utc).isoformat(),
        "benchmarks": benchmarks,
        "evidence_paths": evidence,
        "coverage": {
            "dimensions": dimensions,
            "dimensions_total": len(dimensions),
            # Constant: the dimension list is derived from the benchmark table
            # itself, so this measures catalogue coverage, not a score.
            "dimension_coverage": 1.0,
            "local_or_endpoint_ready_count": len(local_ready),
            "official_required_count": len(official_missing),
        },
        "claim_gate": {
            "world_best_claim_allowed": False,
            "missing": ["official_external_rank1_results"] + [f"official:{name}" for name in official_missing],
            "rule": "A 100% coverage dossier is not a 100% score. World-best requires official external rank-1 evidence.",
        },
    }

    # Record both output locations before serialising so the JSON file
    # carries them too.
    json_path = out / "universal_intelligence_dossier.json"
    dossier["json_path"] = str(json_path)
    markdown_path = out / "universal_intelligence_dossier.md"
    dossier["markdown_path"] = str(markdown_path)

    json_path.write_text(json.dumps(dossier, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
    markdown_path.write_text(_markdown(dossier), encoding="utf-8")
    return dossier
def _markdown(dossier: dict) -> str:
lines = [
"# TinyMind Universal Intelligence Dossier",
"",
f"- Model: `{dossier['model_id']}`",
f"- Dimension coverage: {dossier['coverage']['dimension_coverage']:.0%}",
f"- Dimensions: {dossier['coverage']['dimensions_total']}",
f"- World-best claim allowed: {dossier['claim_gate']['world_best_claim_allowed']}",
"",
"| Benchmark | Dimension | Status | Source |",
"|---|---|---|---|",
]
for item in dossier["benchmarks"]:
lines.append(f"| {item['name']} | {item['dimension']} | {item['status']} | {item['source']} |")
lines.extend(["", "## Claim Gate", ""])
lines.extend(f"- {missing}" for missing in dossier["claim_gate"]["missing"])
lines.append("")
return "\n".join(lines)

# NOTE: the lines below are web page residue captured with this file; they are not part of the module.
# Xet Storage Details
#
# Size: 5.78 kB · Xet hash: 6d81aa6cf33de0e7bc935f54afe97e668bf3869c525dfa1e4ac7c768721d5cd5
#
# Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.