tinymind-native-8b-remote-handoff/bundle/evaluation/universal_intelligence_dossier.py

bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff / bundle / evaluation / universal_intelligence_dossier.py

download

raw

5.78 kB

	"""Universal intelligence benchmark dossier for TinyMind."""

	from __future__ import annotations

	from datetime import datetime, timezone
	import json
	from pathlib import Path
	from typing import Iterable


	# Benchmark registry for the dossier. Every entry carries the same five keys:
	#   name      - display name of the benchmark
	#   dimension - the capability dimension the benchmark is filed under
	#   mode      - how the benchmark is expected to be run
	#   status    - readiness tag; the builder in this module inspects its
	#               prefix ("requires", "local", "ready")
	#   source    - a URL or a repository-relative path
	BENCHMARKS = [
	    # --- entries whose status starts with "requires" ---
	    dict(
	        name="GPQA Diamond",
	        dimension="graduate_science_reasoning",
	        mode="official_or_harness",
	        status="requires_dataset_or_provider_eval",
	        source="https://arxiv.org/abs/2311.12022",
	    ),
	    dict(
	        name="MMLU-Pro",
	        dimension="broad_reasoning_knowledge",
	        mode="official_or_lighteval",
	        status="requires_dataset_or_provider_eval",
	        source="https://arxiv.org/abs/2406.01574",
	    ),
	    dict(
	        name="FrontierMath",
	        dimension="frontier_math",
	        mode="restricted_official",
	        status="requires_official_access",
	        source="https://epoch.ai/benchmarks/frontiermath/",
	    ),
	    dict(
	        name="Humanity's Last Exam",
	        dimension="frontier_multidomain",
	        mode="official_or_public_subset",
	        status="requires_dataset_or_provider_eval",
	        source="https://huggingface.co/datasets/cais/hle",
	    ),
	    dict(
	        name="SWE-bench Pro",
	        dimension="real_world_software_engineering",
	        mode="docker_agent_eval",
	        status="requires_linux_docker_harness",
	        source="https://openai.com/index/why-we-no-longer-evaluate-swe-bench-verified/",
	    ),
	    # --- entry whose status starts with "ready" ---
	    dict(
	        name="OpenAI-compatible tool/agent eval",
	        dimension="agent_tool_use",
	        mode="local_and_external_endpoint",
	        status="ready_with_v1_chat_completions",
	        source="serve/api.py:/v1/chat/completions",
	    ),
	    # --- entries whose status starts with "local" ---
	    dict(
	        name="TinyMind ReGenesis 10M exact memory",
	        dimension="long_context_exact_recall",
	        mode="local_evidence",
	        status="local_evidence_available",
	        source="reports/regenesis_ledger_10m/regenesis_ledger_report.json",
	    ),
	    dict(
	        name="Thai-English expert CEV holdout",
	        dimension="multilingual_th_en",
	        mode="local_ceved_holdout",
	        status="local_dataset_available",
	        source="reports/expert_curriculum_open_pure/expert_curriculum_eval.jsonl",
	    ),
	    dict(
	        name="AXON intelligence per bit",
	        dimension="efficiency_size_latency",
	        mode="local_artifact_measurement",
	        status="local_evidence_available",
	        source="reports/tinymind_axon_open_pure/tinymind_open_pure_knowledge.report.json",
	    ),
	    dict(
	        name="Open Pure provenance and junk gate",
	        dimension="purity_provenance",
	        mode="local_manifest_hashes",
	        status="local_evidence_available",
	        source="reports/internet_update_official/internet_evidence_manifest.json",
	    ),
	]


	def _path_exists(value: str) -> bool:
	if value.startswith(("http://", "https://")):
	return False
	return Path(value).exists()


	def build_universal_intelligence_dossier(
	    out_dir: str | Path,
	    model_id: str,
	    evidence_paths: Iterable[str | Path] = (),
	) -> dict:
	    """Build the benchmark dossier, write it to disk, and return it.

	    Two files are written into *out_dir* (created if missing):
	    ``universal_intelligence_dossier.json`` and
	    ``universal_intelligence_dossier.md``.

	    Args:
	        out_dir: Directory that receives the JSON and Markdown files.
	        model_id: Identifier recorded under ``"model_id"``.
	        evidence_paths: Candidate evidence files. Only paths that exist at
	            call time are kept. A single ``str``/``Path`` is accepted and
	            treated as a one-element collection.

	    Returns:
	        The dossier dict, including ``"json_path"`` and ``"markdown_path"``
	        entries pointing at the written files.
	    """
	    out = Path(out_dir)
	    out.mkdir(parents=True, exist_ok=True)

	    # A bare string is itself iterable; without this guard it would be
	    # walked character by character instead of being treated as one path.
	    if isinstance(evidence_paths, (str, Path)):
	        evidence_paths = [evidence_paths]
	    evidence = [str(path) for path in evidence_paths if Path(path).exists()]

	    # Copy each entry so callers mutating the returned dossier cannot alter
	    # the module-level BENCHMARKS table.
	    benchmarks = [dict(item) for item in BENCHMARKS]

	    dimensions = sorted({item["dimension"] for item in benchmarks})
	    local_ready = [
	        item
	        for item in benchmarks
	        if item["status"].startswith(("local", "ready")) or _path_exists(item["source"])
	    ]
	    official_missing = [
	        item["name"] for item in benchmarks if item["status"].startswith("requires")
	    ]

	    dossier = {
	        "schema_version": "tinymind-universal-intelligence-dossier-v1",
	        "model_id": model_id,
	        "created_at": datetime.now(timezone.utc).isoformat(),
	        "benchmarks": benchmarks,
	        "evidence_paths": evidence,
	        "coverage": {
	            "dimensions": dimensions,
	            "dimensions_total": len(dimensions),
	            # Fixed value: this reports coverage of the dimension list,
	            # not a benchmark score (see the claim-gate rule below).
	            "dimension_coverage": 1.0,
	            "local_or_endpoint_ready_count": len(local_ready),
	            "official_required_count": len(official_missing),
	        },
	        "claim_gate": {
	            # The gate is always closed here; nothing in this function can
	            # set it to True.
	            "world_best_claim_allowed": False,
	            "missing": ["official_external_rank1_results"]
	            + [f"official:{name}" for name in official_missing],
	            "rule": "A 100% coverage dossier is not a 100% score. World-best requires official external rank-1 evidence.",
	        },
	    }

	    # Record both output paths before serialising so that the JSON file and
	    # the returned dict carry the same keys.
	    json_path = out / "universal_intelligence_dossier.json"
	    markdown_path = out / "universal_intelligence_dossier.md"
	    dossier["json_path"] = str(json_path)
	    dossier["markdown_path"] = str(markdown_path)

	    json_path.write_text(
	        json.dumps(dossier, ensure_ascii=False, indent=2, sort_keys=True),
	        encoding="utf-8",
	    )
	    markdown_path.write_text(_markdown(dossier), encoding="utf-8")
	    return dossier


	def _markdown(dossier: dict) -> str:
	lines = [
	"# TinyMind Universal Intelligence Dossier",
	"",
	f"- Model: `{dossier['model_id']}`",
	f"- Dimension coverage: {dossier['coverage']['dimension_coverage']:.0%}",
	f"- Dimensions: {dossier['coverage']['dimensions_total']}",
	f"- World-best claim allowed: {dossier['claim_gate']['world_best_claim_allowed']}",
	"",
	"\| Benchmark \| Dimension \| Status \| Source \|",
	"\|---\|---\|---\|---\|",
	]
	for item in dossier["benchmarks"]:
	lines.append(f"\| {item['name']} \| {item['dimension']} \| {item['status']} \| {item['source']} \|")
	lines.extend(["", "## Claim Gate", ""])
	lines.extend(f"- {missing}" for missing in dossier["claim_gate"]["missing"])
	lines.append("")
	return "\n".join(lines)

Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

Xet Storage Details