Buckets:
bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /evaluation /universal_intelligence_dossier.py
| """Universal intelligence benchmark dossier for TinyMind.""" | |
| from __future__ import annotations | |
| from datetime import datetime, timezone | |
| import json | |
| from pathlib import Path | |
| from typing import Iterable | |
# Catalogue of benchmarks listed in the dossier. Every entry has the same five
# string keys:
#   name      - label rendered in the Markdown table and in claim-gate entries
#   dimension - capability axis; distinct values are counted for coverage
#   mode      - descriptive tag (not read by the functions shown in this module)
#   status    - readiness tag; build_universal_intelligence_dossier classifies
#               entries by its prefix ("requires", "local", "ready")
#   source    - URL or relative path associated with the benchmark or evidence
BENCHMARKS = [
    {
        "name": "GPQA Diamond",
        "dimension": "graduate_science_reasoning",
        "mode": "official_or_harness",
        "status": "requires_dataset_or_provider_eval",
        "source": "https://arxiv.org/abs/2311.12022",
    },
    {
        "name": "MMLU-Pro",
        "dimension": "broad_reasoning_knowledge",
        "mode": "official_or_lighteval",
        "status": "requires_dataset_or_provider_eval",
        "source": "https://arxiv.org/abs/2406.01574",
    },
    {
        "name": "FrontierMath",
        "dimension": "frontier_math",
        "mode": "restricted_official",
        "status": "requires_official_access",
        "source": "https://epoch.ai/benchmarks/frontiermath/",
    },
    {
        "name": "Humanity's Last Exam",
        "dimension": "frontier_multidomain",
        "mode": "official_or_public_subset",
        "status": "requires_dataset_or_provider_eval",
        "source": "https://huggingface.co/datasets/cais/hle",
    },
    {
        "name": "SWE-bench Pro",
        "dimension": "real_world_software_engineering",
        "mode": "docker_agent_eval",
        "status": "requires_linux_docker_harness",
        "source": "https://openai.com/index/why-we-no-longer-evaluate-swe-bench-verified/",
    },
    {
        "name": "OpenAI-compatible tool/agent eval",
        "dimension": "agent_tool_use",
        "mode": "local_and_external_endpoint",
        "status": "ready_with_v1_chat_completions",
        "source": "serve/api.py:/v1/chat/completions",
    },
    {
        "name": "TinyMind ReGenesis 10M exact memory",
        "dimension": "long_context_exact_recall",
        "mode": "local_evidence",
        "status": "local_evidence_available",
        "source": "reports/regenesis_ledger_10m/regenesis_ledger_report.json",
    },
    {
        "name": "Thai-English expert CEV holdout",
        "dimension": "multilingual_th_en",
        "mode": "local_ceved_holdout",
        "status": "local_dataset_available",
        "source": "reports/expert_curriculum_open_pure/expert_curriculum_eval.jsonl",
    },
    {
        "name": "AXON intelligence per bit",
        "dimension": "efficiency_size_latency",
        "mode": "local_artifact_measurement",
        "status": "local_evidence_available",
        "source": "reports/tinymind_axon_open_pure/tinymind_open_pure_knowledge.report.json",
    },
    {
        "name": "Open Pure provenance and junk gate",
        "dimension": "purity_provenance",
        "mode": "local_manifest_hashes",
        "status": "local_evidence_available",
        "source": "reports/internet_update_official/internet_evidence_manifest.json",
    },
]
def _path_exists(value: str) -> bool:
    """Return True when *value* names an existing local filesystem path.

    HTTP(S) URLs are never treated as local paths. An empty string is also
    rejected: ``Path("")`` normalises to the current directory, which would
    otherwise make a blank value look like existing evidence.
    """
    if not value or value.startswith(("http://", "https://")):
        return False
    return Path(value).exists()
def build_universal_intelligence_dossier(
    out_dir: str | Path,
    model_id: str,
    evidence_paths: Iterable[str | Path] = (),
) -> dict:
    """Build the benchmark dossier and write it as JSON and Markdown.

    Args:
        out_dir: Directory to write into; created (with parents) if missing.
        model_id: Identifier recorded in the dossier.
        evidence_paths: Candidate evidence paths. Only those that exist on
            disk at call time are recorded; the rest are dropped silently.

    Returns:
        The dossier dict. It includes ``json_path`` and ``markdown_path``
        entries naming the two files written under ``out_dir``.
    """
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)

    evidence = [str(path) for path in evidence_paths if Path(path).exists()]

    # Copy each entry so that callers mutating the returned dossier cannot
    # alter the module-level BENCHMARKS catalogue.
    benchmarks = [dict(item) for item in BENCHMARKS]

    dimensions = sorted({item["dimension"] for item in benchmarks})
    local_ready = [
        item
        for item in benchmarks
        if item["status"].startswith(("local", "ready")) or _path_exists(item["source"])
    ]
    official_missing = [
        item["name"]
        for item in benchmarks
        if item["status"].startswith("requires")
    ]

    dossier = {
        "schema_version": "tinymind-universal-intelligence-dossier-v1",
        "model_id": model_id,
        "created_at": datetime.now(timezone.utc).isoformat(),
        "benchmarks": benchmarks,
        "evidence_paths": evidence,
        "coverage": {
            "dimensions": dimensions,
            "dimensions_total": len(dimensions),
            # Fixed value, not computed from any results (see the claim_gate
            # rule text below).
            "dimension_coverage": 1.0,
            "local_or_endpoint_ready_count": len(local_ready),
            "official_required_count": len(official_missing),
        },
        "claim_gate": {
            "world_best_claim_allowed": False,
            "missing": ["official_external_rank1_results"] + [f"official:{name}" for name in official_missing],
            "rule": "A 100% coverage dossier is not a 100% score. World-best requires official external rank-1 evidence.",
        },
    }

    # Both output paths are recorded before serialising so that the JSON file
    # itself contains them.
    json_path = out / "universal_intelligence_dossier.json"
    dossier["json_path"] = str(json_path)
    markdown_path = out / "universal_intelligence_dossier.md"
    dossier["markdown_path"] = str(markdown_path)

    json_path.write_text(json.dumps(dossier, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
    markdown_path.write_text(_markdown(dossier), encoding="utf-8")
    return dossier
def _table_cell(value: object) -> str:
    """Render *value* as text that is safe inside one Markdown table cell."""
    # A raw newline ends the table row and a raw pipe starts a new column, so
    # flatten the former and backslash-escape the latter.
    text = str(value).replace("\r\n", " ").replace("\n", " ").replace("\r", " ")
    return text.replace("|", "\\|")


def _markdown(dossier: dict) -> str:
    """Render the dossier as a Markdown report.

    Args:
        dossier: Mapping with ``model_id``, ``coverage`` (providing
            ``dimension_coverage`` and ``dimensions_total``), ``claim_gate``
            (providing ``world_best_claim_allowed`` and ``missing``) and
            ``benchmarks`` (entries with ``name``, ``dimension``, ``status``
            and ``source``).

    Returns:
        The report text: a summary list, one table row per benchmark, and a
        "Claim Gate" bullet list. The text ends with a newline.
    """
    coverage = dossier["coverage"]
    lines = [
        "# TinyMind Universal Intelligence Dossier",
        "",
        f"- Model: `{dossier['model_id']}`",
        f"- Dimension coverage: {coverage['dimension_coverage']:.0%}",
        f"- Dimensions: {coverage['dimensions_total']}",
        f"- World-best claim allowed: {dossier['claim_gate']['world_best_claim_allowed']}",
        "",
        "| Benchmark | Dimension | Status | Source |",
        "|---|---|---|---|",
    ]
    for item in dossier["benchmarks"]:
        cells = " | ".join(
            _table_cell(item[key]) for key in ("name", "dimension", "status", "source")
        )
        lines.append(f"| {cells} |")
    lines.extend(["", "## Claim Gate", ""])
    lines.extend(f"- {missing}" for missing in dossier["claim_gate"]["missing"])
    # The trailing empty element makes the joined text end with a newline.
    lines.append("")
    return "\n".join(lines)
Xet Storage Details
- Size: 5.78 kB
- Xet hash: 6d81aa6cf33de0e7bc935f54afe97e668bf3869c525dfa1e4ac7c768721d5cd5
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.