Buckets:
bbkdevops/unicosys-hypergraph-bucket / tinymind-native-colab-handoff /bundle /evaluation /frontier_parity.py
| """Frontier parity gate for comparing TinyMind with GPT-5.5 Pro class models.""" | |
| from __future__ import annotations | |
| from datetime import datetime, timezone | |
| import json | |
| from pathlib import Path | |
# Per-axis parity thresholds used by the release gate. Insertion order is the
# order in which axes appear in the generated report rows.
_PARITY_TARGET_AXES = {
    "knowledge_mmlu_pro": 90.0,
    "instruction_following": 95.0,
    "translation_th_en": 90.0,
    "natural_answer_style": 95.0,
    "bit_exactness": 95.0,
    "layer_coherence": 98.0,
    "long_context_exact_10m": 100.0,
    "pure_data_full_cycle": 100.0,
    "coding_project_agent": 90.0,
    "tool_grounding_reliability": 95.0,
}

# Description of the comparison target. As the embedded note states, the
# numbers are TinyMind release-gate thresholds, not official benchmark scores.
GPT55_PRO_TARGET = {
    "model": "gpt-5.5-pro",
    "provider": "OpenAI",
    "official_sources": [
        "https://openai.com/index/introducing-gpt-5-5/",
        "https://developers.openai.com/api/docs/models/gpt-5.5-pro",
    ],
    "target_axes": _PARITY_TARGET_AXES,
    "note": "Targets are parity thresholds for TinyMind release gates, not official GPT-5.5 Pro benchmark scores.",
}
def _load(path: str | Path | None) -> dict:
    """Load a JSON object from *path*, or return ``{}`` when there is nothing to load.

    Args:
        path: Location of a UTF-8 encoded JSON file. A falsy value (``None``
            or ``""``) means "no file".

    Returns:
        The decoded JSON object, or an empty dict when *path* is falsy or
        does not refer to an existing regular file.

    Raises:
        ValueError: If the file content is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass) or if its
            top-level value is not a JSON object.
    """
    if not path:
        return {}
    p = Path(path)
    # is_file() rather than exists(): a directory "exists" but cannot be read
    # as text, and should be treated the same as a missing report.
    if not p.is_file():
        return {}
    data = json.loads(p.read_text(encoding="utf-8"))
    if not isinstance(data, dict):
        # Callers use dict methods on the result; fail here with a clear
        # message instead of a later AttributeError.
        raise ValueError(
            f"expected a JSON object at the top level of {p}, got {type(data).__name__}"
        )
    return data
def _tinymind_scores(world_report: dict, optional_scores: dict | None = None) -> dict[str, float]:
    """Collect per-axis scores from the world report plus optional overrides.

    Args:
        world_report: Mapping whose optional ``"metrics"`` entry is an
            iterable of rows, each with an ``"axis"`` and a ``"score"`` key.
        optional_scores: Optional mapping of axis name to score. Entries are
            applied after the world-report rows, so they override a measured
            score for the same axis.

    Returns:
        Mapping of axis name to a finite float score.

    Raises:
        KeyError, TypeError, ValueError: If a world-report row lacks a key or
            its score cannot be converted to ``float``. Unconvertible
            *optional* scores are skipped rather than raised.
    """
    import math  # local import keeps this block self-contained

    scores: dict[str, float] = {}
    # `or []` tolerates an explicit `"metrics": null` in the report.
    for row in world_report.get("metrics") or []:
        axis = row["axis"]
        value = float(row["score"])
        # NaN/inf are not meaningful scores: inf would pass every gate and
        # NaN would serialize as invalid JSON.
        if math.isfinite(value):
            scores[axis] = value
    for key, raw in (optional_scores or {}).items():
        try:
            value = float(raw)
        except (TypeError, ValueError):
            continue
        if math.isfinite(value):
            scores[key] = value
    return scores
def build_frontier_parity_report(
    out_dir: str | Path,
    world_report: str | Path = "reports/world_class_eval/world_class_eval_report.json",
    imported_scores: str | Path | None = None,
) -> dict:
    """Build the parity report, write it as JSON and Markdown, and return it.

    Each axis in ``GPT55_PRO_TARGET["target_axes"]`` is compared against the
    TinyMind score for that axis; an axis with no score is treated as 0.0.

    Args:
        out_dir: Directory for the output files; created if missing.
        world_report: Path to the world-class evaluation report JSON.
        imported_scores: Optional path to a JSON file whose ``"scores"``
            object maps axis names to externally obtained scores.

    Returns:
        The report dict, including ``"json_path"`` and ``"markdown_path"``
        pointing at the files that were written.
    """
    world = _load(world_report)
    loaded_imports = _load(imported_scores) if imported_scores else {}
    imports = loaded_imports.get("scores") if isinstance(loaded_imports, dict) else None
    if not isinstance(imports, dict):
        # Missing, null or malformed "scores" payload: no external evidence.
        imports = {}
    tiny_scores = _tinymind_scores(world, imports)

    rows = []
    for axis, target in GPT55_PRO_TARGET["target_axes"].items():
        score = tiny_scores.get(axis, 0.0)
        rows.append(
            {
                "axis": axis,
                "tinymind_score": score,
                "parity_target": target,
                "gap": target - score,
                "passed": score >= target,
            }
        )
    passed_axes = sum(1 for row in rows if row["passed"])

    # Frontier+ raises each target by a fixed margin, capped at 100.
    frontier_plus_margin = 2.0
    frontier_plus_rows = []
    for row in rows:
        plus_target = min(100.0, row["parity_target"] + frontier_plus_margin)
        frontier_plus_rows.append(
            {
                **row,
                "frontier_plus_target": plus_target,
                "frontier_plus_passed": row["tinymind_score"] >= plus_target,
            }
        )
    frontier_plus_passed = sum(1 for row in frontier_plus_rows if row["frontier_plus_passed"])
    critical_gaps = [row for row in rows if row["gap"] > 15.0]

    # The gate's stated reason requires imported external evidence. Passing a
    # path is not evidence: the file may be missing or contain no scores, so
    # require that scores were actually loaded.
    has_external_evidence = bool(imports)

    report = {
        "schema_version": "tinymind-frontier-parity-v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "frontier_target": GPT55_PRO_TARGET,
        "world_report": str(world_report),
        "imported_scores": str(imported_scores) if imported_scores else None,
        "parity_rows": rows,
        "frontier_plus_rows": frontier_plus_rows,
        "summary": {
            "passed_axes": passed_axes,
            "axis_count": len(rows),
            "parity_percent": 100.0 * passed_axes / max(len(rows), 1),
            "frontier_plus_passed_axes": frontier_plus_passed,
            "frontier_plus_percent": 100.0 * frontier_plus_passed / max(len(frontier_plus_rows), 1),
            "critical_gaps": [row["axis"] for row in critical_gaps],
        },
        "claim_gate": {
            "can_claim_gpt55_pro_equivalent": passed_axes == len(rows),
            "can_claim_beyond_frontier": (
                frontier_plus_passed == len(frontier_plus_rows) and has_external_evidence
            ),
            "can_claim_currently_below_gpt55_pro_target": passed_axes < len(rows),
            "reason": "Frontier+ requires every axis to exceed parity target plus imported external evidence.",
        },
        "upgrade_plan": _upgrade_plan(rows),
    }

    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    json_path = out / "gpt55_pro_parity_report.json"
    md_path = out / "gpt55_pro_parity_report.md"
    # Record the output locations before serializing so they appear in the JSON.
    report["json_path"] = str(json_path)
    report["markdown_path"] = str(md_path)
    json_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
    md_path.write_text(_markdown(report), encoding="utf-8")
    return report
def _upgrade_plan(rows: list[dict]) -> list[dict]:
    """Return one remediation entry for every axis row that did not pass.

    Each entry carries the row's ``"axis"`` and ``"gap"`` plus an ``"action"``
    string chosen by axis name; axes without a dedicated action receive the
    generic one. Passing rows are omitted and input order is kept.
    """
    sandbox_action = "run sandboxed project-building tasks with tool-use reward and artifact verification"
    action_by_axis = {
        "instruction_following": "train strict IFEval-format adapters and add constrained decoding verifier",
        "knowledge_mmlu_pro": "expand verified MMLU-Pro style curriculum and run hard negative fine-tuning",
        "translation_th_en": "add bilingual Thai-English expert parallel data with semantic equivalence checks",
        "coding_project_agent": sandbox_action,
        "tool_grounding_reliability": sandbox_action,
    }
    generic_action = "collect targeted failures, refine HyperPure records, train adapter, rerun gate"
    return [
        {
            "axis": failed["axis"],
            "gap": failed["gap"],
            "action": action_by_axis.get(failed["axis"], generic_action),
        }
        for failed in rows
        if not failed["passed"]
    ]
def _markdown(report: dict) -> str:
    """Render *report* as Markdown text ending in a newline.

    The output has a summary bullet list, an "Axes" table with one line per
    entry of ``report["parity_rows"]``, and an "Upgrade Plan" list with one
    bullet per entry of ``report["upgrade_plan"]``.
    """
    summary = report["summary"]
    gate = report["claim_gate"]

    header = [
        "# TinyMind GPT-5.5 Pro Parity Gate",
        "",
        f"- Parity axes passed: {summary['passed_axes']}/{summary['axis_count']}",
        f"- Parity percent: {summary['parity_percent']:.2f}%",
        f"- Frontier+ percent: {summary['frontier_plus_percent']:.2f}%",
        f"- Can claim GPT-5.5 Pro equivalent: {gate['can_claim_gpt55_pro_equivalent']}",
        f"- Can claim beyond frontier: {gate['can_claim_beyond_frontier']}",
        "",
        "## Axes",
        "",
        "| Axis | TinyMind | Target | Gap | Passed |",
        "|---|---:|---:|---:|---|",
    ]
    table_lines = [
        f"| {entry['axis']} | {entry['tinymind_score']:.2f} | {entry['parity_target']:.2f} | {entry['gap']:.2f} | {entry['passed']} |"
        for entry in report["parity_rows"]
    ]
    plan_lines = [
        f"- {step['axis']}: gap={step['gap']:.2f}; {step['action']}"
        for step in report["upgrade_plan"]
    ]
    document = [*header, *table_lines, "", "## Upgrade Plan", "", *plan_lines]
    return "\n".join(document) + "\n"
Xet Storage Details
- Size:
- 6.65 kB
- Xet hash:
- 61bbf932b9ecb420c5d020b65aa13f4973c2edf86d2df6e79888653a43f37fd4
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.