Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

Files

xet

bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /evaluation /frontier_parity.py

bbkdevops

about 1 month ago

download

raw

6.65 kB

	"""Frontier parity gate for comparing TinyMind with GPT-5.5 Pro class models."""

	from __future__ import annotations

	from datetime import datetime, timezone
	import json
	from pathlib import Path


	# Reference target that TinyMind scores are gated against. As the "note"
	# entry states, the per-axis numbers are release-gate thresholds, not
	# official benchmark scores of the named model.
	GPT55_PRO_TARGET = {
	    "model": "gpt-5.5-pro",
	    "provider": "OpenAI",
	    "official_sources": [
	        "https://openai.com/index/introducing-gpt-5-5/",
	        "https://developers.openai.com/api/docs/models/gpt-5.5-pro",
	    ],
	    # Axis name -> minimum score needed to pass that axis.
	    "target_axes": {
	        "knowledge_mmlu_pro": 90.0,
	        "instruction_following": 95.0,
	        "translation_th_en": 90.0,
	        "natural_answer_style": 95.0,
	        "bit_exactness": 95.0,
	        "layer_coherence": 98.0,
	        "long_context_exact_10m": 100.0,
	        "pure_data_full_cycle": 100.0,
	        "coding_project_agent": 90.0,
	        "tool_grounding_reliability": 95.0,
	    },
	    "note": "Targets are parity thresholds for TinyMind release gates, not official GPT-5.5 Pro benchmark scores.",
	}


	def _load(path: str \| Path \| None) -> dict:
	if not path:
	return {}
	p = Path(path)
	return json.loads(p.read_text(encoding="utf-8")) if p.exists() else {}


	def _tinymind_scores(world_report: dict, optional_scores: dict \| None = None) -> dict[str, float]:
	scores = {row["axis"]: float(row["score"]) for row in world_report.get("metrics", [])}
	optional_scores = optional_scores or {}
	for key, value in optional_scores.items():
	try:
	scores[key] = float(value)
	except (TypeError, ValueError):
	continue
	return scores


	def build_frontier_parity_report(
	    out_dir: str | Path,
	    world_report: str | Path = "reports/world_class_eval/world_class_eval_report.json",
	    imported_scores: str | Path | None = None,
	) -> dict:
	    """Build the parity report and write it as JSON and Markdown.

	    Args:
	        out_dir: Directory that receives ``gpt55_pro_parity_report.json`` and
	            ``gpt55_pro_parity_report.md``; created if missing.
	        world_report: Path to the world-class evaluation report JSON.
	        imported_scores: Optional path to a JSON file whose ``"scores"``
	            object supplies extra or overriding per-axis scores.

	    Returns:
	        The report dict, including ``"json_path"`` and ``"markdown_path"``
	        for the two files written.
	    """
	    world = _load(world_report)
	    imports = _load(imported_scores).get("scores", {}) if imported_scores else {}
	    if not isinstance(imports, dict):
	        # A malformed "scores" payload carries no usable evidence.
	        imports = {}
	    tiny_scores = _tinymind_scores(world, imports)
	    # Passing a path is not evidence by itself: the file may be missing or
	    # empty. Only scores that were actually loaded count.
	    has_external_evidence = bool(imports)

	    rows = []
	    for axis, target in GPT55_PRO_TARGET["target_axes"].items():
	        # An axis with no recorded score is treated as 0.0 and fails.
	        score = tiny_scores.get(axis, 0.0)
	        rows.append(
	            {
	                "axis": axis,
	                "tinymind_score": score,
	                "parity_target": target,
	                "gap": target - score,
	                "passed": score >= target,
	            }
	        )
	    passed_axes = sum(1 for row in rows if row["passed"])

	    frontier_plus_rows = []
	    for row in rows:
	        target = row["parity_target"]
	        # Frontier+ asks for 2 points above parity, capped at 100.
	        plus_target = min(100.0, target + (2.0 if target < 100.0 else 0.0))
	        frontier_plus_rows.append(
	            {
	                **row,
	                "frontier_plus_target": plus_target,
	                "frontier_plus_passed": row["tinymind_score"] >= plus_target,
	            }
	        )
	    frontier_plus_passed = sum(1 for row in frontier_plus_rows if row["frontier_plus_passed"])
	    critical_gaps = [row for row in rows if row["gap"] > 15.0]

	    report = {
	        "schema_version": "tinymind-frontier-parity-v1",
	        "created_at": datetime.now(timezone.utc).isoformat(),
	        "frontier_target": GPT55_PRO_TARGET,
	        "world_report": str(world_report),
	        "imported_scores": str(imported_scores) if imported_scores else None,
	        "parity_rows": rows,
	        "frontier_plus_rows": frontier_plus_rows,
	        "summary": {
	            "passed_axes": passed_axes,
	            "axis_count": len(rows),
	            "parity_percent": 100.0 * passed_axes / max(len(rows), 1),
	            "frontier_plus_passed_axes": frontier_plus_passed,
	            "frontier_plus_percent": 100.0 * frontier_plus_passed / max(len(frontier_plus_rows), 1),
	            "critical_gaps": [row["axis"] for row in critical_gaps],
	        },
	        "claim_gate": {
	            "can_claim_gpt55_pro_equivalent": passed_axes == len(rows),
	            "can_claim_beyond_frontier": frontier_plus_passed == len(frontier_plus_rows)
	            and has_external_evidence,
	            "can_claim_currently_below_gpt55_pro_target": passed_axes < len(rows),
	            "reason": "Frontier+ requires every axis to exceed parity target plus imported external evidence.",
	        },
	        "upgrade_plan": _upgrade_plan(rows),
	    }

	    out = Path(out_dir)
	    out.mkdir(parents=True, exist_ok=True)
	    json_path = out / "gpt55_pro_parity_report.json"
	    md_path = out / "gpt55_pro_parity_report.md"
	    # Record the paths before serialising so the JSON file contains them too.
	    report["json_path"] = str(json_path)
	    report["markdown_path"] = str(md_path)
	    json_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
	    md_path.write_text(_markdown(report), encoding="utf-8")
	    return report


	def _upgrade_plan(rows: list[dict]) -> list[dict]:
	plan = []
	for row in rows:
	if row["passed"]:
	continue
	axis = row["axis"]
	if axis == "instruction_following":
	action = "train strict IFEval-format adapters and add constrained decoding verifier"
	elif axis == "knowledge_mmlu_pro":
	action = "expand verified MMLU-Pro style curriculum and run hard negative fine-tuning"
	elif axis == "translation_th_en":
	action = "add bilingual Thai-English expert parallel data with semantic equivalence checks"
	elif axis in {"coding_project_agent", "tool_grounding_reliability"}:
	action = "run sandboxed project-building tasks with tool-use reward and artifact verification"
	else:
	action = "collect targeted failures, refine HyperPure records, train adapter, rerun gate"
	plan.append({"axis": axis, "gap": row["gap"], "action": action})
	return plan


	def _markdown(report: dict) -> str:
	lines = [
	"# TinyMind GPT-5.5 Pro Parity Gate",
	"",
	f"- Parity axes passed: {report['summary']['passed_axes']}/{report['summary']['axis_count']}",
	f"- Parity percent: {report['summary']['parity_percent']:.2f}%",
	f"- Frontier+ percent: {report['summary']['frontier_plus_percent']:.2f}%",
	f"- Can claim GPT-5.5 Pro equivalent: {report['claim_gate']['can_claim_gpt55_pro_equivalent']}",
	f"- Can claim beyond frontier: {report['claim_gate']['can_claim_beyond_frontier']}",
	"",
	"## Axes",
	"",
	"\| Axis \| TinyMind \| Target \| Gap \| Passed \|",
	"\|---\|---:\|---:\|---:\|---\|",
	]
	for row in report["parity_rows"]:
	lines.append(
	f"\| {row['axis']} \| {row['tinymind_score']:.2f} \| {row['parity_target']:.2f} \| {row['gap']:.2f} \| {row['passed']} \|"
	)
	lines.extend(["", "## Upgrade Plan", ""])
	for row in report["upgrade_plan"]:
	lines.append(f"- {row['axis']}: gap={row['gap']:.2f}; {row['action']}")
	return "\n".join(lines) + "\n"

Xet Storage Details

Size: 6.65 kB
Xet hash: 61bbf932b9ecb420c5d020b65aa13f4973c2edf86d2df6e79888653a43f37fd4

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.