bbkdevops's picture
download
raw
6.65 kB
"""Frontier parity gate for comparing TinyMind with GPT-5.5 Pro class models."""
from __future__ import annotations

from datetime import datetime, timezone
import json
import math
from pathlib import Path
# Reference target the parity gate compares TinyMind against.
# Per the embedded "note", the numbers under "target_axes" are release-gate
# thresholds chosen for TinyMind, not published benchmark results.
GPT55_PRO_TARGET = {
    "model": "gpt-5.5-pro",
    "provider": "OpenAI",
    # Reference URLs recorded alongside the target for traceability.
    "official_sources": [
        "https://openai.com/index/introducing-gpt-5-5/",
        "https://developers.openai.com/api/docs/models/gpt-5.5-pro",
    ],
    # axis name -> minimum score required for that axis to count as passed.
    "target_axes": {
        "knowledge_mmlu_pro": 90.0,
        "instruction_following": 95.0,
        "translation_th_en": 90.0,
        "natural_answer_style": 95.0,
        "bit_exactness": 95.0,
        "layer_coherence": 98.0,
        "long_context_exact_10m": 100.0,
        "pure_data_full_cycle": 100.0,
        "coding_project_agent": 90.0,
        "tool_grounding_reliability": 95.0,
    },
    "note": "Targets are parity thresholds for TinyMind release gates, not official GPT-5.5 Pro benchmark scores.",
}
def _load(path: str | Path | None) -> dict:
    """Read a JSON object from *path*, tolerating absent input.

    Args:
        path: Location of a UTF-8 encoded JSON file, or a falsy value
            (``None`` / ``""``) when no file was supplied.

    Returns:
        The decoded JSON object. An empty dict is returned when *path* is
        falsy, when it does not point at a regular file, or when the
        document's top-level value is not a JSON object.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    if not path:
        return {}
    p = Path(path)
    # is_file() (rather than exists()) also rejects directories, which would
    # otherwise blow up inside read_text().
    if not p.is_file():
        return {}
    data = json.loads(p.read_text(encoding="utf-8"))
    # Callers immediately use .get() on the result; a list/scalar document
    # must not leak through the ``-> dict`` contract.
    return data if isinstance(data, dict) else {}
def _tinymind_scores(world_report: dict, optional_scores: dict | None = None) -> dict[str, float]:
    """Collect per-axis TinyMind scores as finite floats.

    Scores come from two places, applied in this order so that the second
    overrides the first on a shared axis:

    1. ``world_report["metrics"]`` - an iterable of rows, each read through
       its ``"axis"`` and ``"score"`` keys.
    2. ``optional_scores`` - a mapping of axis name to score.

    Any entry that is malformed (row is not subscriptable by key, a key is
    missing, the value cannot be converted to ``float``) or whose value is
    not finite (``nan`` / ``+-inf``) is skipped. Skipping is the safe
    direction for a gate: a skipped axis keeps whatever valid score it
    already had, or is simply absent from the result.

    Args:
        world_report: Decoded evaluation report; may lack ``"metrics"``.
        optional_scores: Extra axis -> score mapping, or ``None``.

    Returns:
        Mapping of axis name to finite float score.
    """
    candidates = []
    # ``or []`` also covers an explicit ``"metrics": null`` in the report.
    for row in world_report.get("metrics") or []:
        try:
            candidates.append((row["axis"], row["score"]))
        except (KeyError, IndexError, TypeError):
            # Row is not a mapping or lacks one of the two required keys.
            continue
    candidates.extend((optional_scores or {}).items())

    scores: dict[str, float] = {}
    for axis, raw in candidates:
        try:
            value = float(raw)
        except (TypeError, ValueError, OverflowError):
            continue
        # An infinite score would pass every threshold and NaN cannot be
        # written as valid JSON, so neither is accepted as evidence.
        if math.isfinite(value):
            scores[axis] = value
    return scores
def build_frontier_parity_report(
    out_dir: str | Path,
    world_report: str | Path = "reports/world_class_eval/world_class_eval_report.json",
    imported_scores: str | Path | None = None,
) -> dict:
    """Score TinyMind against ``GPT55_PRO_TARGET`` and write the gate report.

    For every axis in ``GPT55_PRO_TARGET["target_axes"]`` the TinyMind score
    (0.0 when the axis has no score) is compared with the parity target and
    with a stricter "frontier+" target (parity target + 2.0, capped at 100.0).
    The resulting report is written to ``out_dir`` as
    ``gpt55_pro_parity_report.json`` and ``gpt55_pro_parity_report.md``.

    Args:
        out_dir: Directory for the two report files; created if missing.
        world_report: Path of the JSON evaluation report whose ``"metrics"``
            rows supply the baseline scores.
        imported_scores: Optional path of a JSON file whose ``"scores"``
            object supplies additional / overriding axis scores.

    Returns:
        The report dict, including ``"json_path"`` and ``"markdown_path"``.
        Note that those two keys are added before the JSON file is written,
        so they are present in the file as well.
    """
    frontier_plus_margin = 2.0
    critical_gap_threshold = 15.0

    world = _load(world_report)
    if not isinstance(world, dict):
        world = {}

    # Only a real, non-empty "scores" object counts as imported evidence.
    imports: dict = {}
    if imported_scores:
        loaded = _load(imported_scores)
        candidate = loaded.get("scores") if isinstance(loaded, dict) else None
        if isinstance(candidate, dict):
            imports = candidate
    # The original gate accepted any truthy path here, so pointing at a
    # missing or empty file was enough to unlock the beyond-frontier claim.
    has_external_evidence = bool(imports)

    tiny_scores = _tinymind_scores(world, imports)

    rows = []
    frontier_plus_rows = []
    for axis, target in GPT55_PRO_TARGET["target_axes"].items():
        score = tiny_scores.get(axis, 0.0)
        row = {
            "axis": axis,
            "tinymind_score": score,
            "parity_target": target,
            "gap": target - score,
            "passed": score >= target,
        }
        rows.append(row)
        # Capping at 100.0 means an axis whose parity target is already
        # 100.0 keeps the same frontier+ target.
        plus_target = min(100.0, target + frontier_plus_margin)
        frontier_plus_rows.append(
            {
                **row,
                "frontier_plus_target": plus_target,
                "frontier_plus_passed": score >= plus_target,
            }
        )

    passed_axes = sum(1 for row in rows if row["passed"])
    frontier_plus_passed = sum(1 for row in frontier_plus_rows if row["frontier_plus_passed"])
    critical_gaps = [row["axis"] for row in rows if row["gap"] > critical_gap_threshold]

    report = {
        "schema_version": "tinymind-frontier-parity-v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "frontier_target": GPT55_PRO_TARGET,
        "world_report": str(world_report),
        "imported_scores": str(imported_scores) if imported_scores else None,
        "parity_rows": rows,
        "frontier_plus_rows": frontier_plus_rows,
        "summary": {
            "passed_axes": passed_axes,
            "axis_count": len(rows),
            # max(..., 1) guards the division if the target table were empty.
            "parity_percent": 100.0 * passed_axes / max(len(rows), 1),
            "frontier_plus_passed_axes": frontier_plus_passed,
            "frontier_plus_percent": 100.0 * frontier_plus_passed / max(len(frontier_plus_rows), 1),
            "critical_gaps": critical_gaps,
        },
        "claim_gate": {
            "can_claim_gpt55_pro_equivalent": passed_axes == len(rows),
            "can_claim_beyond_frontier": (
                frontier_plus_passed == len(frontier_plus_rows) and has_external_evidence
            ),
            "can_claim_currently_below_gpt55_pro_target": passed_axes < len(rows),
            "reason": "Frontier+ requires every axis to exceed parity target plus imported external evidence.",
        },
        "upgrade_plan": _upgrade_plan(rows),
    }

    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    json_path = out / "gpt55_pro_parity_report.json"
    md_path = out / "gpt55_pro_parity_report.md"
    report["json_path"] = str(json_path)
    report["markdown_path"] = str(md_path)
    json_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
    md_path.write_text(_markdown(report), encoding="utf-8")
    return report
def _upgrade_plan(rows: list[dict]) -> list[dict]:
    """Propose one remediation action for every axis that failed parity.

    Args:
        rows: Parity rows; each is read through its ``"axis"``, ``"gap"``
            and ``"passed"`` keys.

    Returns:
        One ``{"axis", "gap", "action"}`` dict per row whose ``"passed"`` is
        falsy, in input order. Axes without a dedicated action receive the
        generic fallback action.
    """
    # Two agent-oriented axes share the same remediation.
    agent_action = "run sandboxed project-building tasks with tool-use reward and artifact verification"
    actions = {
        "instruction_following": "train strict IFEval-format adapters and add constrained decoding verifier",
        "knowledge_mmlu_pro": "expand verified MMLU-Pro style curriculum and run hard negative fine-tuning",
        "translation_th_en": "add bilingual Thai-English expert parallel data with semantic equivalence checks",
        "coding_project_agent": agent_action,
        "tool_grounding_reliability": agent_action,
    }
    fallback = "collect targeted failures, refine HyperPure records, train adapter, rerun gate"
    return [
        {"axis": row["axis"], "gap": row["gap"], "action": actions.get(row["axis"], fallback)}
        for row in rows
        if not row["passed"]
    ]
def _markdown(report: dict) -> str:
    """Render the parity report as a Markdown document.

    Args:
        report: Report dict; this function reads ``report["summary"]``,
            ``report["claim_gate"]``, ``report["parity_rows"]`` and
            ``report["upgrade_plan"]``.

    Returns:
        Markdown text with a headline summary, a per-axis table and the
        upgrade plan as a bullet list, terminated by a single newline.
    """
    summary = report["summary"]
    gate = report["claim_gate"]
    lines = [
        "# TinyMind GPT-5.5 Pro Parity Gate",
        "",
        f"- Parity axes passed: {summary['passed_axes']}/{summary['axis_count']}",
        f"- Parity percent: {summary['parity_percent']:.2f}%",
        f"- Frontier+ percent: {summary['frontier_plus_percent']:.2f}%",
        f"- Can claim GPT-5.5 Pro equivalent: {gate['can_claim_gpt55_pro_equivalent']}",
        f"- Can claim beyond frontier: {gate['can_claim_beyond_frontier']}",
        "",
        "## Axes",
        "",
        "| Axis | TinyMind | Target | Gap | Passed |",
        "|---|---:|---:|---:|---|",
    ]
    lines.extend(
        f"| {row['axis']} | {row['tinymind_score']:.2f} | {row['parity_target']:.2f} | {row['gap']:.2f} | {row['passed']} |"
        for row in report["parity_rows"]
    )
    lines.extend(["", "## Upgrade Plan", ""])
    lines.extend(
        f"- {row['axis']}: gap={row['gap']:.2f}; {row['action']}"
        for row in report["upgrade_plan"]
    )
    return "\n".join(lines) + "\n"

Xet Storage Details

Size:
6.65 kB
·
Xet hash:
61bbf932b9ecb420c5d020b65aa13f4973c2edf86d2df6e79888653a43f37fd4

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.