Buckets:
bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /evaluation /mythos_capability_forge.py
| """Forge Mythos-reported capabilities into TinyMind train/eval tasks. | |
| The point is not to copy Mythos or claim its scores. The point is to turn | |
| reported capability areas into concrete, auditable TinyMind tasks that can be | |
| trained and evaluated locally or externally. | |
| """ | |
| from __future__ import annotations | |
| from datetime import datetime, timezone | |
| import json | |
| from pathlib import Path | |
| from typing import Any | |
| AXIS_TO_TASK = { | |
| "swe_bench_verified": { | |
| "axis": "coding_patch_reasoning", | |
| "prompt": "Given a failing test summary and a small code excerpt, identify the bug, propose the minimal patch, and state a regression test.", | |
| "success_criteria": ["minimal patch", "failing behavior explained", "regression test included", "no invented files"], | |
| }, | |
| "gpqa_diamond": { | |
| "axis": "scientific_reasoning", | |
| "prompt": "Solve a graduate-level science question by stating assumptions, eliminating distractors, and giving a concise verified answer.", | |
| "success_criteria": ["assumptions stated", "distractors eliminated", "final answer separated", "uncertainty bounded"], | |
| }, | |
| "usamo": { | |
| "axis": "formal_math_proof", | |
| "prompt": "Solve a proof-style math problem using invariant, construction, or contradiction as appropriate.", | |
| "success_criteria": ["lemma structure", "valid proof steps", "no skipped core step", "final theorem restated"], | |
| }, | |
| "terminal_bench_2": { | |
| "axis": "terminal_agent_repair", | |
| "prompt": "Diagnose a broken CLI workflow from logs, propose commands, and include rollback and verification steps.", | |
| "success_criteria": ["root cause", "commands", "rollback", "verification"], | |
| }, | |
| "cybench": { | |
| "axis": "defensive_vulnerability_analysis", | |
| "prompt": "Analyze an authorized vulnerability report defensively: identify root cause class, affected surface, proof requirements, and patch strategy without exploit weaponization.", | |
| "success_criteria": ["authorization boundary", "root cause class", "patch strategy", "no exploit payload"], | |
| }, | |
| } | |
| DEFAULT_SECURITY_TASK = { | |
| "axis": "defensive_vulnerability_analysis", | |
| "prompt": "Convert a reported security capability into a defensive-only audit plan with sandboxing, hashes, read-only analysis, and evidence logs.", | |
| "success_criteria": ["defensive-only", "sandbox required", "evidence logs", "no offensive steps"], | |
| } | |
| def _load(path: str | Path | None) -> dict[str, Any]: | |
| if not path: | |
| return {} | |
| p = Path(path) | |
| return json.loads(p.read_text(encoding="utf-8")) if p.exists() else {} | |
| def _task_row(task: dict[str, Any], idx: int, split: str) -> dict[str, Any]: | |
| criteria = "; ".join(task["success_criteria"]) | |
| return { | |
| "id": f"mythos-capability-{split}-{idx:04d}-{task['axis']}", | |
| "domain": task["axis"], | |
| "messages": [ | |
| { | |
| "role": "system", | |
| "content": "You are TinyMind. Solve only with explicit evidence boundaries, natural Thai/English clarity, and no unsupported frontier claims.", | |
| }, | |
| { | |
| "role": "user", | |
| "content": task["prompt"], | |
| }, | |
| { | |
| "role": "assistant", | |
| "content": ( | |
| "I will solve this as an auditable task: first define the evidence, then reason through the minimal steps, " | |
| f"then check these criteria: {criteria}. I will not treat reported Mythos scores as TinyMind scores." | |
| ), | |
| }, | |
| ], | |
| "source": "mythos_report_capability_forge", | |
| "safety_boundary": task["safety_boundary"], | |
| "success_criteria": task["success_criteria"], | |
| } | |
| def build_mythos_capability_forge( | |
| out_dir: str | Path, | |
| *, | |
| mythos_analysis_path: str | Path = "reports/mythos_report_analysis/mythos_report_analysis.json", | |
| ) -> dict[str, Any]: | |
| analysis = _load(mythos_analysis_path) | |
| claims = analysis.get("benchmark_claims", []) | |
| lessons = analysis.get("distilled_lessons", []) | |
| axes = {str(claim.get("axis")) for claim in claims} | |
| tasks = [] | |
| for axis in sorted(axes): | |
| if axis in AXIS_TO_TASK: | |
| tasks.append(dict(AXIS_TO_TASK[axis])) | |
| if not any(task["axis"] == "defensive_vulnerability_analysis" for task in tasks): | |
| tasks.append(dict(DEFAULT_SECURITY_TASK)) | |
| if not any(task["axis"] == "formal_math_proof" for task in tasks): | |
| tasks.append(dict(AXIS_TO_TASK["usamo"])) | |
| for task in tasks: | |
| task["safety_boundary"] = { | |
| "offensive_exploit_generation_allowed": False, | |
| "requires_authorized_context": task["axis"] == "defensive_vulnerability_analysis", | |
| "must_log_evidence": True, | |
| "mythos_score_transfer_allowed": False, | |
| } | |
| task["lesson_links"] = [lesson.get("lesson_id") for lesson in lessons[:4]] | |
| task["target"] = "exceed_reported_capability_area_by_reproducible_evidence_not_by_claim" | |
| out = Path(out_dir) | |
| out.mkdir(parents=True, exist_ok=True) | |
| json_path = out / "mythos_capability_forge_report.json" | |
| sft_path = out / "mythos_capability_ladder_sft.jsonl" | |
| eval_path = out / "mythos_capability_ladder_eval.jsonl" | |
| md_path = out / "mythos_capability_forge_report.md" | |
| sft_rows = [_task_row(task, idx, "sft") for idx, task in enumerate(tasks)] | |
| eval_rows = [ | |
| { | |
| **_task_row(task, idx, "eval"), | |
| "messages": [ | |
| { | |
| "role": "system", | |
| "content": "Evaluate TinyMind on this capability with strict evidence and no score transfer.", | |
| }, | |
| {"role": "user", "content": task["prompt"] + " Provide the answer and a self-check."}, | |
| ], | |
| } | |
| for idx, task in enumerate(tasks) | |
| ] | |
| with sft_path.open("w", encoding="utf-8") as f: | |
| for row in sft_rows: | |
| f.write(json.dumps(row, ensure_ascii=False, sort_keys=True) + "\n") | |
| with eval_path.open("w", encoding="utf-8") as f: | |
| for row in eval_rows: | |
| f.write(json.dumps(row, ensure_ascii=False, sort_keys=True) + "\n") | |
| report = { | |
| "schema_version": "tinymind-mythos-capability-forge-v1", | |
| "created_at": datetime.now(timezone.utc).isoformat(), | |
| "mythos_analysis_path": str(mythos_analysis_path), | |
| "task_count": len(tasks), | |
| "task_ladder": tasks, | |
| "sft_jsonl": str(sft_path), | |
| "eval_jsonl": str(eval_path), | |
| "json_path": str(json_path), | |
| "markdown_path": str(md_path), | |
| "claim_gate": { | |
| "ready_to_train_capability_ladder": len(tasks) >= 5, | |
| "tinymind_can_claim_more_than_mythos": False, | |
| "reason": "The ladder enables training/eval for reported capability areas; only measured TinyMind results can support claims.", | |
| }, | |
| "next_actions": [ | |
| "Train or evaluate TinyMind on mythos_capability_ladder_sft/eval.", | |
| "Attach raw outputs and scoring rubrics before any public comparison.", | |
| "Keep defensive security tasks authorized and sandboxed.", | |
| ], | |
| } | |
| json_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8") | |
| md_path.write_text(_markdown(report), encoding="utf-8") | |
| return report | |
| def _markdown(report: dict[str, Any]) -> str: | |
| lines = [ | |
| "# TinyMind Mythos Capability Forge", | |
| "", | |
| f"- Task count: {report['task_count']}", | |
| f"- Ready to train ladder: {report['claim_gate']['ready_to_train_capability_ladder']}", | |
| f"- Can claim more than Mythos: {report['claim_gate']['tinymind_can_claim_more_than_mythos']}", | |
| "", | |
| "## Task Ladder", | |
| "", | |
| ] | |
| for task in report["task_ladder"]: | |
| lines.append(f"- {task['axis']}: {task['prompt']}") | |
| return "\n".join(lines) + "\n" | |
Xet Storage Details
- Size:
- 7.93 kB
- Xet hash:
- b47471e323372885c2f75e8a7b87973c18c77d3d706877b10ebcdf71177bdcd9
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.