Buckets:
bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /evaluation /knowledge_dashboard.py
| """World-standard knowledge dashboard for TinyMind. | |
| The dashboard mirrors common model-report layouts: translation bars, | |
| instruction/domain radar summaries, and size comparisons. Local TinyMind | |
| scores are measured; external rows are reference bands unless explicitly | |
| replaced by a measured provider result. | |
| """ | |
| from __future__ import annotations | |
| from datetime import datetime, timezone | |
| import json | |
| from pathlib import Path | |
| from typing import Sequence | |
| import matplotlib | |
| matplotlib.use("Agg") | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
| import torch | |
| from evaluation.local_evidence import _encode | |
| from evaluation.official_hard_eval import ( | |
| build_size_comparison, | |
| profile_model_size, | |
| run_mmlu_pro, | |
| _load_model, | |
| ) | |
# Reference rows listed next to the locally measured TinyMind scores.
# Only the "metric floor" row carries numbers; every other row is an empty
# slot (scores are None) whose "source" field says what kind of result is
# expected to fill it.
WORLD_REFERENCE_BANDS = [
    {
        "model_class": model_class,
        "size_class": size_class,
        "params_b": params_b,
        "knowledge_mmlu_pro": knowledge,
        "instruction_following": instruction,
        "translation": translation,
        "source": source,
    }
    for model_class, size_class, params_b, knowledge, instruction, translation, source in (
        ("chance_10_way_mcq", "metric floor", 0.0, 10.0, 0.0, 0.0, "metric_baseline"),
        (
            "sub_1b_open_llm",
            "<1B",
            0.5,
            None,
            None,
            None,
            "reference_slot_fill_with_measured_provider_result",
        ),
        (
            "7b_open_llm",
            "7B",
            7.0,
            None,
            None,
            None,
            "reference_slot_fill_with_measured_provider_result",
        ),
        (
            "70b_open_llm",
            "70B",
            70.0,
            None,
            None,
            None,
            "reference_slot_fill_with_measured_provider_result",
        ),
        (
            "frontier_closed_llm",
            "frontier",
            None,
            None,
            None,
            None,
            "reference_slot_requires_official_provider_result",
        ),
    )
]
# Fixed Thai/English prompt-reference pairs used by run_translation_smoke.
# Each tuple is (id, source language, target language, prompt, reference).
TRANSLATION_ITEMS = [
    {
        "id": item_id,
        "source_lang": source_lang,
        "target_lang": target_lang,
        "prompt": prompt,
        "reference": reference,
    }
    for item_id, source_lang, target_lang, prompt, reference in (
        (
            "en_th_1",
            "en",
            "th",
            "Translate to Thai exactly: The ledger stores evidence before claims.",
            "บัญชีหลักฐานเก็บหลักฐานก่อนการอ้างสิทธิ์",
        ),
        (
            "th_en_1",
            "th",
            "en",
            "Translate to English exactly: โมเดลต้องวัดผลก่อนอ้างว่าเก่งที่สุด",
            "The model must be evaluated before claiming it is the best.",
        ),
        (
            "en_th_2",
            "en",
            "th",
            "Translate to Thai exactly: Stable memory needs exact retrieval.",
            "ความจำที่เสถียรต้องการการดึงคืนที่แม่นยำ",
        ),
    )
]
# Prompts for run_instruction_smoke. Each tuple is (id, prompt, check), where
# "check" names the rule that _instruction_ok applies to the generated text.
INSTRUCTION_ITEMS = [
    {"id": item_id, "prompt": prompt, "check": check}
    for item_id, prompt, check in (
        (
            "json_only",
            'Return only compact JSON with keys "answer" and "evidence".',
            "json_object",
        ),
        (
            "prefix",
            'Answer with exactly one sentence that starts with "Verified:".',
            "starts_with_verified",
        ),
        (
            "three_bullets",
            "Give exactly three bullet points about safe benchmark claims.",
            "three_bullets",
        ),
    )
]
def _generate_text(model, prompt: str, max_new_tokens: int = 40) -> str:
    """Generate a continuation of ``prompt`` and return it as stripped text.

    Args:
        model: Object exposing ``cfg.max_seq_len``, ``cfg.vocab_size`` and
            ``generate(ids, max_new_tokens=..., temperature=..., top_p=...)``.
        prompt: Text that is encoded with ``_encode`` and fed to the model.
        max_new_tokens: Forwarded to ``model.generate``.

    Returns:
        The decoded tokens that follow the prompt, with surrounding
        whitespace removed.
    """
    ids = _encode(prompt, model.cfg.max_seq_len, model.cfg.vocab_size).unsqueeze(0)
    prompt_len = ids.shape[1]
    # Evaluation only: no autograd graph is needed while generating.
    with torch.no_grad():
        # NOTE(review): temperature/top_p sampling presumably makes outputs
        # vary between runs unless the caller seeds the RNG - confirm.
        out = model.generate(ids, max_new_tokens=max_new_tokens, temperature=0.7, top_p=0.9)
    # Drop the prompt by token count rather than by len(prompt) characters of
    # the decoded text: the character offset is wrong whenever the prompt was
    # truncated by _encode or does not round-trip exactly through _decode_ids.
    # NOTE(review): assumes generate() returns the prompt tokens followed by
    # the new tokens, as the original character-based slicing also assumed.
    continuation = out[0].tolist()[prompt_len:]
    return _decode_ids(continuation, model.cfg.vocab_size).strip()
| def _decode_ids(ids: Sequence[int], vocab_size: int) -> str: | |
| usable = max(vocab_size - 4, 1) | |
| data = bytes((int(tok) - 4) % usable for tok in ids if int(tok) >= 4) | |
| return data.decode("utf-8", errors="ignore") | |
| def _char_f1(candidate: str, reference: str) -> float: | |
| cand = [c for c in candidate.lower() if not c.isspace()] | |
| ref = [c for c in reference.lower() if not c.isspace()] | |
| if not cand or not ref: | |
| return 0.0 | |
| common = 0 | |
| ref_counts: dict[str, int] = {} | |
| for c in ref: | |
| ref_counts[c] = ref_counts.get(c, 0) + 1 | |
| for c in cand: | |
| if ref_counts.get(c, 0) > 0: | |
| common += 1 | |
| ref_counts[c] -= 1 | |
| precision = common / len(cand) | |
| recall = common / len(ref) | |
| if precision + recall == 0: | |
| return 0.0 | |
| return 2 * precision * recall / (precision + recall) | |
def run_translation_smoke(model) -> dict:
    """Score the model on every entry of ``TRANSLATION_ITEMS``.

    For each item the prompt is passed to ``_generate_text`` and the output is
    compared with the reference using ``_char_f1`` scaled to a percentage.

    Returns:
        A dict with benchmark metadata, the mean score under ``"score"`` and
        the per-item rows (item fields plus ``"generated"`` and ``"score"``)
        under ``"rows"``.
    """

    def score_item(item: dict) -> dict:
        hypothesis = _generate_text(model, item["prompt"])
        percent = _char_f1(hypothesis, item["reference"]) * 100
        return {**item, "generated": hypothesis, "score": percent}

    scored = [score_item(item) for item in TRANSLATION_ITEMS]
    # max(..., 1) keeps the mean defined for an empty item list.
    mean_score = sum(entry["score"] for entry in scored) / max(len(scored), 1)
    return {
        "benchmark": "TinyMind-FLORES-style Thai/English translation smoke",
        "metric": "character_F1_percent",
        "official_rank_claimed": False,
        "score": mean_score,
        "rows": scored,
    }
| def _instruction_ok(output: str, check: str) -> bool: | |
| text = output.strip() | |
| if check == "json_object": | |
| try: | |
| parsed = json.loads(text) | |
| return isinstance(parsed, dict) and {"answer", "evidence"} <= set(parsed) | |
| except Exception: | |
| return False | |
| if check == "starts_with_verified": | |
| return text.startswith("Verified:") and text.count(".") <= 1 | |
| if check == "three_bullets": | |
| bullets = [line for line in text.splitlines() if line.strip().startswith(("-", "*"))] | |
| return len(bullets) == 3 | |
| return False | |
def run_instruction_smoke(model) -> dict:
    """Run every entry of ``INSTRUCTION_ITEMS`` through the model and grade it.

    Each prompt is passed to ``_generate_text`` and the output is graded by
    ``_instruction_ok`` using the item's ``"check"`` rule.

    Returns:
        A dict with benchmark metadata, the pass rate in percent under
        ``"score"``, the pass count, the sample count and the per-item rows
        (item fields plus ``"generated"`` and ``"correct"``).
    """
    graded = []
    for item in INSTRUCTION_ITEMS:
        text = _generate_text(model, item["prompt"])
        graded.append({**item, "generated": text, "correct": _instruction_ok(text, item["check"])})

    passed = sum(1 for entry in graded if entry["correct"])
    total = len(graded)
    return {
        "benchmark": "IFEval-style verifiable instruction smoke",
        "metric": "strict_rule_accuracy_percent",
        "official_rank_claimed": False,
        # max(..., 1) keeps the rate defined for an empty item list.
        "score": 100 * passed / max(total, 1),
        "correct": passed,
        "samples": total,
        "rows": graded,
    }
def _plot_dashboard(report: dict, out_png: Path) -> None:
    """Render the three-panel dashboard figure and save it to ``out_png``.

    Panels:
        * top: bar chart of the three ``report["summary_scores"]`` values;
        * bottom left: radar (polar) plot of the same three values;
        * bottom right: horizontal log-scale bars for the first four rows of
          ``report["size_comparison"]``.

    The figure is always closed, even when plotting or saving raises, so a
    failed run does not leave an open figure registered with pyplot.

    Args:
        report: Dashboard report; must contain ``"summary_scores"`` (keys
            ``"knowledge"``, ``"instruction"``, ``"translation"``) and
            ``"size_comparison"`` (rows with ``"model"`` and ``"params"``).
        out_png: Destination path handed to ``Figure.savefig``.
    """
    tiny = report["summary_scores"]
    labels = ["Knowledge\nMMLU-Pro", "Instruction\nIFEval-style", "Translation\nTH/EN"]
    values = [tiny["knowledge"], tiny["instruction"], tiny["translation"]]

    fig = plt.figure(figsize=(12, 8), dpi=140)
    try:
        gs = fig.add_gridspec(2, 2, height_ratios=[1.15, 1.0])

        # Top row: measured scores as bars with the value printed above each.
        ax_bar = fig.add_subplot(gs[0, :])
        colors = ["#2979c2", "#43a047", "#f9a825"]
        bars = ax_bar.bar(labels, values, color=colors, width=0.55)
        ax_bar.set_ylim(0, 100)
        ax_bar.set_ylabel("Score (%)")
        ax_bar.set_title("TinyMind Knowledge Dashboard: Measured Local Scores")
        ax_bar.grid(axis="y", alpha=0.25)
        for bar, value in zip(bars, values):
            ax_bar.text(
                bar.get_x() + bar.get_width() / 2,
                value + 1,
                f"{value:.1f}",
                ha="center",
                fontsize=10,
            )

        # Bottom left: radar plot; the first point is repeated to close the polygon.
        ax_radar = fig.add_subplot(gs[1, 0], polar=True)
        angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
        radar_values = values + values[:1]
        radar_angles = angles + angles[:1]
        ax_radar.plot(radar_angles, radar_values, color="#2979c2", linewidth=2)
        ax_radar.fill(radar_angles, radar_values, color="#2979c2", alpha=0.18)
        ax_radar.set_xticks(angles)
        ax_radar.set_xticklabels(labels, fontsize=8)
        ax_radar.set_ylim(0, 100)
        ax_radar.set_title("Capability Shape")

        # Bottom right: parameter counts of the first four comparison rows.
        ax_size = fig.add_subplot(gs[1, 1])
        rows = report["size_comparison"][:4]
        names = [row["model"].replace("TinyMind-PureField-ReGenesis", "TinyMind") for row in rows]
        # Clamp to 1 so a zero parameter count stays drawable on the log axis.
        params = [max(row["params"], 1) for row in rows]
        ax_size.barh(names, params, color=["#2979c2", "#90caf9", "#9e9e9e", "#616161"])
        ax_size.set_xscale("log")
        ax_size.set_xlabel("Parameters (log scale)")
        ax_size.set_title("Size vs Common Baselines")
        ax_size.grid(axis="x", alpha=0.25)

        fig.tight_layout()
        fig.savefig(out_png)
    finally:
        plt.close(fig)
def run_knowledge_dashboard(
    checkpoint_path: str | Path,
    out_dir: str | Path,
    mmlu_limit: int = 20,
    safetensors_path: str | Path | None = None,
    int4_artifact_path: str | Path | None = None,
) -> dict:
    """Evaluate a checkpoint and write the dashboard JSON, Markdown and PNG.

    Loads the model, profiles its size, runs MMLU-Pro (validation split,
    limited to ``mmlu_limit``), the instruction smoke test and the translation
    smoke test, then writes ``knowledge_dashboard.json``,
    ``knowledge_dashboard.md`` and ``knowledge_dashboard.png`` into
    ``out_dir`` (created if missing).

    Args:
        checkpoint_path: Checkpoint handed to ``_load_model``.
        out_dir: Directory that receives the three artifacts.
        mmlu_limit: Passed as ``limit`` to ``run_mmlu_pro``.
        safetensors_path: Optional path forwarded to ``profile_model_size``.
        int4_artifact_path: Optional path forwarded to ``profile_model_size``.

    Returns:
        The report dict, including the paths of the written artifacts.
    """
    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    model = _load_model(checkpoint_path)
    size_profile = profile_model_size(model, checkpoint_path, safetensors_path, int4_artifact_path)
    mmlu_result = run_mmlu_pro(model, split="validation", limit=mmlu_limit)
    instruction_result = run_instruction_smoke(model)
    translation_result = run_translation_smoke(model)

    report = {
        "schema_version": "tinymind-knowledge-dashboard-v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "checkpoint_path": str(checkpoint_path),
        "measurement_scope": "local TinyMind measured; reference bands are placeholders until provider results are imported",
        "standards": [
            "MMLU-Pro for broad hard knowledge/reasoning",
            "IFEval-style strict verifiable instruction following",
            "FLORES-style Thai/English translation smoke",
        ],
        "size": size_profile,
        "size_comparison": build_size_comparison(size_profile),
        "world_reference_bands": WORLD_REFERENCE_BANDS,
        "results": {
            "mmlu_pro": mmlu_result,
            "instruction_following": instruction_result,
            "translation": translation_result,
        },
        "summary_scores": {
            # MMLU accuracy is multiplied by 100 to sit on the same scale as
            # the two smoke-test scores.
            "knowledge": 100 * mmlu_result["accuracy"],
            "instruction": instruction_result["score"],
            "translation": translation_result["score"],
        },
        "world_best_claim_allowed": False,
        "official_rank_claimed": False,
    }

    artifacts = {
        "json_path": target_dir / "knowledge_dashboard.json",
        "markdown_path": target_dir / "knowledge_dashboard.md",
        "png_path": target_dir / "knowledge_dashboard.png",
    }
    # Record the artifact locations before serialising so the JSON contains them.
    for key, location in artifacts.items():
        report[key] = str(location)

    serialized = json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True)
    artifacts["json_path"].write_text(serialized, encoding="utf-8")
    artifacts["markdown_path"].write_text(_markdown(report), encoding="utf-8")
    _plot_dashboard(report, artifacts["png_path"])
    return report
def _markdown(report: dict) -> str:
    """Render the dashboard report as a Markdown document.

    Sections, in order: measured scores, current size, comparison scope, a
    size comparison table (one row per ``report["size_comparison"]`` entry)
    and a world-standard slots table (one row per
    ``report["world_reference_bands"]`` entry). The text ends with a newline.
    """
    scores = report["summary_scores"]
    size = report["size"]

    # A size that is None is reported as "missing" rather than formatted.
    if size["checkpoint_mb"] is None:
        checkpoint_line = "- Checkpoint MB: missing"
    else:
        checkpoint_line = f"- Checkpoint MB: {size['checkpoint_mb']:.4f}"
    if size["safetensors_mb"] is None:
        safetensors_line = "- Safetensors MB: missing"
    else:
        safetensors_line = f"- Safetensors MB: {size['safetensors_mb']:.4f}"
    if size["int4_artifact_mb"] is None:
        int4_line = "- INT4 artifact MB: missing"
    else:
        int4_line = f"- INT4 artifact MB: {size['int4_artifact_mb']:.4f}"

    doc = [
        "# TinyMind Knowledge Dashboard",
        "",
        "## Measured Scores",
        "",
        f"- MMLU-Pro knowledge: {scores['knowledge']:.2f}%",
        f"- IFEval-style instruction: {scores['instruction']:.2f}%",
        f"- FLORES-style Thai/English translation: {scores['translation']:.2f}%",
        "",
        "## Current Size",
        "",
        f"- Parameters: {size['total_params']:,} ({size['million_params']:.6f}M)",
        checkpoint_line,
        safetensors_line,
        int4_line,
        "",
        "## Comparison Scope",
        "",
        "- TinyMind rows are measured locally.",
        "- World reference rows are slots for provider/leaderboard results and are not claimed as measured here.",
        "- World-best/rank-1 claim remains blocked until external official evidence exists.",
        "",
        "## Size Comparison",
        "",
        "| Model | Params | x TinyMind | Scope |",
        "|---|---:|---:|---|",
    ]
    doc.extend(
        f"| {entry['model']} | {entry['params']:,} | {entry['relative_to_tinymind_params']:.1f}x | {entry['comparison_scope']} |"
        for entry in report["size_comparison"]
    )

    doc.extend(
        [
            "",
            "## World Standard Slots",
            "",
            "| Model class | Size class | Knowledge | Instruction | Translation | Source status |",
            "|---|---:|---:|---:|---:|---|",
        ]
    )
    for band in report["world_reference_bands"]:
        cells = [
            band["model_class"],
            band["size_class"],
            _pct_or_pending(band["knowledge_mmlu_pro"]),
            _pct_or_pending(band["instruction_following"]),
            _pct_or_pending(band["translation"]),
            band["source"],
        ]
        doc.append("| " + " | ".join(f"{cell}" for cell in cells) + " |")

    # The empty final element makes the joined text end with a newline.
    doc.append("")
    return "\n".join(doc)
| def _pct_or_pending(value: float | None) -> str: | |
| return "pending" if value is None else f"{value:.1f}%" | |
Xet Storage Details
- Size:
- 13.1 kB
- Xet hash:
- a26106b343c3d682b659f4603498bbb3b5f65cee928e0a19714b6f3fa088d107
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.