bbkdevops's picture
download
raw
13.1 kB
"""World-standard knowledge dashboard for TinyMind.
The dashboard mirrors common model-report layouts: translation bars,
instruction/domain radar summaries, and size comparisons. Local TinyMind
scores are measured; external rows are reference bands unless explicitly
replaced by a measured provider result.
"""
from __future__ import annotations
from datetime import datetime, timezone
import json
from pathlib import Path
from typing import Sequence
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np
import torch
from evaluation.local_evidence import _encode
from evaluation.official_hard_eval import (
build_size_comparison,
profile_model_size,
run_mmlu_pro,
_load_model,
)
# Rows rendered in the "World Standard Slots" table of the markdown report and
# embedded verbatim in the JSON report under "world_reference_bands".
#
# Only the first row (the chance baseline) carries numeric scores. Every other
# row keeps its three metric fields as None, which the markdown renderer shows
# as "pending"; the "source" tag records what kind of evidence still has to be
# supplied before the slot can hold a number.
WORLD_REFERENCE_BANDS = [
    # Metric floor: presumably the expected accuracy of random guessing on a
    # 10-option multiple-choice benchmark (10%) -- confirm against the MMLU-Pro setup.
    {
        "model_class": "chance_10_way_mcq",
        "size_class": "metric floor",
        "params_b": 0.0,
        "knowledge_mmlu_pro": 10.0,
        "instruction_following": 0.0,
        "translation": 0.0,
        "source": "metric_baseline",
    },
    # Placeholder slot: no measured result imported yet.
    {
        "model_class": "sub_1b_open_llm",
        "size_class": "<1B",
        "params_b": 0.5,
        "knowledge_mmlu_pro": None,
        "instruction_following": None,
        "translation": None,
        "source": "reference_slot_fill_with_measured_provider_result",
    },
    # Placeholder slot: no measured result imported yet.
    {
        "model_class": "7b_open_llm",
        "size_class": "7B",
        "params_b": 7.0,
        "knowledge_mmlu_pro": None,
        "instruction_following": None,
        "translation": None,
        "source": "reference_slot_fill_with_measured_provider_result",
    },
    # Placeholder slot: no measured result imported yet.
    {
        "model_class": "70b_open_llm",
        "size_class": "70B",
        "params_b": 70.0,
        "knowledge_mmlu_pro": None,
        "instruction_following": None,
        "translation": None,
        "source": "reference_slot_fill_with_measured_provider_result",
    },
    # Placeholder slot: parameter count is left as None as well, and the
    # source tag asks for an official provider result.
    {
        "model_class": "frontier_closed_llm",
        "size_class": "frontier",
        "params_b": None,
        "knowledge_mmlu_pro": None,
        "instruction_following": None,
        "translation": None,
        "source": "reference_slot_requires_official_provider_result",
    },
]
# Fixed Thai/English items used by run_translation_smoke. Each "prompt" is fed
# to the model as-is and the generated text is compared against "reference"
# with a character-level F1 score. "source_lang"/"target_lang" are carried
# through into the per-item report rows.
TRANSLATION_ITEMS = [
    {
        "id": "en_th_1",
        "source_lang": "en",
        "target_lang": "th",
        "prompt": "Translate to Thai exactly: The ledger stores evidence before claims.",
        "reference": "บัญชีหลักฐานเก็บหลักฐานก่อนการอ้างสิทธิ์",
    },
    {
        "id": "th_en_1",
        "source_lang": "th",
        "target_lang": "en",
        "prompt": "Translate to English exactly: โมเดลต้องวัดผลก่อนอ้างว่าเก่งที่สุด",
        "reference": "The model must be evaluated before claiming it is the best.",
    },
    {
        "id": "en_th_2",
        "source_lang": "en",
        "target_lang": "th",
        "prompt": "Translate to Thai exactly: Stable memory needs exact retrieval.",
        "reference": "ความจำที่เสถียรต้องการการดึงคืนที่แม่นยำ",
    },
]
# Fixed prompts used by run_instruction_smoke. "check" names the rule that
# _instruction_ok applies to the generated text:
#   json_object          -> output parses as a JSON object with "answer" and "evidence"
#   starts_with_verified -> output starts with "Verified:" and has at most one "."
#   three_bullets        -> exactly three lines start with "-" or "*"
INSTRUCTION_ITEMS = [
    {
        "id": "json_only",
        "prompt": 'Return only compact JSON with keys "answer" and "evidence".',
        "check": "json_object",
    },
    {
        "id": "prefix",
        "prompt": 'Answer with exactly one sentence that starts with "Verified:".',
        "check": "starts_with_verified",
    },
    {
        "id": "three_bullets",
        "prompt": "Give exactly three bullet points about safe benchmark claims.",
        "check": "three_bullets",
    },
]
@torch.no_grad()
def _generate_text(model, prompt: str, max_new_tokens: int = 40) -> str:
    """Sample a continuation of ``prompt`` from ``model`` and return it as text.

    Args:
        model: Object exposing ``cfg.max_seq_len``, ``cfg.vocab_size`` and a
            ``generate(ids, max_new_tokens=..., temperature=..., top_p=...)``
            method.
        prompt: Text to encode and feed to the model.
        max_new_tokens: Upper bound on generated tokens passed to ``generate``.

    Returns:
        The decoded continuation with surrounding whitespace stripped.

    Note:
        Generation samples with ``temperature=0.7`` / ``top_p=0.9``, so the
        output is not deterministic unless the caller seeds the RNG.
    """
    ids = _encode(prompt, model.cfg.max_seq_len, model.cfg.vocab_size).unsqueeze(0)
    out = model.generate(ids, max_new_tokens=max_new_tokens, temperature=0.7, top_p=0.9)
    # Remove the prompt by token position rather than by ``len(prompt)``
    # characters of the decoded string: the character count stops matching as
    # soon as the prompt is truncated by the encoder or the decoder drops
    # bytes, which would leak prompt text into (or cut text out of) the result.
    # Assumes ``generate`` returns the prompt ids followed by the new ids, as
    # the previous character-based slicing also did -- TODO confirm.
    prompt_len = ids.shape[-1]
    continuation = out[0].tolist()[prompt_len:]
    return _decode_ids(continuation, model.cfg.vocab_size).strip()
def _decode_ids(ids: Sequence[int], vocab_size: int) -> str:
usable = max(vocab_size - 4, 1)
data = bytes((int(tok) - 4) % usable for tok in ids if int(tok) >= 4)
return data.decode("utf-8", errors="ignore")
def _char_f1(candidate: str, reference: str) -> float:
cand = [c for c in candidate.lower() if not c.isspace()]
ref = [c for c in reference.lower() if not c.isspace()]
if not cand or not ref:
return 0.0
common = 0
ref_counts: dict[str, int] = {}
for c in ref:
ref_counts[c] = ref_counts.get(c, 0) + 1
for c in cand:
if ref_counts.get(c, 0) > 0:
common += 1
ref_counts[c] -= 1
precision = common / len(cand)
recall = common / len(ref)
if precision + recall == 0:
return 0.0
return 2 * precision * recall / (precision + recall)
def run_translation_smoke(model) -> dict:
    """Generate a translation for every item in TRANSLATION_ITEMS and score it.

    Each item is scored with character-level F1 (as a percentage) against its
    reference. The result holds the per-item rows (the item fields plus
    "generated" and "score") and the mean score under "score".
    """
    scored = []
    for spec in TRANSLATION_ITEMS:
        hypothesis = _generate_text(model, spec["prompt"])
        percent = _char_f1(hypothesis, spec["reference"]) * 100
        scored.append(dict(spec, generated=hypothesis, score=percent))

    total = sum(entry["score"] for entry in scored)
    # max(..., 1) keeps the mean defined if the item list is ever empty.
    mean_score = total / max(len(scored), 1)
    return {
        "benchmark": "TinyMind-FLORES-style Thai/English translation smoke",
        "metric": "character_F1_percent",
        "official_rank_claimed": False,
        "score": mean_score,
        "rows": scored,
    }
def _instruction_ok(output: str, check: str) -> bool:
text = output.strip()
if check == "json_object":
try:
parsed = json.loads(text)
return isinstance(parsed, dict) and {"answer", "evidence"} <= set(parsed)
except Exception:
return False
if check == "starts_with_verified":
return text.startswith("Verified:") and text.count(".") <= 1
if check == "three_bullets":
bullets = [line for line in text.splitlines() if line.strip().startswith(("-", "*"))]
return len(bullets) == 3
return False
def run_instruction_smoke(model) -> dict:
    """Run every prompt in INSTRUCTION_ITEMS and check the output against its rule.

    The result holds the per-item rows (the item fields plus "generated" and
    "correct"), the number of passing items, the sample count, and the pass
    rate as a percentage under "score".
    """
    rows = []
    for spec in INSTRUCTION_ITEMS:
        reply = _generate_text(model, spec["prompt"])
        passed = _instruction_ok(reply, spec["check"])
        rows.append(dict(spec, generated=reply, correct=passed))

    n_correct = sum(int(row["correct"]) for row in rows)
    n_samples = len(rows)
    return {
        "benchmark": "IFEval-style verifiable instruction smoke",
        "metric": "strict_rule_accuracy_percent",
        "official_rank_claimed": False,
        # max(..., 1) keeps the rate defined if the item list is ever empty.
        "score": 100 * n_correct / max(n_samples, 1),
        "correct": n_correct,
        "samples": n_samples,
        "rows": rows,
    }
def _plot_dashboard(report: dict, out_png: Path) -> None:
    """Render the dashboard figure for ``report`` and save it to ``out_png``.

    The figure has three panels: a bar chart of the three summary scores, a
    radar plot of the same scores, and a log-scale horizontal bar chart of
    parameter counts for the first four rows of ``report["size_comparison"]``.

    Args:
        report: Must provide ``summary_scores`` (keys ``knowledge``,
            ``instruction``, ``translation``) and ``size_comparison`` (rows
            with ``model`` and ``params``).
        out_png: Destination file for the rendered image.
    """
    tiny = report["summary_scores"]
    labels = ["Knowledge\nMMLU-Pro", "Instruction\nIFEval-style", "Translation\nTH/EN"]
    values = [tiny["knowledge"], tiny["instruction"], tiny["translation"]]
    fig = plt.figure(figsize=(12, 8), dpi=140)
    # Close the figure even when drawing or saving fails, so pyplot does not
    # keep a half-built figure alive.
    try:
        gs = fig.add_gridspec(2, 2, height_ratios=[1.15, 1.0])

        # Top row: measured scores as bars with the value printed above each bar.
        ax_bar = fig.add_subplot(gs[0, :])
        colors = ["#2979c2", "#43a047", "#f9a825"]
        bars = ax_bar.bar(labels, values, color=colors, width=0.55)
        ax_bar.set_ylim(0, 100)
        ax_bar.set_ylabel("Score (%)")
        ax_bar.set_title("TinyMind Knowledge Dashboard: Measured Local Scores")
        ax_bar.grid(axis="y", alpha=0.25)
        for bar, value in zip(bars, values):
            ax_bar.text(bar.get_x() + bar.get_width() / 2, value + 1, f"{value:.1f}", ha="center", fontsize=10)

        # Bottom left: radar plot; the first point is repeated to close the polygon.
        ax_radar = fig.add_subplot(gs[1, 0], polar=True)
        angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
        radar_values = values + values[:1]
        radar_angles = angles + angles[:1]
        ax_radar.plot(radar_angles, radar_values, color="#2979c2", linewidth=2)
        ax_radar.fill(radar_angles, radar_values, color="#2979c2", alpha=0.18)
        ax_radar.set_xticks(angles)
        ax_radar.set_xticklabels(labels, fontsize=8)
        ax_radar.set_ylim(0, 100)
        ax_radar.set_title("Capability Shape")

        # Bottom right: parameter counts of the first four comparison rows.
        ax_size = fig.add_subplot(gs[1, 1])
        rows = report["size_comparison"][:4]
        names = [row["model"].replace("TinyMind-PureField-ReGenesis", "TinyMind") for row in rows]
        # Clamp to 1 so a zero count can still be drawn on the log axis.
        params = [max(row["params"], 1) for row in rows]
        ax_size.barh(names, params, color=["#2979c2", "#90caf9", "#9e9e9e", "#616161"])
        ax_size.set_xscale("log")
        ax_size.set_xlabel("Parameters (log scale)")
        ax_size.set_title("Size vs Common Baselines")
        ax_size.grid(axis="x", alpha=0.25)

        fig.tight_layout()
        fig.savefig(out_png)
    finally:
        plt.close(fig)
def run_knowledge_dashboard(
    checkpoint_path: str | Path,
    out_dir: str | Path,
    mmlu_limit: int = 20,
    safetensors_path: str | Path | None = None,
    int4_artifact_path: str | Path | None = None,
) -> dict:
    """Evaluate a checkpoint and write the dashboard as JSON, markdown and PNG.

    Loads the model, profiles its size, runs the MMLU-Pro, instruction and
    translation evaluations (in that order), then writes
    ``knowledge_dashboard.json``, ``knowledge_dashboard.md`` and
    ``knowledge_dashboard.png`` into ``out_dir`` (created if missing).

    Args:
        checkpoint_path: Checkpoint to load and evaluate.
        out_dir: Directory that receives the three output files.
        mmlu_limit: Passed to ``run_mmlu_pro`` as ``limit``.
        safetensors_path: Optional path forwarded to ``profile_model_size``.
        int4_artifact_path: Optional path forwarded to ``profile_model_size``.

    Returns:
        The report dict that was serialized, including the output file paths.
    """
    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    json_file = target_dir / "knowledge_dashboard.json"
    md_file = target_dir / "knowledge_dashboard.md"
    png_file = target_dir / "knowledge_dashboard.png"

    model = _load_model(checkpoint_path)
    size_profile = profile_model_size(model, checkpoint_path, safetensors_path, int4_artifact_path)
    mmlu_result = run_mmlu_pro(model, split="validation", limit=mmlu_limit)
    instruction_result = run_instruction_smoke(model)
    translation_result = run_translation_smoke(model)

    # MMLU accuracy is scaled by 100 to sit beside the two percent scores.
    summary_scores = {
        "knowledge": 100 * mmlu_result["accuracy"],
        "instruction": instruction_result["score"],
        "translation": translation_result["score"],
    }
    report = {
        "schema_version": "tinymind-knowledge-dashboard-v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "checkpoint_path": str(checkpoint_path),
        "measurement_scope": "local TinyMind measured; reference bands are placeholders until provider results are imported",
        "standards": [
            "MMLU-Pro for broad hard knowledge/reasoning",
            "IFEval-style strict verifiable instruction following",
            "FLORES-style Thai/English translation smoke",
        ],
        "size": size_profile,
        "size_comparison": build_size_comparison(size_profile),
        "world_reference_bands": WORLD_REFERENCE_BANDS,
        "results": {
            "mmlu_pro": mmlu_result,
            "instruction_following": instruction_result,
            "translation": translation_result,
        },
        "summary_scores": summary_scores,
        "world_best_claim_allowed": False,
        "official_rank_claimed": False,
        # Output locations are part of the report so the JSON is self-describing.
        "json_path": str(json_file),
        "markdown_path": str(md_file),
        "png_path": str(png_file),
    }

    serialized = json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True)
    json_file.write_text(serialized, encoding="utf-8")
    md_file.write_text(_markdown(report), encoding="utf-8")
    _plot_dashboard(report, png_file)
    return report
def _markdown(report: dict) -> str:
    """Render ``report`` as the markdown dashboard document.

    Sections: measured scores, current size (a size recorded as None is
    printed as "missing"), comparison scope notes, the size comparison table,
    and the world reference slots table (None metrics print as "pending").
    The returned text ends with a newline.
    """
    scores = report["summary_scores"]
    size = report["size"]

    def size_line(label: str, key: str) -> str:
        # A size recorded as None is reported as "missing".
        megabytes = size[key]
        if megabytes is None:
            return f"- {label}: missing"
        return f"- {label}: {megabytes:.4f}"

    doc = [
        "# TinyMind Knowledge Dashboard",
        "",
        "## Measured Scores",
        "",
        f"- MMLU-Pro knowledge: {scores['knowledge']:.2f}%",
        f"- IFEval-style instruction: {scores['instruction']:.2f}%",
        f"- FLORES-style Thai/English translation: {scores['translation']:.2f}%",
        "",
        "## Current Size",
        "",
        f"- Parameters: {size['total_params']:,} ({size['million_params']:.6f}M)",
        size_line("Checkpoint MB", "checkpoint_mb"),
        size_line("Safetensors MB", "safetensors_mb"),
        size_line("INT4 artifact MB", "int4_artifact_mb"),
        "",
        "## Comparison Scope",
        "",
        "- TinyMind rows are measured locally.",
        "- World reference rows are slots for provider/leaderboard results and are not claimed as measured here.",
        "- World-best/rank-1 claim remains blocked until external official evidence exists.",
        "",
        "## Size Comparison",
        "",
        "| Model | Params | x TinyMind | Scope |",
        "|---|---:|---:|---|",
    ]
    doc.extend(
        f"| {entry['model']} | {entry['params']:,} | {entry['relative_to_tinymind_params']:.1f}x | {entry['comparison_scope']} |"
        for entry in report["size_comparison"]
    )

    doc.extend(
        [
            "",
            "## World Standard Slots",
            "",
            "| Model class | Size class | Knowledge | Instruction | Translation | Source status |",
            "|---|---:|---:|---:|---:|---|",
        ]
    )
    for band in report["world_reference_bands"]:
        knowledge = _pct_or_pending(band["knowledge_mmlu_pro"])
        instruction = _pct_or_pending(band["instruction_following"])
        translation = _pct_or_pending(band["translation"])
        doc.append(
            f"| {band['model_class']} | {band['size_class']} | "
            f"{knowledge} | {instruction} | {translation} | {band['source']} |"
        )

    # Trailing empty entry makes the joined document end with a newline.
    doc.append("")
    return "\n".join(doc)
def _pct_or_pending(value: float | None) -> str:
return "pending" if value is None else f"{value:.1f}%"

Xet Storage Details

Size:
13.1 kB
·
Xet hash:
a26106b343c3d682b659f4603498bbb3b5f65cee928e0a19714b6f3fa088d107

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.