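# NOTE: hosting-page chrome captured above the source ("bbkdevops's picture",
# "download", "raw", "11.2 kB"); it is not part of this module.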
from __future__ import annotations
from datetime import datetime, timezone
import json
from pathlib import Path
from typing import Any
# Task prompt per evaluation axis. build_weak_axis_curriculum_480b() places the
# matching entry after "Task:" in each generated user message; for an axis that
# is missing here it falls back to the sample's own "prompt" field and then to
# the axis name itself.
AXIS_PROMPTS: dict[str, str] = {
    "thai_language": "อธิบายความต่างระหว่างข้อมูลจริง หลักฐาน และข้อสรุป เป็นภาษาไทยธรรมชาติ พร้อมตัวอย่างสั้น ๆ",
    "english_reasoning": "Explain why a low eval loss can still be misleading when the evaluation set is contaminated.",
    "math_bound": "Prove briefly that x_t = a x_{t-1} + b_t is bounded when |a|<1 and |b_t|<=B.",
    "math_probability": "A classifier has precision 0.8 and recall 0.5. Define precision and recall, then compute F1.",
    "code_python": "Write a small Python function validate_tool_call(obj) that returns True only if obj has string name and dict arguments.",
    "raw_code_bits": "Explain signed 6-bit sign extension from a packed byte lane. Include the mask, sign bit, min, and max.",
    "systems_ffi": "Give three rules for safe Rust/C FFI ABI compatibility.",
    "grounding": "You have one source saying A and another saying not-A. What should an evidence-grounded assistant do before answering?",
    "tool_json": 'Return only JSON for a tool call named "search_web" with arguments query="Thai AI benchmark" and k=3.',
    "translation_th_en": "Translate to English: การวัดผลที่ดีต้องแยกข้อมูลฝึกออกจากข้อมูลทดสอบอย่างเด็ดขาด",
    "long_answer_control": "ตอบเป็นภาษาไทย 4 ข้อเท่านั้น: วิธีลด hallucination ในระบบ RAG",
    "self_critique": "Give a concise answer, then add one sentence explaining what evidence would falsify your answer.",
}
# Per-axis strings that are joined with ", " and shown as "Required invariants"
# in each generated user message, and that _assistant_answer() repeats in its
# fallback answer. An axis missing from this table contributes an empty list.
# NOTE(review): how (or whether) these strings are matched against model output
# is not visible in this file - presumably by an external scorer; confirm
# whether that match is case-sensitive.
AXIS_MUST: dict[str, list[str]] = {
    "thai_language": ["ข้อมูลจริง", "หลักฐาน", "ข้อสรุป"],
    "english_reasoning": ["contamination", "holdout", "generalization"],
    "math_bound": ["geometric", "B", "1-|a|"],
    "math_probability": ["precision", "recall", "0.615"],
    "code_python": ["def validate_tool_call", "name", "arguments", "dict"],
    "raw_code_bits": ["0x3f", "0x20", "-32", "31"],
    "systems_ffi": ["extern", "repr(C)", "ownership"],
    "grounding": ["compare", "source", "uncertainty"],
    "tool_json": ['"name"', '"search_web"', '"arguments"', '"k"'],
    "translation_th_en": ["training", "test", "separate"],
    "long_answer_control": ["1", "2", "3", "4"],
    "self_critique": ["falsify", "evidence"],
}
# Fixed reference answers per axis. _assistant_answer() returns the entry
# verbatim whenever one exists for the axis, ignoring the teacher response.
# Axes without an entry (currently raw_code_bits, translation_th_en and
# long_answer_control) are answered from the teacher response instead.
#
# The "code_python" answer is itself Python source that ends up in training
# rows, so it is kept in conventional 4-space indentation.
#
# NOTE(review): the "math_probability" answer spells "Precision"/"Recall" with
# capitals while AXIS_MUST lists the lowercase forms; that is only harmless if
# the downstream keyword check is case-insensitive - confirm.
CORRECTIONS: dict[str, str] = {
    "thai_language": (
        "ข้อมูลจริงคือสิ่งที่ตรวจพบหรือบันทึกได้โดยตรง เช่น ค่าอุณหภูมิ 38 องศา. "
        "หลักฐานคือข้อมูลจริงที่นำมาใช้รองรับคำตอบ เช่น ภาพถ่ายหรือบันทึกการวัด. "
        "ข้อสรุปคือการตีความจากหลักฐาน เช่น เครื่องอาจร้อนผิดปกติ. "
        "ถ้าหลักฐานยังไม่พอ ต้องบอกขอบเขตความไม่แน่นอน ไม่เดาแทนผู้ใช้."
    ),
    "english_reasoning": (
        "A low eval loss can be misleading when the evaluation set has contamination from training data. "
        "The model may memorize leaked patterns instead of showing generalization. A clean holdout set, "
        "deduplication, and paraphrased stress tests are needed before treating the loss as real capability."
    ),
    "math_bound": (
        "Unroll the recurrence: x_t = a^t x_0 + sum_{i=0}^{t-1} a^i b_{t-i}. "
        "Taking absolute values gives |x_t| <= |a|^t |x_0| + B sum_{i=0}^{t-1} |a|^i. "
        "The geometric series is at most 1/(1-|a|), so |x_t| <= |x_0| + B/(1-|a|). Thus the sequence is bounded."
    ),
    "math_probability": (
        "Precision is TP/(TP+FP): the share of predicted positives that are correct. "
        "Recall is TP/(TP+FN): the share of real positives found. "
        "F1 = 2PR/(P+R) = 2*0.8*0.5/(0.8+0.5) = 0.8/1.3 = 0.615."
    ),
    "code_python": (
        "def validate_tool_call(obj):\n"
        "    return (\n"
        "        isinstance(obj, dict)\n"
        "        and isinstance(obj.get(\"name\"), str)\n"
        "        and isinstance(obj.get(\"arguments\"), dict)\n"
        "    )\n"
    ),
    "systems_ffi": (
        "Use `extern \"C\"` so Rust and C agree on the calling convention. "
        "Use `#[repr(C)]` for shared structs so field layout and alignment match C. "
        "Define ownership clearly: who allocates, who frees, whether pointers may be null, and never unwind Rust panics across the FFI boundary."
    ),
    "grounding": (
        "An evidence-grounded assistant should compare the sources, identify which source is newer or more authoritative, "
        "quote or summarize the relevant evidence, and state uncertainty instead of choosing A or not-A without support. "
        "If the conflict cannot be resolved, it should say what evidence would decide the answer."
    ),
    "tool_json": '{"name":"search_web","arguments":{"query":"Thai AI benchmark","k":3}}',
    "self_critique": (
        "Concise answer: the claim is supported only if independent evidence matches the stated result. "
        "Evidence that would falsify my answer would be a reproducible official report showing the opposite outcome under the same benchmark conditions."
    ),
}
# Instruction sentences placed at the start of each generated user message.
# build_weak_axis_curriculum_480b() picks one by variant index modulo the list
# length, so requesting more variants than there are entries repeats them.
VARIANT_INSTRUCTIONS = [
    "Answer with maximal precision and include every required invariant.",
    "Answer naturally, but preserve exact schema/keywords required by the task.",
    "Answer compactly, verify constraints, and avoid unsupported claims.",
    "Answer as a high-quality held-out evaluation response, not a memorized template.",
]
def _read(path: str | Path) -> dict[str, Any]:
    """Parse the UTF-8 encoded JSON file at ``path`` and return its content.

    The return annotation says ``dict``, but no shape check is performed: the
    caller receives whatever top-level value the JSON document holds.
    """
    with Path(path).open("r", encoding="utf-8") as handle:
        return json.load(handle)
def _first_result(report: dict[str, Any]) -> dict[str, Any]:
    """Return a shallow copy of the first entry of ``report["results"]``.

    An empty dict is returned when the key is missing or its value is falsy
    (for example ``None`` or an empty list).
    """
    results = report.get("results")
    if not results:
        return {}
    # dict(...) copies the entry so callers cannot mutate the parsed report.
    return dict(results[0])
def _samples_by_axis(report: dict[str, Any]) -> dict[str, dict[str, Any]]:
    """Index the first result's samples by their ``axis`` value.

    Args:
        report: Parsed evaluation report; only its first ``results`` entry is
            inspected (via ``_first_result``).

    Returns:
        A mapping from ``str(axis)`` to a shallow copy of the sample row. Rows
        whose ``axis`` is missing or falsy are skipped. When several rows share
        an axis, the last one wins.
    """
    # ``or []`` covers both a missing key and an explicit JSON null;
    # ``.get("samples", [])`` would hand back None for the latter and the
    # iteration below would raise TypeError. This mirrors how _first_result
    # guards ``results``.
    samples = _first_result(report).get("samples") or []
    return {str(row.get("axis")): dict(row) for row in samples if row.get("axis")}
def _teacher_model(report: dict[str, Any]) -> str:
    """Return the first result's ``model_id`` as a string.

    Falls back to the placeholder ``"unknown-480b-teacher"`` when the field is
    missing or falsy.
    """
    model_id = _first_result(report).get("model_id")
    if not model_id:
        return "unknown-480b-teacher"
    return str(model_id)
def _assistant_answer(axis: str, teacher_response: str, teacher_score: int) -> str:
    """Choose the assistant text for one axis.

    Precedence:
      1. A non-empty ``CORRECTIONS`` entry for the axis, returned verbatim.
      2. The stripped teacher response, when the teacher scored at least 4 and
         the response is not blank.
      3. A fallback that lists the axis' ``AXIS_MUST`` strings, followed by a
         blank line and the stripped teacher response (which may be empty).
    """
    curated = CORRECTIONS.get(axis)
    if curated:
        return curated

    teacher_text = teacher_response.strip()
    if teacher_score >= 4 and teacher_text:
        return teacher_text

    invariants = ", ".join(AXIS_MUST.get(axis, []))
    return "Answer must explicitly satisfy these invariants: " + invariants + ".\n\n" + teacher_text
def build_weak_axis_curriculum_480b(
    out_dir: str | Path,
    *,
    local_report: str | Path,
    remote_report: str | Path,
    variants_per_axis: int = 4,
) -> dict[str, Any]:
    """Write SFT rows for the axes where the local model is weak, plus a report.

    Every axis found in the local report's first result is compared with the
    same axis in the remote (teacher) report. An axis is skipped only when the
    local score is at least 4 and the teacher score does not exceed it; every
    other axis is treated as weak. Missing or falsy scores count as 0, and an
    axis absent from the remote report gets teacher score 0 and an empty
    teacher response.

    For each weak axis, ``variants_per_axis`` rows are produced (at least one).
    They differ only in the instruction sentence taken from
    ``VARIANT_INSTRUCTIONS`` (cycled by index) and in ``metadata.variant``.

    Args:
        out_dir: Directory for the outputs; created if needed.
        local_report: Path to the local model's JSON evaluation report.
        remote_report: Path to the teacher model's JSON evaluation report.
        variants_per_axis: Rows to emit per weak axis; values below 1 are
            raised to 1.

    Returns:
        The report dict that is also written to
        ``weak_axis_480b_curriculum_report.json`` in ``out_dir``. The rows go to
        ``weak_axis_480b_curriculum_sft.jsonl`` in the same directory.

    Raises:
        ValueError, TypeError: If ``variants_per_axis`` or a sample score
            cannot be converted with ``int()``.
        OSError, json.JSONDecodeError: If a report cannot be read or parsed,
            or an output file cannot be written.
    """
    # Resolve the variant count once, before touching the filesystem, so a bad
    # value fails fast and the loop and the report summary cannot disagree.
    variant_count = max(1, int(variants_per_axis))

    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    local = _read(local_report)
    remote = _read(remote_report)
    local_samples = _samples_by_axis(local)
    remote_samples = _samples_by_axis(remote)
    teacher_model = _teacher_model(remote)

    system_content = (
        "You are TinyMind distilled from a 480B comparison. "
        "Fix weak-axis behavior with exact constraints, natural language, and no fabricated claims."
    )

    weak_axes: list[str] = []
    rows: list[dict[str, Any]] = []
    for axis, local_sample in local_samples.items():
        local_score = int(local_sample.get("score", 0) or 0)
        remote_sample = remote_samples.get(axis, {})
        remote_score = int(remote_sample.get("score", 0) or 0)
        # Not weak: the local model already scores well and the teacher is no better.
        if local_score >= 4 and remote_score <= local_score:
            continue
        weak_axes.append(axis)

        teacher_response = str(remote_sample.get("response") or "")
        answer = _assistant_answer(axis, teacher_response, remote_score)
        # _assistant_answer() discards the teacher text whenever a curated
        # correction exists for the axis, so only report the teacher response
        # as "used" when it actually ends up in the answer.
        teacher_response_used = bool(teacher_response.strip()) and not CORRECTIONS.get(axis)

        prompt = AXIS_PROMPTS.get(axis, str(local_sample.get("prompt") or axis))
        invariants = ", ".join(AXIS_MUST.get(axis, []))
        # Axes the local model scored 2 or lower on get the heavier weight.
        loss_weight = 2.0 if local_score <= 2 else 1.5

        for index in range(variant_count):
            instruction = VARIANT_INSTRUCTIONS[index % len(VARIANT_INSTRUCTIONS)]
            rows.append(
                {
                    "source": "qwen3_coder_480b_weak_axis_distill",
                    "category": axis,
                    "messages": [
                        {"role": "system", "content": system_content},
                        {
                            "role": "user",
                            "content": f"{instruction}\nRequired invariants: {invariants}\nTask: {prompt}",
                        },
                        {"role": "assistant", "content": answer},
                    ],
                    "metadata": {
                        "axis": axis,
                        "variant": index,
                        "teacher_model": teacher_model,
                        "teacher_score": remote_score,
                        "tinymind_score_before": local_score,
                        "teacher_response_used": teacher_response_used,
                        "loss_weight": loss_weight,
                        "quality_tags": ["480b_distill", "weak_axis_repair", "constraint_fidelity"],
                    },
                }
            )

    # One JSON object per line; newline="\n" keeps line endings identical on every platform.
    sft_path = out / "weak_axis_480b_curriculum_sft.jsonl"
    with sft_path.open("w", encoding="utf-8", newline="\n") as handle:
        for row in rows:
            handle.write(json.dumps(row, ensure_ascii=False, sort_keys=True) + "\n")

    report_path = out / "weak_axis_480b_curriculum_report.json"
    report = {
        "schema_version": "tinymind-weak-axis-480b-curriculum-v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "local_report": str(local_report),
        "remote_report": str(remote_report),
        "sft_path": str(sft_path),
        "summary": {
            "weak_axes": weak_axes,
            "weak_axis_count": len(weak_axes),
            "sft_rows": len(rows),
            "variants_per_axis": variant_count,
        },
        "claim_gate": {
            "uses_480b_teacher_outputs": True,
            "official_superiority_claim_allowed": False,
            "reason": "This creates targeted distillation rows from local comparison evidence; it does not prove superiority over the 480B teacher.",
        },
        "json_path": str(report_path),
    }
    report_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
    return report

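# --- Hosting-page footer captured with the file; not part of this module. ---
# Xet Storage Details
# Size: 11.2 kB
# Xet hash: 0802596410e753db20aa65bbc57d7d4561d2948ff091815bab9a3e2d223a0057
# Xet efficiently stores files, intelligently splitting them into unique chunks
# and accelerating uploads and downloads. (The page's "More info" link target
# was not captured.)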