Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

Files

xet

bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /data /weak_axis_curriculum_480b.py

bbkdevops

about 1 month ago

download

raw

11.2 kB

	from __future__ import annotations

	from datetime import datetime, timezone
	import json
	from pathlib import Path
	from typing import Any


	# Task prompt for each evaluation axis, keyed by axis name. The curriculum
	# builder prefers these over the prompt recorded in the local report.
	# The math prompt uses plain ``|a|`` bars: a backslash before ``|`` is an
	# invalid escape sequence and would leak a literal backslash into the text.
	AXIS_PROMPTS: dict[str, str] = {
	    "thai_language": "อธิบายความต่างระหว่างข้อมูลจริง หลักฐาน และข้อสรุป เป็นภาษาไทยธรรมชาติ พร้อมตัวอย่างสั้น ๆ",
	    "english_reasoning": "Explain why a low eval loss can still be misleading when the evaluation set is contaminated.",
	    "math_bound": "Prove briefly that x_t = a x_{t-1} + b_t is bounded when |a|<1 and |b_t|<=B.",
	    "math_probability": "A classifier has precision 0.8 and recall 0.5. Define precision and recall, then compute F1.",
	    "code_python": "Write a small Python function validate_tool_call(obj) that returns True only if obj has string name and dict arguments.",
	    "raw_code_bits": "Explain signed 6-bit sign extension from a packed byte lane. Include the mask, sign bit, min, and max.",
	    "systems_ffi": "Give three rules for safe Rust/C FFI ABI compatibility.",
	    "grounding": "You have one source saying A and another saying not-A. What should an evidence-grounded assistant do before answering?",
	    "tool_json": 'Return only JSON for a tool call named "search_web" with arguments query="Thai AI benchmark" and k=3.',
	    "translation_th_en": "Translate to English: การวัดผลที่ดีต้องแยกข้อมูลฝึกออกจากข้อมูลทดสอบอย่างเด็ดขาด",
	    "long_answer_control": "ตอบเป็นภาษาไทย 4 ข้อเท่านั้น: วิธีลด hallucination ในระบบ RAG",
	    "self_critique": "Give a concise answer, then add one sentence explaining what evidence would falsify your answer.",
	}


	# Required-invariant keywords per axis. They are interpolated into the user
	# prompt ("Required invariants: ...") and into the fallback answer preamble.
	# The math keyword is the plain text ``1-|a|`` (no backslashes) so that it
	# matches the reference answer in CORRECTIONS below.
	AXIS_MUST: dict[str, list[str]] = {
	    "thai_language": ["ข้อมูลจริง", "หลักฐาน", "ข้อสรุป"],
	    "english_reasoning": ["contamination", "holdout", "generalization"],
	    "math_bound": ["geometric", "B", "1-|a|"],
	    "math_probability": ["precision", "recall", "0.615"],
	    "code_python": ["def validate_tool_call", "name", "arguments", "dict"],
	    "raw_code_bits": ["0x3f", "0x20", "-32", "31"],
	    "systems_ffi": ["extern", "repr(C)", "ownership"],
	    "grounding": ["compare", "source", "uncertainty"],
	    "tool_json": ['"name"', '"search_web"', '"arguments"', '"k"'],
	    "translation_th_en": ["training", "test", "separate"],
	    "long_answer_control": ["1", "2", "3", "4"],
	    "self_critique": ["falsify", "evidence"],
	}


	# Hand-written reference answers. When an axis has an entry here it is used
	# as the assistant turn in preference to the teacher response; axes without
	# an entry fall back to the teacher output.
	CORRECTIONS: dict[str, str] = {
	    "thai_language": (
	        "ข้อมูลจริงคือสิ่งที่ตรวจพบหรือบันทึกได้โดยตรง เช่น ค่าอุณหภูมิ 38 องศา. "
	        "หลักฐานคือข้อมูลจริงที่นำมาใช้รองรับคำตอบ เช่น ภาพถ่ายหรือบันทึกการวัด. "
	        "ข้อสรุปคือการตีความจากหลักฐาน เช่น เครื่องอาจร้อนผิดปกติ. "
	        "ถ้าหลักฐานยังไม่พอ ต้องบอกขอบเขตความไม่แน่นอน ไม่เดาแทนผู้ใช้."
	    ),
	    "english_reasoning": (
	        "A low eval loss can be misleading when the evaluation set has contamination from training data. "
	        "The model may memorize leaked patterns instead of showing generalization. A clean holdout set, "
	        "deduplication, and paraphrased stress tests are needed before treating the loss as real capability."
	    ),
	    # Absolute-value bars are written as plain ``|``; escaping them with a
	    # backslash would put literal backslashes into the training answer.
	    "math_bound": (
	        "Unroll the recurrence: x_t = a^t x_0 + sum_{i=0}^{t-1} a^i b_{t-i}. "
	        "Taking absolute values gives |x_t| <= |a|^t |x_0| + B sum_{i=0}^{t-1} |a|^i. "
	        "The geometric series is at most 1/(1-|a|), so |x_t| <= |x_0| + B/(1-|a|). Thus the sequence is bounded."
	    ),
	    # Multiplication signs are explicit: 2*0.8*0.5 = 0.8, and 0.8/1.3 = 0.615.
	    "math_probability": (
	        "Precision is TP/(TP+FP): the share of predicted positives that are correct. "
	        "Recall is TP/(TP+FN): the share of real positives found. "
	        "F1 = 2*P*R/(P+R) = 2*0.8*0.5/(0.8+0.5) = 0.8/1.3 = 0.615."
	    ),
	    # The embedded function uses conventional 4-space indentation so the
	    # answer is idiomatic Python when rendered.
	    "code_python": (
	        "def validate_tool_call(obj):\n"
	        "    return (\n"
	        "        isinstance(obj, dict)\n"
	        "        and isinstance(obj.get(\"name\"), str)\n"
	        "        and isinstance(obj.get(\"arguments\"), dict)\n"
	        "    )\n"
	    ),
	    "systems_ffi": (
	        "Use `extern \"C\"` so Rust and C agree on the calling convention. "
	        "Use `#[repr(C)]` for shared structs so field layout and alignment match C. "
	        "Define ownership clearly: who allocates, who frees, whether pointers may be null, and never unwind Rust panics across the FFI boundary."
	    ),
	    "grounding": (
	        "An evidence-grounded assistant should compare the sources, identify which source is newer or more authoritative, "
	        "quote or summarize the relevant evidence, and state uncertainty instead of choosing A or not-A without support. "
	        "If the conflict cannot be resolved, it should say what evidence would decide the answer."
	    ),
	    "tool_json": '{"name":"search_web","arguments":{"query":"Thai AI benchmark","k":3}}',
	    "self_critique": (
	        "Concise answer: the claim is supported only if independent evidence matches the stated result. "
	        "Evidence that would falsify my answer would be a reproducible official report showing the opposite outcome under the same benchmark conditions."
	    ),
	}


	# Instruction prefixes rotated across the variants generated for one axis:
	# variant ``i`` takes the entry at ``i % len(VARIANT_INSTRUCTIONS)``.
	VARIANT_INSTRUCTIONS: list[str] = [
	    "Answer with maximal precision and include every required invariant.",
	    "Answer naturally, but preserve exact schema/keywords required by the task.",
	    "Answer compactly, verify constraints, and avoid unsupported claims.",
	    "Answer as a high-quality held-out evaluation response, not a memorized template.",
	]


	def _read(path: str \| Path) -> dict[str, Any]:
	return json.loads(Path(path).read_text(encoding="utf-8"))


	def _first_result(report: dict[str, Any]) -> dict[str, Any]:
	results = report.get("results") or []
	return dict(results[0]) if results else {}


	def _samples_by_axis(report: dict[str, Any]) -> dict[str, dict[str, Any]]:
	    """Index the first result's samples by their ``"axis"`` value.

	    Args:
	        report: Parsed report mapping, passed to ``_first_result``.

	    Returns:
	        A dict mapping ``str(axis)`` to a shallow copy of the sample row.
	        Rows without a truthy ``"axis"`` and rows that are not mappings are
	        skipped. When several rows share an axis, the last one wins.
	    """
	    # ``or []`` also covers an explicit ``"samples": null`` in the report,
	    # mirroring how ``"results"`` is read.
	    samples = _first_result(report).get("samples") or []
	    by_axis: dict[str, dict[str, Any]] = {}
	    for row in samples:
	        if not isinstance(row, dict):
	            continue
	        axis = row.get("axis")
	        if axis:
	            by_axis[str(axis)] = dict(row)
	    return by_axis


	def _teacher_model(report: dict[str, Any]) -> str:
	    """Return the ``"model_id"`` of the report's first result as a string.

	    Falls back to ``"unknown-480b-teacher"`` when the id is missing or falsy.
	    """
	    model_id = _first_result(report).get("model_id")
	    return str(model_id) if model_id else "unknown-480b-teacher"


	def _assistant_answer(axis: str, teacher_response: str, teacher_score: int) -> str:
	    """Choose the assistant turn for one axis.

	    Preference order:
	      1. the curated entry in ``CORRECTIONS`` for ``axis``, if non-empty;
	      2. the stripped teacher response, when ``teacher_score`` is at least 4
	         and the response is not blank;
	      3. otherwise a preamble listing the axis invariants from ``AXIS_MUST``,
	         followed by a blank line and the stripped teacher response.
	    """
	    curated = CORRECTIONS.get(axis)
	    if curated:
	        return curated

	    cleaned = teacher_response.strip()
	    if teacher_score >= 4 and cleaned:
	        return cleaned

	    invariants = ", ".join(AXIS_MUST.get(axis, []))
	    return f"Answer must explicitly satisfy these invariants: {invariants}.\n\n{cleaned}"


	def build_weak_axis_curriculum_480b(
	    out_dir: str | Path,
	    *,
	    local_report: str | Path,
	    remote_report: str | Path,
	    variants_per_axis: int = 4,
	) -> dict[str, Any]:
	    """Build SFT rows for the axes where the local model is weak.

	    Both reports are read as JSON and their first result's samples are
	    indexed by axis. Only axes present in the local report are considered.
	    An axis is skipped when the local score is at least 4 and the remote
	    (teacher) score does not exceed it; every other axis is "weak" and gets
	    ``variants_per_axis`` chat rows that differ only in the instruction
	    prefix taken from ``VARIANT_INSTRUCTIONS``.

	    Two files are written into ``out_dir`` (created if needed):
	    ``weak_axis_480b_curriculum_sft.jsonl`` with one JSON row per line, and
	    ``weak_axis_480b_curriculum_report.json`` with the summary.

	    Args:
	        out_dir: Directory that receives the two output files.
	        local_report: Path to the local model's evaluation report (JSON).
	        remote_report: Path to the teacher model's evaluation report (JSON).
	        variants_per_axis: Rows to emit per weak axis; values below 1 are
	            clamped to 1.

	    Returns:
	        The report dict that is also written to the report JSON file,
	        including a ``"json_path"`` key pointing at that file.

	    Raises:
	        OSError: If a report cannot be read or an output cannot be written.
	        json.JSONDecodeError: If a report is not valid JSON.
	        ValueError: If a score or ``variants_per_axis`` cannot be converted
	            to ``int``.
	    """
	    out = Path(out_dir)
	    out.mkdir(parents=True, exist_ok=True)
	    local_samples = _samples_by_axis(_read(local_report))
	    remote = _read(remote_report)
	    remote_samples = _samples_by_axis(remote)
	    teacher_model = _teacher_model(remote)

	    # Loop-invariant values, computed once instead of once per row.
	    variant_count = max(1, int(variants_per_axis))
	    system_message = (
	        "You are TinyMind distilled from a 480B comparison. "
	        "Fix weak-axis behavior with exact constraints, natural language, and no fabricated claims."
	    )

	    weak_axes: list[str] = []
	    rows: list[dict[str, Any]] = []
	    for axis, local_sample in local_samples.items():
	        # ``or 0`` maps a null/empty score to 0 before the int conversion.
	        local_score = int(local_sample.get("score", 0) or 0)
	        remote_sample = remote_samples.get(axis, {})
	        remote_score = int(remote_sample.get("score", 0) or 0)
	        # Already strong locally and the teacher is no better: nothing to fix.
	        if local_score >= 4 and remote_score <= local_score:
	            continue
	        weak_axes.append(axis)

	        teacher_response = str(remote_sample.get("response") or "")
	        answer = _assistant_answer(axis, teacher_response, remote_score)
	        # Prefer the canonical prompt; fall back to the recorded prompt, then
	        # to the axis name itself.
	        prompt = AXIS_PROMPTS.get(axis, str(local_sample.get("prompt") or axis))
	        invariants_text = ", ".join(AXIS_MUST.get(axis, []))
	        teacher_response_used = bool(teacher_response.strip())
	        # Axes the local model scored 2 or lower on are weighted more heavily.
	        loss_weight = 2.0 if local_score <= 2 else 1.5

	        for index in range(variant_count):
	            instruction = VARIANT_INSTRUCTIONS[index % len(VARIANT_INSTRUCTIONS)]
	            rows.append(
	                {
	                    "source": "qwen3_coder_480b_weak_axis_distill",
	                    "category": axis,
	                    "messages": [
	                        {"role": "system", "content": system_message},
	                        {
	                            "role": "user",
	                            "content": f"{instruction}\nRequired invariants: {invariants_text}\nTask: {prompt}",
	                        },
	                        {"role": "assistant", "content": answer},
	                    ],
	                    "metadata": {
	                        "axis": axis,
	                        "variant": index,
	                        "teacher_model": teacher_model,
	                        "teacher_score": remote_score,
	                        "tinymind_score_before": local_score,
	                        "teacher_response_used": teacher_response_used,
	                        "loss_weight": loss_weight,
	                        "quality_tags": ["480b_distill", "weak_axis_repair", "constraint_fidelity"],
	                    },
	                }
	            )

	    sft_path = out / "weak_axis_480b_curriculum_sft.jsonl"
	    # newline="\n" keeps the JSONL byte-identical across platforms.
	    with sft_path.open("w", encoding="utf-8", newline="\n") as handle:
	        for row in rows:
	            handle.write(json.dumps(row, ensure_ascii=False, sort_keys=True) + "\n")

	    # NOTE(review): "uses_480b_teacher_outputs" is hard-coded to True even
	    # when every answer came from a curated correction -- confirm intent.
	    report = {
	        "schema_version": "tinymind-weak-axis-480b-curriculum-v1",
	        "created_at": datetime.now(timezone.utc).isoformat(),
	        "local_report": str(local_report),
	        "remote_report": str(remote_report),
	        "sft_path": str(sft_path),
	        "summary": {
	            "weak_axes": weak_axes,
	            "weak_axis_count": len(weak_axes),
	            "sft_rows": len(rows),
	            "variants_per_axis": variant_count,
	        },
	        "claim_gate": {
	            "uses_480b_teacher_outputs": True,
	            "official_superiority_claim_allowed": False,
	            "reason": "This creates targeted distillation rows from local comparison evidence; it does not prove superiority over the 480B teacher.",
	        },
	    }
	    report_path = out / "weak_axis_480b_curriculum_report.json"
	    report["json_path"] = str(report_path)
	    report_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
	    return report

Xet Storage Details

Size: 11.2 kB
Xet hash: 0802596410e753db20aa65bbc57d7d4561d2948ff091815bab9a3e2d223a0057

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.