Buckets:
bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /data /weak_axis_curriculum_480b.py
from __future__ import annotations

import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
# Task prompt for each evaluation axis, keyed by axis name.
# build_weak_axis_curriculum_480b uses these in preference to the prompt
# recorded in the local report's sample.
AXIS_PROMPTS: dict[str, str] = {
    "thai_language": "อธิบายความต่างระหว่างข้อมูลจริง หลักฐาน และข้อสรุป เป็นภาษาไทยธรรมชาติ พร้อมตัวอย่างสั้น ๆ",
    "english_reasoning": "Explain why a low eval loss can still be misleading when the evaluation set is contaminated.",
    "math_bound": "Prove briefly that x_t = a x_{t-1} + b_t is bounded when |a|<1 and |b_t|<=B.",
    "math_probability": "A classifier has precision 0.8 and recall 0.5. Define precision and recall, then compute F1.",
    "code_python": "Write a small Python function validate_tool_call(obj) that returns True only if obj has string name and dict arguments.",
    "raw_code_bits": "Explain signed 6-bit sign extension from a packed byte lane. Include the mask, sign bit, min, and max.",
    "systems_ffi": "Give three rules for safe Rust/C FFI ABI compatibility.",
    "grounding": "You have one source saying A and another saying not-A. What should an evidence-grounded assistant do before answering?",
    "tool_json": 'Return only JSON for a tool call named "search_web" with arguments query="Thai AI benchmark" and k=3.',
    "translation_th_en": "Translate to English: การวัดผลที่ดีต้องแยกข้อมูลฝึกออกจากข้อมูลทดสอบอย่างเด็ดขาด",
    "long_answer_control": "ตอบเป็นภาษาไทย 4 ข้อเท่านั้น: วิธีลด hallucination ในระบบ RAG",
    "self_critique": "Give a concise answer, then add one sentence explaining what evidence would falsify your answer.",
}
# Required-invariant keywords for each axis, keyed by axis name.
# They are joined into the "Required invariants: ..." line of every user
# prompt and into the fallback answer produced by _assistant_answer.
AXIS_MUST: dict[str, list[str]] = {
    "thai_language": ["ข้อมูลจริง", "หลักฐาน", "ข้อสรุป"],
    "english_reasoning": ["contamination", "holdout", "generalization"],
    "math_bound": ["geometric", "B", "1-|a|"],
    "math_probability": ["precision", "recall", "0.615"],
    "code_python": ["def validate_tool_call", "name", "arguments", "dict"],
    "raw_code_bits": ["0x3f", "0x20", "-32", "31"],
    "systems_ffi": ["extern", "repr(C)", "ownership"],
    "grounding": ["compare", "source", "uncertainty"],
    "tool_json": ['"name"', '"search_web"', '"arguments"', '"k"'],
    "translation_th_en": ["training", "test", "separate"],
    "long_answer_control": ["1", "2", "3", "4"],
    "self_critique": ["falsify", "evidence"],
}
# Fixed reference answers, keyed by axis name. When an axis has an entry here,
# _assistant_answer returns it instead of the teacher's response. Axes without
# an entry fall back to the teacher response.
CORRECTIONS: dict[str, str] = {
    "thai_language": (
        "ข้อมูลจริงคือสิ่งที่ตรวจพบหรือบันทึกได้โดยตรง เช่น ค่าอุณหภูมิ 38 องศา. "
        "หลักฐานคือข้อมูลจริงที่นำมาใช้รองรับคำตอบ เช่น ภาพถ่ายหรือบันทึกการวัด. "
        "ข้อสรุปคือการตีความจากหลักฐาน เช่น เครื่องอาจร้อนผิดปกติ. "
        "ถ้าหลักฐานยังไม่พอ ต้องบอกขอบเขตความไม่แน่นอน ไม่เดาแทนผู้ใช้."
    ),
    "english_reasoning": (
        "A low eval loss can be misleading when the evaluation set has contamination from training data. "
        "The model may memorize leaked patterns instead of showing generalization. A clean holdout set, "
        "deduplication, and paraphrased stress tests are needed before treating the loss as real capability."
    ),
    "math_bound": (
        "Unroll the recurrence: x_t = a^t x_0 + sum_{i=0}^{t-1} a^i b_{t-i}. "
        "Taking absolute values gives |x_t| <= |a|^t |x_0| + B sum_{i=0}^{t-1} |a|^i. "
        "The geometric series is at most 1/(1-|a|), so |x_t| <= |x_0| + B/(1-|a|). Thus the sequence is bounded."
    ),
    "math_probability": (
        "Precision is TP/(TP+FP): the share of predicted positives that are correct. "
        "Recall is TP/(TP+FN): the share of real positives found. "
        "F1 = 2PR/(P+R) = 2*0.8*0.5/(0.8+0.5) = 0.8/1.3 = 0.615."
    ),
    # The embedded snippet uses conventional 4-space indentation so the
    # training target is itself PEP 8 formatted Python.
    "code_python": (
        "def validate_tool_call(obj):\n"
        "    return (\n"
        "        isinstance(obj, dict)\n"
        "        and isinstance(obj.get(\"name\"), str)\n"
        "        and isinstance(obj.get(\"arguments\"), dict)\n"
        "    )\n"
    ),
    "systems_ffi": (
        "Use `extern \"C\"` so Rust and C agree on the calling convention. "
        "Use `#[repr(C)]` for shared structs so field layout and alignment match C. "
        "Define ownership clearly: who allocates, who frees, whether pointers may be null, and never unwind Rust panics across the FFI boundary."
    ),
    "grounding": (
        "An evidence-grounded assistant should compare the sources, identify which source is newer or more authoritative, "
        "quote or summarize the relevant evidence, and state uncertainty instead of choosing A or not-A without support. "
        "If the conflict cannot be resolved, it should say what evidence would decide the answer."
    ),
    "tool_json": '{"name":"search_web","arguments":{"query":"Thai AI benchmark","k":3}}',
    "self_critique": (
        "Concise answer: the claim is supported only if independent evidence matches the stated result. "
        "Evidence that would falsify my answer would be a reproducible official report showing the opposite outcome under the same benchmark conditions."
    ),
}
# Instruction prefixes cycled across the variants generated for each weak
# axis (variant index modulo the list length selects the entry).
VARIANT_INSTRUCTIONS: list[str] = [
    "Answer with maximal precision and include every required invariant.",
    "Answer naturally, but preserve exact schema/keywords required by the task.",
    "Answer compactly, verify constraints, and avoid unsupported claims.",
    "Answer as a high-quality held-out evaluation response, not a memorized template.",
]
def _read(path: str | Path) -> dict[str, Any]:
    """Load and parse a UTF-8 encoded JSON file.

    Args:
        path: Location of the JSON document.

    Returns:
        The parsed JSON value. It is annotated as a dict because callers
        treat it as a report object, but the top-level type is not validated.

    Raises:
        OSError: If the file cannot be read.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    text = Path(path).read_text(encoding="utf-8")
    return json.loads(text)
def _first_result(report: dict[str, Any]) -> dict[str, Any]:
    """Return a shallow copy of the first entry of ``report["results"]``.

    Args:
        report: Parsed report object; only its ``"results"`` key is read.

    Returns:
        A new dict copied from the first result, or an empty dict when
        ``"results"`` is missing, empty, not a list/tuple, or its first
        entry is not a dict.
    """
    results = report.get("results")
    # A malformed report (results is an object, null, or holds a non-object
    # first entry) is treated the same as "no results" instead of raising.
    if not isinstance(results, (list, tuple)) or not results:
        return {}
    first = results[0]
    return dict(first) if isinstance(first, dict) else {}
def _samples_by_axis(report: dict[str, Any]) -> dict[str, dict[str, Any]]:
    """Index the samples of the report's first result by their ``axis`` value.

    Args:
        report: Parsed report object, passed through ``_first_result``.

    Returns:
        A mapping from ``str(axis)`` to a shallow copy of the sample row.
        Rows with a missing or falsy ``"axis"`` are skipped, as are rows that
        are not dicts. When two rows share an axis, the later one wins.
    """
    # `or []` also covers an explicit null, mirroring how _first_result
    # reads "results".
    samples = _first_result(report).get("samples") or []
    if not isinstance(samples, (list, tuple)):
        return {}
    by_axis: dict[str, dict[str, Any]] = {}
    for row in samples:
        if not isinstance(row, dict):
            continue
        axis = row.get("axis")
        if axis:
            by_axis[str(axis)] = dict(row)
    return by_axis
def _teacher_model(report: dict[str, Any]) -> str:
    """Return the ``model_id`` of the report's first result as a string.

    Falls back to ``"unknown-480b-teacher"`` when the first result has no
    ``"model_id"`` or its value is falsy.
    """
    model_id = _first_result(report).get("model_id")
    return str(model_id) if model_id else "unknown-480b-teacher"
def _assistant_answer(axis: str, teacher_response: str, teacher_score: int) -> str:
    """Choose the assistant target text for one axis.

    Selection order:
      1. The fixed answer in ``CORRECTIONS`` for this axis, if any.
      2. The stripped teacher response, when the teacher scored at least 4
         and the response is non-empty.
      3. A sentence listing the axis's ``AXIS_MUST`` invariants, followed by
         the stripped teacher response when there is one.

    Args:
        axis: Axis name used to look up ``CORRECTIONS`` and ``AXIS_MUST``.
        teacher_response: Raw response text from the teacher report.
        teacher_score: Score recorded for the teacher on this axis.

    Returns:
        The text to use as the assistant message content.
    """
    correction = CORRECTIONS.get(axis)
    if correction:
        return correction

    response = teacher_response.strip()
    if teacher_score >= 4 and response:
        return response

    must = ", ".join(AXIS_MUST.get(axis, []))
    header = f"Answer must explicitly satisfy these invariants: {must}."
    # Append the teacher text only when present, so the target never ends in
    # a dangling blank paragraph.
    return f"{header}\n\n{response}" if response else header
def build_weak_axis_curriculum_480b(
    out_dir: str | Path,
    *,
    local_report: str | Path,
    remote_report: str | Path,
    variants_per_axis: int = 4,
) -> dict[str, Any]:
    """Build SFT rows for axes where the local model is weak versus the teacher.

    Both reports are read as JSON and their first result's samples are matched
    by axis. An axis is skipped only when the local score is at least 4 and
    the teacher's score does not exceed it; every other axis found in the
    local report is treated as weak. For each weak axis, ``variants_per_axis``
    chat rows are produced (at least one), cycling through
    ``VARIANT_INSTRUCTIONS``.

    Two files are written into ``out_dir`` (created if needed):
      * ``weak_axis_480b_curriculum_sft.jsonl`` - one JSON row per line.
      * ``weak_axis_480b_curriculum_report.json`` - the summary report.

    Args:
        out_dir: Directory that receives both output files.
        local_report: Path to the local model's evaluation report (JSON).
        remote_report: Path to the teacher model's evaluation report (JSON).
        variants_per_axis: Rows to generate per weak axis; values below 1
            are clamped to 1.

    Returns:
        The report dict that was written to the report JSON file, including
        its own path under ``"json_path"``.

    Raises:
        Errors from reading or parsing either report, or from writing the
        outputs, propagate unchanged.
    """
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)

    local = _read(local_report)
    remote = _read(remote_report)
    local_samples = _samples_by_axis(local)
    remote_samples = _samples_by_axis(remote)
    teacher_model = _teacher_model(remote)

    # Clamp once so the number of rows written and the count reported in the
    # summary cannot drift apart.
    variant_count = max(1, int(variants_per_axis))
    system_prompt = (
        "You are TinyMind distilled from a 480B comparison. "
        "Fix weak-axis behavior with exact constraints, natural language, and no fabricated claims."
    )

    weak_axes: list[str] = []
    rows: list[dict[str, Any]] = []
    for axis, local_sample in local_samples.items():
        # `or 0` maps a null/empty score to 0 before the int conversion.
        local_score = int(local_sample.get("score", 0) or 0)
        remote_sample = remote_samples.get(axis, {})
        remote_score = int(remote_sample.get("score", 0) or 0)
        # Already strong locally and the teacher is no better: nothing to fix.
        if local_score >= 4 and remote_score <= local_score:
            continue
        weak_axes.append(axis)

        teacher_response = str(remote_sample.get("response") or "")
        answer = _assistant_answer(axis, teacher_response, remote_score)
        prompt = AXIS_PROMPTS.get(axis, str(local_sample.get("prompt") or axis))
        must = AXIS_MUST.get(axis, [])

        # Record whether the teacher's text actually ended up in the target;
        # a fixed correction replaces the teacher response entirely.
        teacher_text = teacher_response.strip()
        teacher_response_used = bool(teacher_text) and teacher_text in answer
        # Axes scoring 2 or lower locally get the heavier loss weight.
        loss_weight = 2.0 if local_score <= 2 else 1.5
        task_suffix = f"\nRequired invariants: {', '.join(must)}\nTask: {prompt}"

        for index in range(variant_count):
            instruction = VARIANT_INSTRUCTIONS[index % len(VARIANT_INSTRUCTIONS)]
            rows.append(
                {
                    "source": "qwen3_coder_480b_weak_axis_distill",
                    "category": axis,
                    "messages": [
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": f"{instruction}{task_suffix}"},
                        {"role": "assistant", "content": answer},
                    ],
                    "metadata": {
                        "axis": axis,
                        "variant": index,
                        "teacher_model": teacher_model,
                        "teacher_score": remote_score,
                        "tinymind_score_before": local_score,
                        "teacher_response_used": teacher_response_used,
                        "loss_weight": loss_weight,
                        "quality_tags": ["480b_distill", "weak_axis_repair", "constraint_fidelity"],
                    },
                }
            )

    sft_path = out / "weak_axis_480b_curriculum_sft.jsonl"
    # newline="\n" keeps the JSONL line endings identical on every platform.
    with sft_path.open("w", encoding="utf-8", newline="\n") as handle:
        handle.writelines(
            json.dumps(row, ensure_ascii=False, sort_keys=True) + "\n" for row in rows
        )

    report: dict[str, Any] = {
        "schema_version": "tinymind-weak-axis-480b-curriculum-v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "local_report": str(local_report),
        "remote_report": str(remote_report),
        "sft_path": str(sft_path),
        "summary": {
            "weak_axes": weak_axes,
            "weak_axis_count": len(weak_axes),
            "sft_rows": len(rows),
            "variants_per_axis": variant_count,
        },
        "claim_gate": {
            "uses_480b_teacher_outputs": True,
            "official_superiority_claim_allowed": False,
            "reason": "This creates targeted distillation rows from local comparison evidence; it does not prove superiority over the 480B teacher.",
        },
    }
    report_path = out / "weak_axis_480b_curriculum_report.json"
    report["json_path"] = str(report_path)
    report_path.write_text(
        json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True),
        encoding="utf-8",
    )
    return report
Xet Storage Details
- Size: 11.2 kB
- Xet hash: 0802596410e753db20aa65bbc57d7d4561d2948ff091815bab9a3e2d223a0057
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.