Buckets:
bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /data /surgical_weak_axis_sft.py
| from __future__ import annotations | |
| from datetime import datetime, timezone | |
| import json | |
| from pathlib import Path | |
| from typing import Any | |
# Probe axes this module can generate repair examples for. Membership in this
# set is what `_is_broken` checks before looking at a sample's score.
TARGET_AXES = {
    # natural-language axes
    "thai_language",
    "english_reasoning",
    # mathematics axes
    "math_bound",
    "math_probability",
    # programming / systems axes
    "code_python",
    "systems_ffi",
    # behavioural axes
    "grounding",
    "tool_json",
    "self_critique",
}
# Task text for each target axis, keyed by axis name. `_row` appends the
# selected prompt after a "Task: " prefix in the user message.
PROMPTS = {
    "thai_language": "อธิบายความต่างระหว่างข้อมูลจริง หลักฐาน และข้อสรุป เป็นภาษาไทยธรรมชาติ พร้อมตัวอย่างสั้น ๆ",
    "english_reasoning": (
        "Explain why a low eval loss can still be misleading "
        "when the evaluation set is contaminated."
    ),
    "math_bound": (
        "Prove briefly that x_t = a x_{t-1} + b_t is bounded "
        "when |a|<1 and |b_t|<=B."
    ),
    "math_probability": (
        "A classifier has precision 0.8 and recall 0.5. "
        "Define precision and recall, then compute F1."
    ),
    "code_python": (
        "Write a small Python function validate_tool_call(obj) that returns True "
        "only if obj has string name and dict arguments."
    ),
    "systems_ffi": "Give three rules for safe Rust/C FFI ABI compatibility.",
    "grounding": (
        "You have one source saying A and another saying not-A. "
        "What should an evidence-grounded assistant do before answering?"
    ),
    "tool_json": (
        'Return only JSON for a tool call named "search_web" '
        'with arguments query="Thai AI benchmark" and k=3.'
    ),
    "self_critique": (
        "Give a concise answer, then add one sentence explaining "
        "what evidence would falsify your answer."
    ),
}
# Reference assistant answers, keyed by axis name. `_row` uses the selected
# answer verbatim as the assistant message of each generated SFT example.
ANSWERS = {
    "thai_language": (
        "ข้อมูลจริงคือสิ่งที่เกิดขึ้นหรือตรวจวัดได้โดยตรง เช่น เซิร์ฟเวอร์ตอบกลับใน 820 มิลลิวินาที. "
        "หลักฐานคือข้อมูลจริงที่ใช้รองรับคำตอบ เช่น log เวลา, timestamp, และผลวัดซ้ำ. "
        "ข้อสรุปคือการตีความจากหลักฐาน เช่น ระบบเริ่มช้ากว่าค่าเป้าหมาย. "
        "ถ้าหลักฐานยังไม่พอ ต้องบอกขอบเขตความไม่แน่นอนแทนการเดา."
    ),
    "english_reasoning": (
        "A low eval loss can be misleading when contamination lets training examples or near-duplicates leak into evaluation. "
        "The model may memorize surface patterns rather than show generalization. A credible result needs a clean holdout set, "
        "deduplication against training data, and paraphrased stress tests that preserve the task while changing the wording."
    ),
    "math_bound": (
        "Unroll the recurrence: x_t = a^t x_0 + sum_{i=0}^{t-1} a^i b_{t-i}. "
        "Then |x_t| <= |a|^t |x_0| + B sum_{i=0}^{t-1} |a|^i. "
        "Because |a|<1, the geometric series is bounded by 1/(1-|a|). "
        "So |x_t| <= |x_0| + B/(1-|a|), hence the sequence is bounded."
    ),
    "math_probability": (
        "Precision is TP/(TP+FP), the fraction of predicted positives that are correct. "
        "Recall is TP/(TP+FN), the fraction of actual positives found. "
        "F1 = 2PR/(P+R) = 2*0.8*0.5/(0.8+0.5) = 0.8/1.3 = 0.615."
    ),
    # The embedded sample is indented with 4/8 spaces so the training target
    # shows conventionally formatted (PEP 8) Python rather than 1-space indents.
    "code_python": (
        "```python\n"
        "def validate_tool_call(obj):\n"
        "    return (\n"
        "        isinstance(obj, dict)\n"
        '        and isinstance(obj.get("name"), str)\n'
        '        and isinstance(obj.get("arguments"), dict)\n'
        "    )\n"
        "```\n"
    ),
    "systems_ffi": (
        '1. Use `extern "C"` so both sides agree on the calling convention and symbol ABI.\n'
        "2. Use `#[repr(C)]` for shared structs so field order, padding, and alignment match C.\n"
        "3. Define ownership explicitly: who allocates, who frees, nullability rules, and never unwind Rust panics across the FFI boundary."
    ),
    "grounding": (
        "It should compare the sources, check which source is primary, newer, and more relevant, and identify the exact evidence each side provides. "
        "If the conflict remains, it should state uncertainty clearly instead of forcing A or not-A. "
        "A good answer says what additional source or measurement would resolve the conflict."
    ),
    # Compact JSON with no whitespace; the prompt asks for JSON only.
    "tool_json": '{"name":"search_web","arguments":{"query":"Thai AI benchmark","k":3}}',
    "self_critique": (
        "Concise answer: accept the claim only when independent evidence supports it under the same conditions. "
        "Evidence that would falsify my answer would be a reproducible source or official report showing the opposite result."
    ),
}
# Instruction prefixes placed before the task text in the user message.
# `_row` selects one with `variant % len(VARIANTS)`, so variant numbers past
# the end of this list wrap around to the beginning.
VARIANTS = [
    # 0: Thai
    "ตอบให้ตรงคำสั่งทุกบิต ห้ามข้าม invariant และห้ามเติม claim ที่ไม่มีหลักฐาน.",
    # 1: English
    "Answer naturally, but preserve every required keyword, schema field, and numeric value exactly.",
    # 2: Thai
    "ตอบแบบ held-out eval: กระชับ แม่นยำ ตรวจ format ก่อนส่งคำตอบ.",
    # 3: English
    "Repair the weak behavior: exact constraint following first, style second.",
    # 4: Thai
    "ตอบให้ผู้ใช้ใช้งานได้จริง โดยคุมรูปแบบและหลักฐานให้ครบ.",
]
| def _load_probe(path: str | Path) -> dict[str, Any]: | |
| return json.loads(Path(path).read_text(encoding="utf-8")) | |
| def _local_samples(report: dict[str, Any]) -> list[dict[str, Any]]: | |
| for result in report.get("results", []): | |
| if result.get("model_id") == "TinyMind-12B-LoRA" or result.get("source") == "local_tinymind_adapter": | |
| return [dict(row) for row in result.get("samples", [])] | |
| results = report.get("results") or [] | |
| return [dict(row) for row in (results[0].get("samples", []) if results else [])] | |
def _is_broken(sample: dict[str, Any], threshold: int) -> bool:
    """Tell whether a probe sample is on a target axis and scored below threshold.

    Args:
        sample: One probe sample; its ``axis`` and ``score`` entries are read.
            A missing or falsy ``axis`` becomes ``""`` and a missing or falsy
            ``score`` counts as 0.
        threshold: A sample counts as broken only when its integer score is
            strictly below this value.

    Returns:
        ``True`` when the axis is in ``TARGET_AXES`` and the score is below
        ``threshold``; ``False`` otherwise.
    """
    axis_name = str(sample.get("axis") or "")
    # Off-target axes are rejected before the score is converted at all.
    if axis_name not in TARGET_AXES:
        return False
    score = int(sample.get("score", 0) or 0)
    return score < threshold
def _row(axis: str, variant: int, before_score: int, flags: list[str]) -> dict[str, Any]:
    """Assemble one chat-format SFT example for a weak axis.

    Args:
        axis: Axis name; used as the key into ``PROMPTS`` and ``ANSWERS``.
        variant: Variant number. The instruction prefix is taken from
            ``VARIANTS`` at ``variant % len(VARIANTS)``.
        before_score: Score the probe recorded for this axis; stored in the
            metadata and used to choose the loss weight.
        flags: Probe flags copied into the metadata unchanged.

    Returns:
        A dict with ``source``, ``category``, ``messages`` (system, user,
        assistant) and ``metadata`` entries.

    Raises:
        KeyError: If ``axis`` has no entry in ``PROMPTS`` or ``ANSWERS``.
    """
    system_message = {
        "role": "system",
        "content": (
            "You are TinyMind in surgical repair mode. Follow the user's exact constraints, "
            "produce valid formats, ground claims in evidence, and avoid unsupported claims."
        ),
    }
    instruction = VARIANTS[variant % len(VARIANTS)]
    user_message = {
        "role": "user",
        "content": f"{instruction}\nTask: {PROMPTS[axis]}",
    }
    assistant_message = {"role": "assistant", "content": ANSWERS[axis]}

    # Scores of 1 or lower get the heavier weight.
    if before_score <= 1:
        loss_weight = 3.0
    else:
        loss_weight = 2.0

    metadata = {
        "axis": axis,
        "variant": variant,
        "tinymind_score_before": before_score,
        "probe_flags": flags,
        "loss_weight": loss_weight,
        "quality_tags": ["surgical_sft", "weak_axis_repair", "exact_instruction", "no_eval_claim"],
    }
    return {
        "source": "tinymind_surgical_weak_axis_sft",
        "category": axis,
        "messages": [system_message, user_message, assistant_message],
        "metadata": metadata,
    }
def build_surgical_weak_axis_sft(
    out_dir: str | Path,
    *,
    probe_report: str | Path,
    variants_per_axis: int = 8,
    weak_threshold: int = 4,
) -> dict[str, Any]:
    """Generate repair SFT rows for weak axes and write them with a summary report.

    The probe report is loaded, the local model's samples are filtered with
    ``_is_broken``, and ``variants_per_axis`` rows are produced for every
    remaining sample. Two files are written into ``out_dir``:
    ``surgical_weak_axis_sft.jsonl`` (one JSON object per line) and
    ``surgical_weak_axis_report.json`` (the returned summary).

    Args:
        out_dir: Output directory; created along with any missing parents.
        probe_report: Path of the JSON probe report to read.
        variants_per_axis: Number of rows generated per weak sample.
        weak_threshold: Samples scoring strictly below this are selected.

    Returns:
        The summary dict that was also written to the report file.

    Raises:
        ValueError: If ``variants_per_axis`` is zero or negative.
    """
    if variants_per_axis <= 0:
        raise ValueError("variants_per_axis must be positive")

    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)

    weak_samples = [
        candidate
        for candidate in _local_samples(_load_probe(probe_report))
        if _is_broken(candidate, weak_threshold)
    ]

    rows = []
    for weak in weak_samples:
        axis_name = str(weak["axis"])
        probe_flags = [str(flag) for flag in weak.get("flags", [])]
        score_before = int(weak.get("score", 0) or 0)
        rows.extend(
            _row(axis_name, variant, score_before, probe_flags)
            for variant in range(variants_per_axis)
        )

    sft_path = out / "surgical_weak_axis_sft.jsonl"
    # newline="\n" keeps line endings identical on every platform.
    with sft_path.open("w", encoding="utf-8", newline="\n") as handle:
        handle.writelines(
            json.dumps(row, ensure_ascii=False, sort_keys=True) + "\n" for row in rows
        )

    # One entry per selected sample, in report order.
    axes = [str(weak["axis"]) for weak in weak_samples]
    report_path = out / "surgical_weak_axis_report.json"
    result = {
        "schema": "tinymind.surgical_weak_axis_sft.v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "probe_report": str(probe_report),
        "sft_path": str(sft_path),
        "summary": {
            "target_axes": axes,
            "target_axis_count": len(axes),
            "sft_rows": len(rows),
            "variants_per_axis": variants_per_axis,
            "weak_threshold": weak_threshold,
        },
        "claim_gate": {
            # Training is only allowed when at least one row was produced.
            "main_training_allowed": bool(rows),
            "world_best_claim_allowed": False,
            "reason": "This is targeted repair SFT, not external leaderboard evidence.",
        },
        "json_path": str(report_path),
    }
    serialized = json.dumps(result, ensure_ascii=False, indent=2, sort_keys=True)
    report_path.write_text(serialized + "\n", encoding="utf-8")
    return result
if __name__ == "__main__":
    # Script entry point: build the dataset from the default probe report
    # location and print the resulting summary as indented JSON.
    default_probe = Path(
        "reports",
        "broad_480b_eval_tinymind_current_25690527_145135",
        "broad_480b_eval_report.json",
    )
    summary = build_surgical_weak_axis_sft(
        "reports/surgical_weak_axis_sft",
        probe_report=default_probe,
    )
    print(json.dumps(summary, ensure_ascii=False, indent=2))
Xet Storage Details
- Size: 9.72 kB
- Xet hash: 6c5b1f1a5879c7a7c3fb256a6a77635eecfb06c5e5ab8bbfa8e3a64258b52e5e

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.