bbkdevops's picture
download
raw
9.72 kB
from __future__ import annotations
from datetime import datetime, timezone
import json
from pathlib import Path
from typing import Any
# Evaluation axes this module can repair. `_is_broken` only flags samples whose
# "axis" value is a member of this set; every name here also appears as a key
# in PROMPTS and ANSWERS below.
TARGET_AXES = {
"thai_language",
"english_reasoning",
"math_bound",
"math_probability",
"code_python",
"systems_ffi",
"grounding",
"tool_json",
"self_critique",
}
# One user task per axis. `_row` looks the task up by axis name and embeds it
# in the user message after a "Task: " prefix, so the keys must stay in sync
# with TARGET_AXES (a missing key would raise KeyError in `_row`).
PROMPTS = {
"thai_language": "อธิบายความต่างระหว่างข้อมูลจริง หลักฐาน และข้อสรุป เป็นภาษาไทยธรรมชาติ พร้อมตัวอย่างสั้น ๆ",
"english_reasoning": "Explain why a low eval loss can still be misleading when the evaluation set is contaminated.",
"math_bound": "Prove briefly that x_t = a x_{t-1} + b_t is bounded when |a|<1 and |b_t|<=B.",
"math_probability": "A classifier has precision 0.8 and recall 0.5. Define precision and recall, then compute F1.",
"code_python": "Write a small Python function validate_tool_call(obj) that returns True only if obj has string name and dict arguments.",
"systems_ffi": "Give three rules for safe Rust/C FFI ABI compatibility.",
"grounding": "You have one source saying A and another saying not-A. What should an evidence-grounded assistant do before answering?",
# Single-quoted so the embedded double quotes need no escaping.
"tool_json": 'Return only JSON for a tool call named "search_web" with arguments query="Thai AI benchmark" and k=3.',
"self_critique": "Give a concise answer, then add one sentence explaining what evidence would falsify your answer.",
}
# Reference assistant reply for each axis. `_row` emits the value verbatim as
# the assistant message, so every string here is training-target text: keys
# mirror PROMPTS, and multi-line values rely on implicit string concatenation
# inside the parentheses.
ANSWERS = {
"thai_language": (
"ข้อมูลจริงคือสิ่งที่เกิดขึ้นหรือตรวจวัดได้โดยตรง เช่น เซิร์ฟเวอร์ตอบกลับใน 820 มิลลิวินาที. "
"หลักฐานคือข้อมูลจริงที่ใช้รองรับคำตอบ เช่น log เวลา, timestamp, และผลวัดซ้ำ. "
"ข้อสรุปคือการตีความจากหลักฐาน เช่น ระบบเริ่มช้ากว่าค่าเป้าหมาย. "
"ถ้าหลักฐานยังไม่พอ ต้องบอกขอบเขตความไม่แน่นอนแทนการเดา."
),
"english_reasoning": (
"A low eval loss can be misleading when contamination lets training examples or near-duplicates leak into evaluation. "
"The model may memorize surface patterns rather than show generalization. A credible result needs a clean holdout set, "
"deduplication against training data, and paraphrased stress tests that preserve the task while changing the wording."
),
"math_bound": (
"Unroll the recurrence: x_t = a^t x_0 + sum_{i=0}^{t-1} a^i b_{t-i}. "
"Then |x_t| <= |a|^t |x_0| + B sum_{i=0}^{t-1} |a|^i. "
"Because |a|<1, the geometric series is bounded by 1/(1-|a|). "
"So |x_t| <= |x_0| + B/(1-|a|), hence the sequence is bounded."
),
"math_probability": (
"Precision is TP/(TP+FP), the fraction of predicted positives that are correct. "
"Recall is TP/(TP+FN), the fraction of actual positives found. "
"F1 = 2PR/(P+R) = 2*0.8*0.5/(0.8+0.5) = 0.8/1.3 = 0.615."
),
# NOTE(review): the embedded sample below is indented with a single space per
# line. It is still valid Python, but confirm this is the intended training
# target rather than whitespace lost in transit.
"code_python": (
"```python\n"
"def validate_tool_call(obj):\n"
" return (\n"
" isinstance(obj, dict)\n"
" and isinstance(obj.get(\"name\"), str)\n"
" and isinstance(obj.get(\"arguments\"), dict)\n"
" )\n"
"```\n"
),
"systems_ffi": (
"1. Use `extern \"C\"` so both sides agree on the calling convention and symbol ABI.\n"
"2. Use `#[repr(C)]` for shared structs so field order, padding, and alignment match C.\n"
"3. Define ownership explicitly: who allocates, who frees, nullability rules, and never unwind Rust panics across the FFI boundary."
),
"grounding": (
"It should compare the sources, check which source is primary, newer, and more relevant, and identify the exact evidence each side provides. "
"If the conflict remains, it should state uncertainty clearly instead of forcing A or not-A. "
"A good answer says what additional source or measurement would resolve the conflict."
),
# Compact JSON with no whitespace, matching the prompt's "Return only JSON".
"tool_json": '{"name":"search_web","arguments":{"query":"Thai AI benchmark","k":3}}',
"self_critique": (
"Concise answer: accept the claim only when independent evidence supports it under the same conditions. "
"Evidence that would falsify my answer would be a reproducible source or official report showing the opposite result."
),
}
# Instruction prefixes (mixed Thai and English) placed ahead of the task in the
# user message. `_row` selects one with `variant % len(VARIANTS)`, so variant
# indices beyond the end of this list wrap around and reuse earlier entries.
VARIANTS = [
"ตอบให้ตรงคำสั่งทุกบิต ห้ามข้าม invariant และห้ามเติม claim ที่ไม่มีหลักฐาน.",
"Answer naturally, but preserve every required keyword, schema field, and numeric value exactly.",
"ตอบแบบ held-out eval: กระชับ แม่นยำ ตรวจ format ก่อนส่งคำตอบ.",
"Repair the weak behavior: exact constraint following first, style second.",
"ตอบให้ผู้ใช้ใช้งานได้จริง โดยคุมรูปแบบและหลักฐานให้ครบ.",
]
def _load_probe(path: str | Path) -> dict[str, Any]:
    """Read the probe report at *path* and return its top-level JSON object.

    The file is decoded as UTF-8.

    Raises:
        OSError: If the file cannot be read.
        ValueError: If the content is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass) or if the
            top-level JSON value is not an object.
    """
    report = json.loads(Path(path).read_text(encoding="utf-8"))
    # Fail here with a clear message; otherwise a list or scalar would only
    # surface later as an AttributeError on `.get` in the callers.
    if not isinstance(report, dict):
        raise ValueError(
            f"probe report {str(path)!r} must contain a JSON object at the top level, "
            f"got {type(report).__name__}"
        )
    return report
def _local_samples(report: dict[str, Any]) -> list[dict[str, Any]]:
    """Return shallow copies of the sample rows for the local TinyMind result.

    The first entry of ``report["results"]`` whose ``model_id`` is
    ``"TinyMind-12B-LoRA"`` or whose ``source`` is ``"local_tinymind_adapter"``
    is used. If no entry matches, the first result is used instead. An empty
    list is returned when there are no results at all.

    A missing or ``null`` ``results`` / ``samples`` value is treated as an
    empty list rather than raising.
    """
    results = report.get("results") or []
    chosen = next(
        (
            result
            for result in results
            if result.get("model_id") == "TinyMind-12B-LoRA"
            or result.get("source") == "local_tinymind_adapter"
        ),
        # No explicit match: fall back to the first result, if there is one.
        results[0] if results else None,
    )
    if chosen is None:
        return []
    # dict(row) copies each row so callers cannot mutate the loaded report.
    return [dict(row) for row in chosen.get("samples") or []]
def _is_broken(sample: dict[str, Any], threshold: int) -> bool:
    """Tell whether *sample* is a repair target.

    A sample qualifies when its ``axis`` is one of ``TARGET_AXES`` and its
    integer ``score`` is strictly below *threshold*. A missing or falsy
    ``axis`` never qualifies; a missing or falsy ``score`` counts as 0.
    """
    axis_name = str(sample.get("axis") or "")
    # Skip unknown axes before touching the score, so it is never parsed
    # for samples that cannot be targets.
    if axis_name not in TARGET_AXES:
        return False
    raw_score = sample.get("score", 0) or 0
    return int(raw_score) < threshold
def _row(axis: str, variant: int, before_score: int, flags: list[str]) -> dict[str, Any]:
    """Assemble one chat-format SFT record for *axis*.

    The record holds a fixed system message, a user message made of the
    instruction prefix chosen by ``variant % len(VARIANTS)`` followed by the
    axis task from ``PROMPTS``, and the reference reply from ``ANSWERS`` as
    the assistant message. The metadata records the probe score and flags the
    row was generated from; ``loss_weight`` is 3.0 for scores of 1 or less
    and 2.0 otherwise.

    Raises:
        KeyError: If *axis* has no entry in ``PROMPTS`` or ``ANSWERS``.
    """
    system_message = {
        "role": "system",
        "content": (
            "You are TinyMind in surgical repair mode. Follow the user's exact constraints, "
            "produce valid formats, ground claims in evidence, and avoid unsupported claims."
        ),
    }
    instruction = VARIANTS[variant % len(VARIANTS)]
    user_message = {
        "role": "user",
        "content": f"{instruction}\nTask: {PROMPTS[axis]}",
    }
    assistant_message = {"role": "assistant", "content": ANSWERS[axis]}

    # The lowest-scoring axes get the heavier training weight.
    if before_score <= 1:
        weight = 3.0
    else:
        weight = 2.0

    metadata = {
        "axis": axis,
        "variant": variant,
        "tinymind_score_before": before_score,
        "probe_flags": flags,
        "loss_weight": weight,
        "quality_tags": ["surgical_sft", "weak_axis_repair", "exact_instruction", "no_eval_claim"],
    }
    return {
        "source": "tinymind_surgical_weak_axis_sft",
        "category": axis,
        "messages": [system_message, user_message, assistant_message],
        "metadata": metadata,
    }
def build_surgical_weak_axis_sft(
    out_dir: str | Path,
    *,
    probe_report: str | Path,
    variants_per_axis: int = 8,
    weak_threshold: int = 4,
) -> dict[str, Any]:
    """Generate repair SFT rows for every weak sample found in a probe report.

    Loads *probe_report*, selects the local TinyMind samples whose axis is a
    known target and whose score is below *weak_threshold*, and emits
    *variants_per_axis* rows for each of them. Two files are written into
    *out_dir* (created if missing):

    * ``surgical_weak_axis_sft.jsonl`` - one JSON object per row.
    * ``surgical_weak_axis_report.json`` - the summary dict that is returned.

    Args:
        out_dir: Directory that receives both output files.
        probe_report: Path of the probe report JSON to read.
        variants_per_axis: Number of rows generated per weak sample.
        weak_threshold: Samples scoring strictly below this are targeted.

    Returns:
        The summary report, including the output paths, the targeted axes,
        row counts, and the ``claim_gate`` section.

    Raises:
        ValueError: If *variants_per_axis* is not positive.
    """
    if variants_per_axis <= 0:
        raise ValueError("variants_per_axis must be positive")
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)

    report = _load_probe(probe_report)
    samples = _local_samples(report)
    target_samples = [sample for sample in samples if _is_broken(sample, weak_threshold)]

    rows = []
    for sample in target_samples:
        axis = str(sample["axis"])
        # `or []` tolerates an explicit `"flags": null`, mirroring how a null
        # score is tolerated on the next line.
        flags = [str(flag) for flag in sample.get("flags") or []]
        before_score = int(sample.get("score", 0) or 0)
        rows.extend(_row(axis, variant, before_score, flags) for variant in range(variants_per_axis))

    sft_path = out / "surgical_weak_axis_sft.jsonl"
    # newline="\n" keeps the JSONL file LF-terminated on every platform.
    with sft_path.open("w", encoding="utf-8", newline="\n") as handle:
        for item in rows:
            handle.write(json.dumps(item, ensure_ascii=False, sort_keys=True) + "\n")

    # One entry per weak sample, in report order (not de-duplicated).
    axes = [str(sample["axis"]) for sample in target_samples]
    result = {
        "schema": "tinymind.surgical_weak_axis_sft.v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "probe_report": str(probe_report),
        "sft_path": str(sft_path),
        "summary": {
            "target_axes": axes,
            "target_axis_count": len(axes),
            "sft_rows": len(rows),
            "variants_per_axis": variants_per_axis,
            "weak_threshold": weak_threshold,
        },
        "claim_gate": {
            # Training is only allowed when at least one row was produced.
            "main_training_allowed": bool(rows),
            "world_best_claim_allowed": False,
            "reason": "This is targeted repair SFT, not external leaderboard evidence.",
        },
    }
    report_path = out / "surgical_weak_axis_report.json"
    result["json_path"] = str(report_path)
    report_path.write_text(
        json.dumps(result, ensure_ascii=False, indent=2, sort_keys=True) + "\n",
        encoding="utf-8",
    )
    return result
if __name__ == "__main__":
    # Script entry point: build the dataset from the default probe report and
    # print the resulting summary as pretty-printed JSON.
    probe_dir = Path("reports") / "broad_480b_eval_tinymind_current_25690527_145135"
    default_probe = probe_dir / "broad_480b_eval_report.json"
    summary = build_surgical_weak_axis_sft("reports/surgical_weak_axis_sft", probe_report=default_probe)
    print(json.dumps(summary, ensure_ascii=False, indent=2))

Xet Storage Details

Size:
9.72 kB
·
Xet hash:
6c5b1f1a5879c7a7c3fb256a6a77635eecfb06c5e5ab8bbfa8e3a64258b52e5e

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.