Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

Files

xet

bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /data /surgical_weak_axis_sft.py

bbkdevops

about 1 month ago

download

raw

9.72 kB

	from __future__ import annotations

	from datetime import datetime, timezone
	import json
	from pathlib import Path
	from typing import Any


	# Probe axes this module can repair. `_is_broken` ignores any sample whose
	# axis is not listed here; each name is also a key of PROMPTS and ANSWERS.
	TARGET_AXES = {
	    # natural-language axes
	    "thai_language",
	    "english_reasoning",
	    # mathematics axes
	    "math_bound",
	    "math_probability",
	    # programming axes
	    "code_python",
	    "systems_ffi",
	    # evidence handling, structured output, and self-review axes
	    "grounding",
	    "tool_json",
	    "self_critique",
	}


	# Task prompt for each repair axis. `_row` appends the selected prompt to a
	# VARIANTS instruction to form the user turn of a training example.
	PROMPTS = {
	    "thai_language": "อธิบายความต่างระหว่างข้อมูลจริง หลักฐาน และข้อสรุป เป็นภาษาไทยธรรมชาติ พร้อมตัวอย่างสั้น ๆ",
	    "english_reasoning": "Explain why a low eval loss can still be misleading when the evaluation set is contaminated.",
	    # Absolute-value bars are plain "|"; a backslash before them would be an
	    # invalid escape sequence and would leak a literal "\" into the prompt.
	    "math_bound": "Prove briefly that x_t = a x_{t-1} + b_t is bounded when |a|<1 and |b_t|<=B.",
	    "math_probability": "A classifier has precision 0.8 and recall 0.5. Define precision and recall, then compute F1.",
	    "code_python": "Write a small Python function validate_tool_call(obj) that returns True only if obj has string name and dict arguments.",
	    "systems_ffi": "Give three rules for safe Rust/C FFI ABI compatibility.",
	    "grounding": "You have one source saying A and another saying not-A. What should an evidence-grounded assistant do before answering?",
	    "tool_json": 'Return only JSON for a tool call named "search_web" with arguments query="Thai AI benchmark" and k=3.',
	    "self_critique": "Give a concise answer, then add one sentence explaining what evidence would falsify your answer.",
	}


	# Reference answer for each repair axis, keyed like PROMPTS. `_row` uses the
	# selected answer verbatim as the assistant turn of a training example.
	ANSWERS = {
	    "thai_language": (
	        "ข้อมูลจริงคือสิ่งที่เกิดขึ้นหรือตรวจวัดได้โดยตรง เช่น เซิร์ฟเวอร์ตอบกลับใน 820 มิลลิวินาที. "
	        "หลักฐานคือข้อมูลจริงที่ใช้รองรับคำตอบ เช่น log เวลา, timestamp, และผลวัดซ้ำ. "
	        "ข้อสรุปคือการตีความจากหลักฐาน เช่น ระบบเริ่มช้ากว่าค่าเป้าหมาย. "
	        "ถ้าหลักฐานยังไม่พอ ต้องบอกขอบเขตความไม่แน่นอนแทนการเดา."
	    ),
	    "english_reasoning": (
	        "A low eval loss can be misleading when contamination lets training examples or near-duplicates leak into evaluation. "
	        "The model may memorize surface patterns rather than show generalization. A credible result needs a clean holdout set, "
	        "deduplication against training data, and paraphrased stress tests that preserve the task while changing the wording."
	    ),
	    # Absolute-value bars are plain "|" (no backslash) so the proof text is
	    # emitted exactly as a reader would write it.
	    "math_bound": (
	        "Unroll the recurrence: x_t = a^t x_0 + sum_{i=0}^{t-1} a^i b_{t-i}. "
	        "Then |x_t| <= |a|^t |x_0| + B sum_{i=0}^{t-1} |a|^i. "
	        "Because |a|<1, the geometric series is bounded by 1/(1-|a|). "
	        "So |x_t| <= |x_0| + B/(1-|a|), hence the sequence is bounded."
	    ),
	    # The multiplication signs are spelled out; without them the worked
	    # computation reads as the meaningless "20.80.5".
	    "math_probability": (
	        "Precision is TP/(TP+FP), the fraction of predicted positives that are correct. "
	        "Recall is TP/(TP+FN), the fraction of actual positives found. "
	        "F1 = 2*P*R/(P+R) = 2*0.8*0.5/(0.8+0.5) = 0.8/1.3 = 0.615."
	    ),
	    # The embedded sample uses standard 4-space indentation so the target
	    # answer shows conventionally formatted Python.
	    "code_python": (
	        "```python\n"
	        "def validate_tool_call(obj):\n"
	        "    return (\n"
	        "        isinstance(obj, dict)\n"
	        "        and isinstance(obj.get(\"name\"), str)\n"
	        "        and isinstance(obj.get(\"arguments\"), dict)\n"
	        "    )\n"
	        "```\n"
	    ),
	    "systems_ffi": (
	        "1. Use `extern \"C\"` so both sides agree on the calling convention and symbol ABI.\n"
	        "2. Use `#[repr(C)]` for shared structs so field order, padding, and alignment match C.\n"
	        "3. Define ownership explicitly: who allocates, who frees, nullability rules, and never unwind Rust panics across the FFI boundary."
	    ),
	    "grounding": (
	        "It should compare the sources, check which source is primary, newer, and more relevant, and identify the exact evidence each side provides. "
	        "If the conflict remains, it should state uncertainty clearly instead of forcing A or not-A. "
	        "A good answer says what additional source or measurement would resolve the conflict."
	    ),
	    "tool_json": '{"name":"search_web","arguments":{"query":"Thai AI benchmark","k":3}}',
	    "self_critique": (
	        "Concise answer: accept the claim only when independent evidence supports it under the same conditions. "
	        "Evidence that would falsify my answer would be a reproducible source or official report showing the opposite result."
	    ),
	}


	# Instruction prefixes that `_row` cycles through (index modulo the list
	# length) and places before the task prompt. Order matters: the variant
	# number selects the entry. English glosses of the Thai entries are approximate.
	VARIANTS = [
	    # Thai, roughly: follow the instruction exactly, do not skip invariants,
	    # and do not add claims that lack evidence.
	    "ตอบให้ตรงคำสั่งทุกบิต ห้ามข้าม invariant และห้ามเติม claim ที่ไม่มีหลักฐาน.",
	    "Answer naturally, but preserve every required keyword, schema field, and numeric value exactly.",
	    # Thai, roughly: answer as in a held-out eval, concise and precise,
	    # and check the format before sending the answer.
	    "ตอบแบบ held-out eval: กระชับ แม่นยำ ตรวจ format ก่อนส่งคำตอบ.",
	    "Repair the weak behavior: exact constraint following first, style second.",
	    # Thai, roughly: answer so the user can actually use it, keeping the
	    # format and the evidence complete.
	    "ตอบให้ผู้ใช้ใช้งานได้จริง โดยคุมรูปแบบและหลักฐานให้ครบ.",
	]


	def _load_probe(path: str | Path) -> dict[str, Any]:
	    """Read the probe report stored at ``path`` and return the decoded JSON.

	    The file is read as UTF-8 text and parsed with ``json.loads``. The return
	    annotation assumes the document is a JSON object; that is not checked here.
	    """
	    return json.loads(Path(path).read_text(encoding="utf-8"))


	def _local_samples(report: dict[str, Any]) -> list[dict[str, Any]]:
	    """Return shallow copies of the sample rows for the local TinyMind result.

	    The first entry of ``report["results"]`` whose ``model_id`` is
	    ``"TinyMind-12B-LoRA"`` or whose ``source`` is ``"local_tinymind_adapter"``
	    wins. When no entry matches, the samples of the first result are used
	    instead. A missing or null ``results`` / ``samples`` value is treated as
	    an empty list, so the function returns ``[]`` rather than raising.
	    """
	    # Normalise once so both the search and the fallback tolerate `null`.
	    results = report.get("results") or []
	    for result in results:
	        if result.get("model_id") == "TinyMind-12B-LoRA" or result.get("source") == "local_tinymind_adapter":
	            return [dict(row) for row in result.get("samples") or []]
	    if not results:
	        return []
	    # No entry identified itself as the local model: fall back to the first one.
	    return [dict(row) for row in results[0].get("samples") or []]


	def _is_broken(sample: dict[str, Any], threshold: int) -> bool:
	    """Tell whether ``sample`` is a weak result on one of the target axes.

	    A sample qualifies when its ``axis`` is in ``TARGET_AXES`` and its integer
	    ``score`` is strictly below ``threshold``. A missing or falsy score counts
	    as 0.
	    """
	    axis_name = str(sample.get("axis") or "")
	    if axis_name not in TARGET_AXES:
	        # The score is deliberately not parsed for axes we do not repair.
	        return False
	    score = int(sample.get("score", 0) or 0)
	    return score < threshold


	def _row(axis: str, variant: int, before_score: int, flags: list[str]) -> dict[str, Any]:
	    """Assemble one chat-format SFT example for ``axis``.

	    The user turn is the instruction from ``VARIANTS`` selected by ``variant``
	    (modulo the list length) followed by the axis prompt from ``PROMPTS``. The
	    assistant turn is the reference answer from ``ANSWERS``. The probe score
	    and flags are recorded under ``metadata``, where a score of 1 or less gets
	    a loss weight of 3.0 and anything higher gets 2.0.
	    """
	    instruction = VARIANTS[variant % len(VARIANTS)]
	    system_turn = {
	        "role": "system",
	        "content": (
	            "You are TinyMind in surgical repair mode. Follow the user's exact constraints, "
	            "produce valid formats, ground claims in evidence, and avoid unsupported claims."
	        ),
	    }
	    user_turn = {
	        "role": "user",
	        "content": f"{instruction}\nTask: {PROMPTS[axis]}",
	    }
	    assistant_turn = {"role": "assistant", "content": ANSWERS[axis]}

	    if before_score <= 1:
	        weight = 3.0
	    else:
	        weight = 2.0

	    details = {
	        "axis": axis,
	        "variant": variant,
	        "tinymind_score_before": before_score,
	        "probe_flags": flags,
	        "loss_weight": weight,
	        "quality_tags": ["surgical_sft", "weak_axis_repair", "exact_instruction", "no_eval_claim"],
	    }
	    return {
	        "source": "tinymind_surgical_weak_axis_sft",
	        "category": axis,
	        "messages": [system_turn, user_turn, assistant_turn],
	        "metadata": details,
	    }


	def build_surgical_weak_axis_sft(
	    out_dir: str | Path,
	    *,
	    probe_report: str | Path,
	    variants_per_axis: int = 8,
	    weak_threshold: int = 4,
	) -> dict[str, Any]:
	    """Build the targeted-repair SFT dataset from a probe report.

	    Every sample of the local model that `_is_broken` accepts (target axis,
	    score below ``weak_threshold``) yields ``variants_per_axis`` training rows.
	    The rows are written to ``surgical_weak_axis_sft.jsonl`` inside ``out_dir``
	    and a summary is written to ``surgical_weak_axis_report.json`` beside it.

	    Args:
	        out_dir: Output directory; created together with missing parents.
	        probe_report: Path of the JSON probe report to read.
	        variants_per_axis: Number of rows generated per weak sample.
	        weak_threshold: Scores strictly below this value count as weak.

	    Returns:
	        The summary dictionary that was written to the report file, including
	        the ``sft_path`` and ``json_path`` of the two output files.

	    Raises:
	        ValueError: If ``variants_per_axis`` is not positive.
	    """
	    if variants_per_axis <= 0:
	        raise ValueError("variants_per_axis must be positive")
	    out = Path(out_dir)
	    out.mkdir(parents=True, exist_ok=True)

	    report = _load_probe(probe_report)
	    target_samples = [sample for sample in _local_samples(report) if _is_broken(sample, weak_threshold)]

	    rows: list[dict[str, Any]] = []
	    for sample in target_samples:
	        axis = str(sample["axis"])
	        # `or []` keeps a null "flags" value from crashing, in line with the
	        # null-tolerant handling of "score" on the next line.
	        flags = [str(flag) for flag in sample.get("flags") or []]
	        before_score = int(sample.get("score", 0) or 0)
	        rows.extend(_row(axis, variant, before_score, flags) for variant in range(variants_per_axis))

	    sft_path = out / "surgical_weak_axis_sft.jsonl"
	    # newline="\n" keeps the JSONL line endings identical on every platform.
	    with sft_path.open("w", encoding="utf-8", newline="\n") as handle:
	        for item in rows:
	            handle.write(json.dumps(item, ensure_ascii=False, sort_keys=True) + "\n")

	    # NOTE(review): one entry per weak sample, so an axis that appears in
	    # several samples is listed (and counted) once per sample.
	    axes = [str(sample["axis"]) for sample in target_samples]
	    result = {
	        "schema": "tinymind.surgical_weak_axis_sft.v1",
	        "created_at": datetime.now(timezone.utc).isoformat(),
	        "probe_report": str(probe_report),
	        "sft_path": str(sft_path),
	        "summary": {
	            "target_axes": axes,
	            "target_axis_count": len(axes),
	            "sft_rows": len(rows),
	            "variants_per_axis": variants_per_axis,
	            "weak_threshold": weak_threshold,
	        },
	        "claim_gate": {
	            # Training is only allowed when at least one repair row exists.
	            "main_training_allowed": bool(rows),
	            "world_best_claim_allowed": False,
	            "reason": "This is targeted repair SFT, not external leaderboard evidence.",
	        },
	    }
	    report_path = out / "surgical_weak_axis_report.json"
	    # Set before serialising so the written report records its own location.
	    result["json_path"] = str(report_path)
	    report_path.write_text(json.dumps(result, ensure_ascii=False, indent=2, sort_keys=True) + "\n", encoding="utf-8")
	    return result


	if __name__ == "__main__":
	    # Script entry point: build the dataset from the default probe report
	    # location and print the resulting summary as indented JSON.
	    default_probe = Path(
	        "reports",
	        "broad_480b_eval_tinymind_current_25690527_145135",
	        "broad_480b_eval_report.json",
	    )
	    summary = build_surgical_weak_axis_sft("reports/surgical_weak_axis_sft", probe_report=default_probe)
	    print(json.dumps(summary, ensure_ascii=False, indent=2))

Xet Storage Details

Size: 9.72 kB
Xet hash: 6c5b1f1a5879c7a7c3fb256a6a77635eecfb06c5e5ab8bbfa8e3a64258b52e5e

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.