# bbkdevops's picture
# download
# raw
# 5.9 kB
from __future__ import annotations
from collections import Counter
from datetime import datetime, timezone
import hashlib
import json
from pathlib import Path
from typing import Any
# Version tag copied into each row's metadata and into the manifest report.
SCHEMA_VERSION = "tinymind-alignment-tool-sft-v1"

# System message used as the first turn of every generated conversation.
SYSTEM = (
    "You are TinyMind Alignment Tutor. "
    "Obey exact user constraints and emit valid tool schemas when requested."
)

# The two domain labels that generated rows are tagged with.
DOMAINS = (
    "alignment_constraint_following",
    "alignment_tool_calling",
)

# Instruction texts cycled through by the constraint-following rows.
CONSTRAINTS = (
    "answer exactly three bullets",
    "return only valid JSON with keys action,args,verification",
    "summarize in one sentence then provide a checklist",
    "refuse unsafe scope and provide a safe alternative",
    "extract parameters and ask one clarifying question only if required",
    "follow the latest user instruction unless it conflicts with system policy",
)

# (tool name, example arguments) pairs cycled through by the tool-calling rows.
TOOLS = (
    (
        "sandbox.run_code",
        {"language": "lua", "code": "return 2 + 2"},
    ),
    (
        "sandbox.proxy.http_get",
        {"url": "http://127.0.0.1:8000/health"},
    ),
    (
        "sandbox.env.run",
        {"name": "build-a", "argv": ["python", "-m", "pytest", "-q"]},
    ),
    (
        "fs.read",
        {"path": "reports/current_model_results/current_model_results.json"},
    ),
    (
        "fs.write",
        {"path": "reports/alignment/manifest.json", "content": '{"ok": true}'},
    ),
    (
        "cmd.run",
        {"argv": ["python", "-m", "train.cli", "current-model-results"]},
    ),
)
def _sha(text: str) -> str:
return hashlib.sha256(text.encode("utf-8")).hexdigest()
def _constraint_row(idx: int) -> dict[str, Any]:
    """Build one constraint-following training row.

    The constraint text is chosen by cycling through ``CONSTRAINTS`` with
    ``idx``. The assistant turn is a pretty-printed JSON object with the keys
    ``action``, ``args`` and ``verification``; the user turn is a Thai-language
    prompt that embeds the constraint text.

    Args:
        idx: Variant number; selects the constraint and is forwarded to
            ``_row`` unchanged.

    Returns:
        The row dictionary produced by ``_row``.
    """
    chosen = CONSTRAINTS[idx % len(CONSTRAINTS)]
    response = {
        "action": "respond",
        "args": {"format": chosen, "safety": "policy_checked"},
        "verification": [
            "constraint_satisfied",
            "no_extra_keys",
            "no_unverified_claim",
        ],
    }
    prompt = f"ทำตาม constraint นี้ให้เป๊ะ: {chosen}. ห้ามเติมคำเกริ่นที่ไม่จำเป็น."
    # ensure_ascii=False keeps any non-ASCII text readable in the output.
    answer = json.dumps(response, ensure_ascii=False, indent=2)
    return _row(
        domain="alignment_constraint_following",
        user=prompt,
        assistant=answer,
        idx=idx,
    )
def _tool_row(idx: int) -> dict[str, Any]:
    """Build one tool-calling training row.

    The ``(tool, arguments)`` pair is chosen by cycling through ``TOOLS`` with
    ``idx``. The assistant turn is a pretty-printed JSON object with the keys
    ``tool``, ``arguments`` and ``audit``; the user turn is a Thai-language
    prompt that names the tool.

    Args:
        idx: Variant number; selects the tool and is forwarded to ``_row``
            unchanged.

    Returns:
        The row dictionary produced by ``_row``.
    """
    tool_name, tool_args = TOOLS[idx % len(TOOLS)]
    call = {
        "tool": tool_name,
        "arguments": tool_args,
        # The same fixed audit block is attached to every tool call.
        "audit": {
            "requires_sandbox": True,
            "network_scope": "local_only",
            "verify_after": True,
        },
    }
    prompt = f"เลือก tool ที่ถูกต้องสำหรับงานนี้และตอบเป็น JSON เท่านั้น: {tool_name}"
    answer = json.dumps(call, ensure_ascii=False, indent=2)
    return _row(
        domain="alignment_tool_calling",
        user=prompt,
        assistant=answer,
        idx=idx,
    )
def _row(domain: str, user: str, assistant: str, idx: int) -> dict[str, Any]:
    """Assemble a chat-format record with its metadata.

    Args:
        domain: Domain label stored in the metadata.
        user: Content of the user turn.
        assistant: Content of the assistant turn.
        idx: Variant number stored in the metadata.

    Returns:
        A dict with ``messages`` (system, user, assistant turns in that
        order), a fixed ``source`` string, and ``metadata``. The metadata's
        ``fingerprint_sha256`` is the SHA-256 of ``domain``, ``idx``, ``user``
        and ``assistant`` joined with ``|``.
    """
    turns = (
        ("system", SYSTEM),
        ("user", user),
        ("assistant", assistant),
    )
    metadata = {
        "schema_version": SCHEMA_VERSION,
        "domain": domain,
        "variant": idx,
        "fingerprint_sha256": _sha(f"{domain}|{idx}|{user}|{assistant}"),
        "loss_weight": 1.45,
        # The same tag list is attached to every row, whatever its domain.
        "quality_tags": [
            "alignment_tool_sft",
            "constraint_following",
            "tool_calling",
            "json_schema",
            "response_only_loss_target",
        ],
    }
    return {
        "messages": [{"role": role, "content": content} for role, content in turns],
        "source": "alignment_tool_sft_surgery",
        "metadata": metadata,
    }
def _rows(target_records: int):
    """Yield ``target_records`` rows, alternating between the two domains.

    Even positions produce a constraint-following row and odd positions a
    tool-calling row. Each builder receives a per-domain counter
    (``position // 2``) rather than the global position. Passing the global
    position would hand ``_constraint_row`` only even numbers and
    ``_tool_row`` only odd numbers; because both builders cycle through
    even-length catalogues with ``idx % len(...)``, half of the constraints
    and half of the tools would never be emitted.

    As a consequence, the ``variant`` stored in a row's metadata counts rows
    within that row's domain.

    Args:
        target_records: Total number of rows to yield; values <= 0 yield
            nothing.

    Yields:
        Row dictionaries as returned by ``_constraint_row`` / ``_tool_row``.
    """
    for position in range(target_records):
        variant, is_tool = divmod(position, 2)
        if is_tool:
            yield _tool_row(variant)
        else:
            yield _constraint_row(variant)
def _write_jsonl(path: Path, rows) -> int:
path.parent.mkdir(parents=True, exist_ok=True)
count = 0
with path.open("w", encoding="utf-8", newline="\n") as f:
for row in rows:
f.write(json.dumps(row, ensure_ascii=False) + "\n")
count += 1
return count
def _file_sha(path: Path) -> str:
h = hashlib.sha256()
with path.open("rb") as f:
for chunk in iter(lambda: f.read(1024 * 1024), b""):
h.update(chunk)
return h.hexdigest()
def build_alignment_tool_sft_dataset(out_dir: str | Path, *, target_records: int = 30_000, eval_fraction: float = 0.02) -> dict[str, Any]:
    """Generate the train/eval JSONL files and a manifest, and return the manifest.

    Three files are written under ``out_dir``:
    ``alignment_tool_sft_train.jsonl``, ``alignment_tool_sft_eval.jsonl`` and
    ``alignment_tool_sft_manifest.json``.

    The eval split is stratified by domain: within each domain, every
    ``eval_mod``-th row (starting with the first) goes to the eval file and
    the rest go to train. Splitting on the global row index instead would
    interact with the even/odd domain alternation of ``_rows``; for any even
    ``eval_mod`` (including the default of 50) only one domain would ever
    reach the eval set.

    Args:
        out_dir: Directory that receives the output files.
        target_records: Number of rows to generate in total.
        eval_fraction: Approximate share of rows routed to eval. It is
            clamped to the range [0.001, 0.5] before being turned into the
            "every N-th row" step.

    Returns:
        The manifest dictionary: schema version, creation time (UTC,
        ISO-8601), record counts per split and per domain, output paths with
        their SHA-256 digests, the ``claim_gate`` flags, and the manifest path.
    """
    out = Path(out_dir)
    train_path = out / "alignment_tool_sft_train.jsonl"
    eval_path = out / "alignment_tool_sft_eval.jsonl"
    manifest_path = out / "alignment_tool_sft_manifest.json"

    # Clamping the fraction to [0.001, 0.5] keeps the step between 2 and 1000.
    eval_mod = max(1, round(1 / max(0.001, min(eval_fraction, 0.5))))

    train_rows: list[dict[str, Any]] = []
    eval_rows: list[dict[str, Any]] = []
    domain_counts: Counter[str] = Counter()
    for row in _rows(target_records):
        domain = row["metadata"]["domain"]
        # domain_counts[domain] is this row's 0-based position within its own
        # domain (a Counter returns 0 for unseen keys), so each domain gets
        # the same eval share.
        if domain_counts[domain] % eval_mod == 0:
            eval_rows.append(row)
        else:
            train_rows.append(row)
        domain_counts[domain] += 1

    train_count = _write_jsonl(train_path, train_rows)
    eval_count = _write_jsonl(eval_path, eval_rows)

    report: dict[str, Any] = {
        "schema_version": SCHEMA_VERSION,
        "created_at": datetime.now(timezone.utc).isoformat(),
        "summary": {
            "records_written": train_count + eval_count,
            "train_records": train_count,
            "eval_records": eval_count,
            "domain_counts": dict(sorted(domain_counts.items())),
            "loss_weight": 1.45,
        },
        "outputs": {
            "train_jsonl": str(train_path),
            "eval_jsonl": str(eval_path),
            # Digests are taken from the files as written to disk.
            "train_sha256": _file_sha(train_path),
            "eval_sha256": _file_sha(eval_path),
        },
        "claim_gate": {
            "alignment_tool_sft_ready": target_records >= 100,
            "world_best_claim_allowed": False,
        },
    }
    report["manifest_path"] = str(manifest_path)
    manifest_path.write_text(
        json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True),
        encoding="utf-8",
    )
    return report

# Xet Storage Details
#
# Size:
# 5.9 kB
# ·
# Xet hash:
# 81e4ad6642f61d6a19752ab5cd122fef4badec8d2e4f99a75c661ffda68e6c3f
#
# Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.