bbkdevops's picture
download
raw
8.87 kB
from __future__ import annotations
from collections import Counter
from datetime import datetime, timezone
import hashlib
import json
from pathlib import Path
from typing import Any
# Version tag written into each record's metadata and into the manifest.
SCHEMA_VERSION = "tinymind-logic-agent-code-surgery-v1"

# System prompt placed at the head of every generated conversation.
SYSTEM = (
    "You are TinyMind Logic-Agent-Code Tutor. Prioritize instruction following, tool grounding, "
    "code correctness, verification, and concise repair loops. Use defensive boundaries for security topics."
)

# Curriculum table: domain name -> tuple of skill descriptions.
# Insertion order matters: `_rows` flattens this mapping in order and cycles
# through the resulting (domain, skill) pairs.
DOMAINS = {
    "instruction_following": (
        "follow a nested instruction hierarchy",
        "repair an answer that violated a JSON schema",
        "separate system instruction from raw quoted data",
        "answer briefly when the user requests a short answer",
        "ask for the missing parameter only when execution would be unsafe",
    ),
    "tool_grounding": (
        "choose a sandbox tool and exact arguments",
        "summarize stdout/stderr without inventing tool output",
        "recover from a failed command using the smallest next probe",
        "respect localhost-only proxy policy",
        "write an audit manifest after a tool run",
    ),
    "coding_python": (
        "write a tested Python parser",
        "fix a failing pytest from the traceback",
        "design a streaming JSONL pipeline",
        "implement a safe file walker",
        "add deterministic hashing to a dataset builder",
    ),
    "coding_cpp_rust": (
        "explain a C++ ownership bug",
        "design a Rust FFI boundary",
        "write a CUDA benchmark validation checklist",
        "compare register pressure and memory bandwidth",
        "build a small CLI with reproducible output",
    ),
    "reasoning_logic": (
        "solve by invariants instead of memorization",
        "find a counterexample",
        "convert a vague request into testable claims",
        "rank hypotheses by evidence",
        "derive a pass/fail rubric",
    ),
    "data_tooling": (
        "balance a data recipe",
        "detect contamination risk",
        "create a train/eval split without leakage",
        "assign domain loss weights",
        "write a model-card limitation section",
    ),
}

# Task verbs; `_rows` advances to the next frame after each full pass over the skills.
FRAMES = (
    "plan",
    "implement",
    "debug",
    "verify",
    "refactor",
    "audit",
    "explain",
    "score",
)

# Output languages; `_record` selects one with `variant % len(LANGS)`.
LANGS = (
    "en",
    "th",
)
def _sha(text: str) -> str:
    """Return the hex SHA-256 digest of ``text`` after encoding it as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()
def _assistant(domain: str, skill: str, frame: str, variant: int, lang: str) -> str:
    """Render the templated assistant reply for one record.

    Args:
        domain: Domain name interpolated into the closing line.
        skill: Skill description interpolated into the opening line.
        frame: Task verb interpolated into the opening line.
        variant: Record number interpolated into the closing line.
        lang: ``"th"`` selects the Thai template; any other value selects
            the English template.

    Returns:
        An opening line, five numbered steps, and a closing line, joined
        into a single string.
    """
    if lang != "th":
        english_parts = (
            f"For `{skill}`, start with a verifiable target, then {frame} without guessing.\n\n",
            "1. Requirements: identify inputs, outputs, constraints, and forbidden actions.\n",
            "2. Minimal action: choose the smallest step or code change that tests the hypothesis.\n",
            "3. Verification: use tests, schemas, logs, hashes, or manifests instead of confidence-by-wording.\n",
            "4. Repair: if it fails, patch from the newest evidence rather than rewriting unrelated parts.\n",
            "5. Summary: state what is proven, what remains unknown, and the next action.\n\n",
            f"Domain `{domain}` variant {variant}: this trains instruction/tool/code reliability, not memorized answers.",
        )
        return "".join(english_parts)

    thai_parts = (
        f"คำตอบสำหรับ `{skill}` ต้องเริ่มจากเป้าหมายที่ตรวจได้ แล้วค่อยทำ {frame} แบบไม่เดา.\n\n",
        "1. แยกข้อกำหนด: ระบุ input, output, constraint, และสิ่งที่ห้ามทำ.\n",
        "2. ลงมือแบบแคบ: เลือกขั้นตอนหรือโค้ดที่เล็กที่สุดซึ่งพิสูจน์สมมติฐานได้.\n",
        "3. ตรวจผล: ใช้ test, schema, log, hash, หรือ manifest แทนความรู้สึกว่า 'น่าจะถูก'.\n",
        "4. ซ่อม: ถ้าผิด ให้แก้จากหลักฐานล่าสุด ไม่เขียนทับทั้งระบบโดยไม่มีเหตุผล.\n",
        "5. สรุป: บอกสิ่งที่ทำได้จริง สิ่งที่ยังไม่รู้ และขั้นตอนถัดไป.\n\n",
        f"โดเมน `{domain}` รอบ {variant}: ให้รักษาโครงนี้เพื่อฝึก instruction/tool/code reliability ไม่ใช่ท่องจำคำตอบ.",
    )
    return "".join(thai_parts)
def _record(domain: str, skill: str, frame: str, variant: int) -> dict[str, Any]:
    """Assemble one chat-style training record.

    The language alternates with ``variant`` (``LANGS[variant % len(LANGS)]``).
    The record holds a system/user/assistant message triple, a fixed
    ``source`` tag, and a metadata dict that includes a SHA-256 fingerprint
    of ``domain|skill|frame|variant|lang``.

    Args:
        domain: Domain name stored in metadata and passed to the template.
        skill: Skill description used in the user and assistant text.
        frame: Task verb used in the user and assistant text.
        variant: Record number; also determines the language.

    Returns:
        A dict with keys ``messages``, ``source`` and ``metadata``.
    """
    language = LANGS[variant % len(LANGS)]

    if language == "th":
        prompt = f"ช่วย {frame} งาน `{skill}` แบบละเอียดแต่ไม่หลุดข้อกำหนด และต้องมีวิธีตรวจผล"
    else:
        prompt = f"{frame.title()} the task `{skill}` with strict instruction following and explicit verification."

    identity = f"{domain}|{skill}|{frame}|{variant}|{language}"
    digest = _sha(identity)

    conversation = [
        {"role": "system", "content": SYSTEM},
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": _assistant(domain, skill, frame, variant, language)},
    ]
    tags = [
        "logic_agent_code",
        "instruction_following",
        "tool_grounding",
        "code_reasoning",
        "verification",
        "short_sft",
    ]
    details = {
        "schema_version": SCHEMA_VERSION,
        "domain": domain,
        "skill": skill,
        "frame": frame,
        "variant": variant,
        "language": language,
        "fingerprint_sha256": digest,
        "loss_weight": 1.25,
        "quality_tags": tags,
    }
    return {
        "messages": conversation,
        "source": "logic_agent_code_surgery",
        "metadata": details,
    }
def _write_jsonl(path: Path, rows) -> int:
    """Write ``rows`` to ``path`` as JSON Lines and return how many were written.

    Parent directories are created when missing and an existing file is
    overwritten. Each row becomes one line of JSON with non-ASCII characters
    kept as-is, encoded as UTF-8 and terminated by ``"\\n"``.

    Args:
        path: Destination file.
        rows: Iterable of JSON-serializable objects; consumed once.

    Returns:
        The number of lines written (0 for an empty iterable).
    """
    written = 0
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8", newline="\n") as handle:
        # enumerate from 1 so the last index is the running total
        for written, item in enumerate(rows, start=1):
            line = json.dumps(item, ensure_ascii=False)
            handle.write(line + "\n")
    return written
def _file_sha(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at ``path``.

    The file is read in binary mode in 1 MiB pieces, so it is never held in
    memory as a whole.
    """
    piece_size = 1024 * 1024
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            piece = stream.read(piece_size)
            if piece == b"":
                break
            digest.update(piece)
    return digest.hexdigest()
def _rows(target_records: int):
    """Yield ``target_records`` records, cycling through skills and frames.

    Record ``i`` uses the (domain, skill) pair at position ``i`` modulo the
    number of pairs in ``DOMAINS``. The frame advances by one each time the
    whole list of pairs has been traversed, and ``i`` is passed to
    ``_record`` as the variant.
    """
    catalog = []
    for domain_name, skill_names in DOMAINS.items():
        for skill_name in skill_names:
            catalog.append((domain_name, skill_name))

    for number in range(target_records):
        completed_passes, slot = divmod(number, len(catalog))
        domain_name, skill_name = catalog[slot]
        frame_name = FRAMES[completed_passes % len(FRAMES)]
        yield _record(domain_name, skill_name, frame_name, number)
def build_logic_agent_code_dataset(out_dir: str | Path, *, target_records: int = 50_000, eval_fraction: float = 0.02) -> dict[str, Any]:
    """Generate the dataset, split it into train/eval JSONL files, and write a manifest.

    Three files are written under ``out_dir`` (existing files are
    overwritten): ``logic_agent_code_train.jsonl``,
    ``logic_agent_code_eval.jsonl`` and ``logic_agent_code_manifest.json``.

    Eval selection: the record stream is cut into consecutive windows of
    ``eval_mod`` records and exactly one record per window goes to eval.
    The position inside each window comes from a SHA-256 hash of the window
    number, so the split is reproducible across runs. A fixed position
    (for example always the first record of the window) must not be used:
    ``_rows`` is periodic in language and skill, so a fixed stride lands on
    the same language and the same few skills every time, producing an eval
    set that does not cover the curriculum.

    Args:
        out_dir: Directory that receives the three output files.
        target_records: Total number of records to generate.
        eval_fraction: Approximate share of records sent to eval. Clamped
            to the range 0.001-0.5 and converted to a window length of
            ``round(1 / fraction)`` records.

    Returns:
        The manifest as a dict: schema version, creation timestamp (UTC),
        summary counts, output paths with SHA-256 digests, the claim gate,
        and the manifest path.
    """
    out = Path(out_dir)
    train_path = out / "logic_agent_code_train.jsonl"
    eval_path = out / "logic_agent_code_eval.jsonl"
    manifest_path = out / "logic_agent_code_manifest.json"

    # Window length: one eval record out of every `eval_mod` records.
    eval_mod = max(1, round(1 / max(0.001, min(eval_fraction, 0.5))))

    train_rows = []
    eval_rows = []
    domain_counts: Counter[str] = Counter()
    lang_counts: Counter[str] = Counter()
    eval_slot = 0
    for idx, row in enumerate(_rows(target_records)):
        window, position = divmod(idx, eval_mod)
        if position == 0:
            # New window: derive this window's eval position from a hash.
            # The final window may be shorter than eval_mod, so the modulus
            # is its real length; that guarantees every window (including a
            # partial one) contributes exactly one eval record.
            window_size = min(eval_mod, target_records - idx)
            seed = hashlib.sha256(
                f"{SCHEMA_VERSION}|eval-window|{window}".encode("utf-8")
            ).hexdigest()
            eval_slot = int(seed[:16], 16) % window_size
        domain_counts[row["metadata"]["domain"]] += 1
        lang_counts[row["metadata"]["language"]] += 1
        if position == eval_slot:
            eval_rows.append(row)
        else:
            train_rows.append(row)

    train_count = _write_jsonl(train_path, train_rows)
    eval_count = _write_jsonl(eval_path, eval_rows)

    report: dict[str, Any] = {
        "schema_version": SCHEMA_VERSION,
        "created_at": datetime.now(timezone.utc).isoformat(),
        "summary": {
            "records_written": train_count + eval_count,
            "train_records": train_count,
            "eval_records": eval_count,
            "domain_counts": dict(sorted(domain_counts.items())),
            "language_counts": dict(sorted(lang_counts.items())),
            "loss_weight": 1.25,
        },
        "outputs": {
            "train_jsonl": str(train_path),
            "eval_jsonl": str(eval_path),
            # Digests are taken from the files on disk, after writing.
            "train_sha256": _file_sha(train_path),
            "eval_sha256": _file_sha(eval_path),
        },
        "claim_gate": {
            "logic_agent_code_ready": target_records >= 100,
            "world_best_claim_allowed": False,
            "reason": "This is a balancing SFT curriculum for logic/agent/code behavior, not a capability claim.",
        },
    }
    report["manifest_path"] = str(manifest_path)
    manifest_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
    return report

Xet Storage Details

Size:
8.87 kB
·
Xet hash:
c8eed85efdf5545fef7356f967e9eb60e20e4c27ee1e3c198c53542f728d7b66

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.