Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

Files

xet

bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /data /logic_agent_code_forge.py

bbkdevops

30 days ago

download

raw

8.87 kB

	from __future__ import annotations

	from collections import Counter
	from datetime import datetime, timezone
	import hashlib
	import json
	from pathlib import Path
	from typing import Any


	# Version tag written into every record's metadata and into the manifest,
	# so consumers can tell which generator layout produced a file.
	SCHEMA_VERSION = "tinymind-logic-agent-code-surgery-v1"

	# System prompt used as the first message of every generated conversation.
	SYSTEM = (
	"You are TinyMind Logic-Agent-Code Tutor. Prioritize instruction following, tool grounding, "
	"code correctness, verification, and concise repair loops. Use defensive boundaries for security topics."
	)

	# Curriculum table: domain name -> tuple of skill descriptions.
	# _rows() flattens this into (domain, skill) pairs and cycles through them,
	# so the dict's insertion order determines the record order.
	DOMAINS = {
	"instruction_following": (
	"follow a nested instruction hierarchy",
	"repair an answer that violated a JSON schema",
	"separate system instruction from raw quoted data",
	"answer briefly when the user requests a short answer",
	"ask for the missing parameter only when execution would be unsafe",
	),
	"tool_grounding": (
	"choose a sandbox tool and exact arguments",
	"summarize stdout/stderr without inventing tool output",
	"recover from a failed command using the smallest next probe",
	"respect localhost-only proxy policy",
	"write an audit manifest after a tool run",
	),
	"coding_python": (
	"write a tested Python parser",
	"fix a failing pytest from the traceback",
	"design a streaming JSONL pipeline",
	"implement a safe file walker",
	"add deterministic hashing to a dataset builder",
	),
	"coding_cpp_rust": (
	"explain a C++ ownership bug",
	"design a Rust FFI boundary",
	"write a CUDA benchmark validation checklist",
	"compare register pressure and memory bandwidth",
	"build a small CLI with reproducible output",
	),
	"reasoning_logic": (
	"solve by invariants instead of memorization",
	"find a counterexample",
	"convert a vague request into testable claims",
	"rank hypotheses by evidence",
	"derive a pass/fail rubric",
	),
	"data_tooling": (
	"balance a data recipe",
	"detect contamination risk",
	"create a train/eval split without leakage",
	"assign domain loss weights",
	"write a model-card limitation section",
	),
	}

	# Task verbs interpolated into the user prompt and the assistant template;
	# _rows() advances to the next frame after each full pass over the skills.
	FRAMES = ("plan", "implement", "debug", "verify", "refactor", "audit", "explain", "score")
	# Language codes; _record() picks one with `variant % len(LANGS)`.
	LANGS = ("en", "th")


	def _sha(text: str) -> str:
	    """Return the hex-encoded SHA-256 digest of *text* encoded as UTF-8."""
	    return hashlib.sha256(text.encode("utf-8")).hexdigest()


	def _assistant(domain: str, skill: str, frame: str, variant: int, lang: str) -> str:
	    """Render the templated assistant reply for one training record.

	    Args:
	        domain: Domain name, interpolated into the closing line.
	        skill: Skill description, quoted in backticks in the opening line.
	        frame: Task verb (e.g. "plan", "debug") interpolated into the opening line.
	        variant: Record index, interpolated into the closing line.
	        lang: ``"th"`` selects the Thai template; any other value selects English.

	    Returns:
	        A five-step checklist (requirements, minimal action, verification,
	        repair, summary) framed by an opening and a closing sentence.
	    """
	    if lang == "th":
	        return (
	            f"คำตอบสำหรับ `{skill}` ต้องเริ่มจากเป้าหมายที่ตรวจได้ แล้วค่อยทำ {frame} แบบไม่เดา.\n\n"
	            "1. แยกข้อกำหนด: ระบุ input, output, constraint, และสิ่งที่ห้ามทำ.\n"
	            "2. ลงมือแบบแคบ: เลือกขั้นตอนหรือโค้ดที่เล็กที่สุดซึ่งพิสูจน์สมมติฐานได้.\n"
	            "3. ตรวจผล: ใช้ test, schema, log, hash, หรือ manifest แทนความรู้สึกว่า 'น่าจะถูก'.\n"
	            "4. ซ่อม: ถ้าผิด ให้แก้จากหลักฐานล่าสุด ไม่เขียนทับทั้งระบบโดยไม่มีเหตุผล.\n"
	            "5. สรุป: บอกสิ่งที่ทำได้จริง สิ่งที่ยังไม่รู้ และขั้นตอนถัดไป.\n\n"
	            f"โดเมน `{domain}` รอบ {variant}: ให้รักษาโครงนี้เพื่อฝึก instruction/tool/code reliability ไม่ใช่ท่องจำคำตอบ."
	        )
	    # English is the fallback for every language code other than "th".
	    return (
	        f"For `{skill}`, start with a verifiable target, then {frame} without guessing.\n\n"
	        "1. Requirements: identify inputs, outputs, constraints, and forbidden actions.\n"
	        "2. Minimal action: choose the smallest step or code change that tests the hypothesis.\n"
	        "3. Verification: use tests, schemas, logs, hashes, or manifests instead of confidence-by-wording.\n"
	        "4. Repair: if it fails, patch from the newest evidence rather than rewriting unrelated parts.\n"
	        "5. Summary: state what is proven, what remains unknown, and the next action.\n\n"
	        f"Domain `{domain}` variant {variant}: this trains instruction/tool/code reliability, not memorized answers."
	    )


	def _record(domain: str, skill: str, frame: str, variant: int) -> dict[str, Any]:
	    """Build one chat-format SFT record for a (domain, skill, frame, variant) tuple.

	    The language is chosen from ``LANGS`` by ``variant % len(LANGS)``, so
	    consecutive variants alternate between the configured languages.

	    Args:
	        domain: Domain name recorded in the metadata.
	        skill: Skill description quoted in the user prompt.
	        frame: Task verb used in the user prompt and assistant reply.
	        variant: Record index; selects the language and is part of the fingerprint.

	    Returns:
	        A dict with ``messages`` (system, user, assistant), ``source`` and
	        ``metadata`` (including a SHA-256 fingerprint of the generating tuple).
	    """
	    lang = LANGS[variant % len(LANGS)]
	    user = (
	        f"ช่วย {frame} งาน `{skill}` แบบละเอียดแต่ไม่หลุดข้อกำหนด และต้องมีวิธีตรวจผล"
	        if lang == "th"
	        else f"{frame.title()} the task `{skill}` with strict instruction following and explicit verification."
	    )
	    # Plain "|" separators: the previous "\|" was an invalid escape sequence
	    # (markdown-escaping residue), not part of the intended fingerprint input.
	    fingerprint = _sha(f"{domain}|{skill}|{frame}|{variant}|{lang}")
	    return {
	        "messages": [
	            {"role": "system", "content": SYSTEM},
	            {"role": "user", "content": user},
	            {"role": "assistant", "content": _assistant(domain, skill, frame, variant, lang)},
	        ],
	        "source": "logic_agent_code_surgery",
	        "metadata": {
	            "schema_version": SCHEMA_VERSION,
	            "domain": domain,
	            "skill": skill,
	            "frame": frame,
	            "variant": variant,
	            "language": lang,
	            "fingerprint_sha256": fingerprint,
	            "loss_weight": 1.25,
	            "quality_tags": [
	                "logic_agent_code",
	                "instruction_following",
	                "tool_grounding",
	                "code_reasoning",
	                "verification",
	                "short_sft",
	            ],
	        },
	    }


	def _write_jsonl(path: Path, rows) -> int:
	    """Write *rows* to *path* as JSON Lines and return how many were written.

	    Parent directories are created if missing and an existing file is
	    overwritten. Output is UTF-8 with non-ASCII characters kept literal
	    (``ensure_ascii=False``) and ``"\\n"`` line endings on every platform.

	    Args:
	        path: Destination file.
	        rows: Iterable of JSON-serializable objects; consumed exactly once,
	            so a generator is acceptable.

	    Returns:
	        The number of lines written.
	    """
	    count = 0
	    path.parent.mkdir(parents=True, exist_ok=True)
	    # newline="\n" disables platform newline translation so the file bytes,
	    # and therefore any hash taken over them, do not depend on the OS.
	    with path.open("w", encoding="utf-8", newline="\n") as f:
	        for row in rows:
	            f.write(json.dumps(row, ensure_ascii=False) + "\n")
	            count += 1
	    return count


	def _file_sha(path: Path) -> str:
	    """Return the hex-encoded SHA-256 digest of the file at *path*.

	    The file is read in binary mode in 1 MiB chunks, so memory use does not
	    grow with file size.
	    """
	    h = hashlib.sha256()
	    with path.open("rb") as f:
	        # iter(callable, sentinel) keeps reading until read() returns b"" (EOF).
	        for chunk in iter(lambda: f.read(1024 * 1024), b""):
	            h.update(chunk)
	    return h.hexdigest()


	def _rows(target_records: int):
	    """Yield *target_records* records, cycling through every (domain, skill) pair.

	    Record ``i`` uses skill slot ``i % len(skills)``. The frame advances by
	    one after each full pass over the skills and wraps around ``FRAMES``.
	    The record index ``i`` is passed to ``_record`` as the variant.

	    Args:
	        target_records: Number of records to produce; zero or a negative
	            value yields nothing.
	    """
	    skills = [(domain, skill) for domain, items in DOMAINS.items() for skill in items]
	    skill_count = len(skills)
	    for i in range(target_records):
	        # cycle = completed passes over the skill list; slot = position in this pass.
	        cycle, slot = divmod(i, skill_count)
	        domain, skill = skills[slot]
	        frame = FRAMES[cycle % len(FRAMES)]
	        yield _record(domain, skill, frame, i)


	def build_logic_agent_code_dataset(out_dir: str | Path, *, target_records: int = 50_000, eval_fraction: float = 0.02) -> dict[str, Any]:
	    """Generate the train/eval JSONL files and a manifest, and return the manifest.

	    Three files are written under *out_dir*:
	    ``logic_agent_code_train.jsonl``, ``logic_agent_code_eval.jsonl`` and
	    ``logic_agent_code_manifest.json``.

	    Args:
	        out_dir: Output directory; created if it does not exist.
	        target_records: Total number of records to generate.
	        eval_fraction: Approximate share of records routed to the eval file.
	            It is clamped to the range [0.001, 0.5] and converted to a stride:
	            every ``round(1 / fraction)``-th record, starting with index 0,
	            goes to eval. A value of 0 therefore still yields eval records.

	    Returns:
	        The manifest dict: schema version, UTC creation timestamp, record and
	        per-domain/per-language counts, output paths with SHA-256 digests,
	        the claim gate, and the manifest's own path.
	    """
	    out = Path(out_dir)
	    train_path = out / "logic_agent_code_train.jsonl"
	    eval_path = out / "logic_agent_code_eval.jsonl"
	    manifest_path = out / "logic_agent_code_manifest.json"
	    # Stride between eval records; the clamp keeps it within 2..1000.
	    eval_mod = max(1, round(1 / max(0.001, min(eval_fraction, 0.5))))
	    train_rows = []
	    eval_rows = []
	    domain_counts: Counter[str] = Counter()
	    lang_counts: Counter[str] = Counter()
	    for idx, row in enumerate(_rows(target_records)):
	        domain_counts[row["metadata"]["domain"]] += 1
	        lang_counts[row["metadata"]["language"]] += 1
	        if idx % eval_mod == 0:
	            eval_rows.append(row)
	        else:
	            train_rows.append(row)
	    train_count = _write_jsonl(train_path, train_rows)
	    eval_count = _write_jsonl(eval_path, eval_rows)
	    report: dict[str, Any] = {
	        "schema_version": SCHEMA_VERSION,
	        "created_at": datetime.now(timezone.utc).isoformat(),
	        "summary": {
	            "records_written": train_count + eval_count,
	            "train_records": train_count,
	            "eval_records": eval_count,
	            "domain_counts": dict(sorted(domain_counts.items())),
	            "language_counts": dict(sorted(lang_counts.items())),
	            "loss_weight": 1.25,
	        },
	        "outputs": {
	            "train_jsonl": str(train_path),
	            "eval_jsonl": str(eval_path),
	            # Digests are taken from the bytes on disk, after both files are closed.
	            "train_sha256": _file_sha(train_path),
	            "eval_sha256": _file_sha(eval_path),
	        },
	        "claim_gate": {
	            "logic_agent_code_ready": target_records >= 100,
	            "world_best_claim_allowed": False,
	            "reason": "This is a balancing SFT curriculum for logic/agent/code behavior, not a capability claim.",
	        },
	    }
	    report["manifest_path"] = str(manifest_path)
	    manifest_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
	    return report

Xet Storage Details

Size: 8.87 kB
Xet hash: c8eed85efdf5545fef7356f967e9eb60e20e4c27ee1e3c198c53542f728d7b66

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.