Buckets:
bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /data /logic_agent_code_forge.py
| from __future__ import annotations | |
| from collections import Counter | |
| from datetime import datetime, timezone | |
| import hashlib | |
| import json | |
| from pathlib import Path | |
| from typing import Any | |
# Version tag copied into every record's metadata and into the manifest.
SCHEMA_VERSION = "tinymind-logic-agent-code-surgery-v1"

# System prompt used as the first message of every generated conversation.
SYSTEM = (
    "You are TinyMind Logic-Agent-Code Tutor. "
    "Prioritize instruction following, tool grounding, code correctness, verification, "
    "and concise repair loops. "
    "Use defensive boundaries for security topics."
)
# Curriculum catalogue: domain name -> skill descriptions.
# Insertion order matters: the row generator flattens this mapping in order
# and cycles through the resulting (domain, skill) pairs.
DOMAINS: dict[str, tuple[str, ...]] = {
    # Obeying instruction hierarchies, schemas and length requests.
    "instruction_following": (
        "follow a nested instruction hierarchy",
        "repair an answer that violated a JSON schema",
        "separate system instruction from raw quoted data",
        "answer briefly when the user requests a short answer",
        "ask for the missing parameter only when execution would be unsafe",
    ),
    # Picking tools, reporting their output faithfully, recovering from failures.
    "tool_grounding": (
        "choose a sandbox tool and exact arguments",
        "summarize stdout/stderr without inventing tool output",
        "recover from a failed command using the smallest next probe",
        "respect localhost-only proxy policy",
        "write an audit manifest after a tool run",
    ),
    # Python implementation and debugging tasks.
    "coding_python": (
        "write a tested Python parser",
        "fix a failing pytest from the traceback",
        "design a streaming JSONL pipeline",
        "implement a safe file walker",
        "add deterministic hashing to a dataset builder",
    ),
    # Systems-language and GPU-oriented tasks.
    "coding_cpp_rust": (
        "explain a C++ ownership bug",
        "design a Rust FFI boundary",
        "write a CUDA benchmark validation checklist",
        "compare register pressure and memory bandwidth",
        "build a small CLI with reproducible output",
    ),
    # General reasoning and hypothesis testing.
    "reasoning_logic": (
        "solve by invariants instead of memorization",
        "find a counterexample",
        "convert a vague request into testable claims",
        "rank hypotheses by evidence",
        "derive a pass/fail rubric",
    ),
    # Dataset construction and documentation hygiene.
    "data_tooling": (
        "balance a data recipe",
        "detect contamination risk",
        "create a train/eval split without leakage",
        "assign domain loss weights",
        "write a model-card limitation section",
    ),
}
# Task verbs; the row generator advances to the next frame after each full
# pass over the skill catalogue.
FRAMES = (
    "plan",
    "implement",
    "debug",
    "verify",
    "refactor",
    "audit",
    "explain",
    "score",
)

# Output languages; a record's language is chosen from its variant number.
LANGS = (
    "en",
    "th",
)
| def _sha(text: str) -> str: | |
| return hashlib.sha256(text.encode("utf-8")).hexdigest() | |
| def _assistant(domain: str, skill: str, frame: str, variant: int, lang: str) -> str: | |
| if lang == "th": | |
| return ( | |
| f"คำตอบสำหรับ `{skill}` ต้องเริ่มจากเป้าหมายที่ตรวจได้ แล้วค่อยทำ {frame} แบบไม่เดา.\n\n" | |
| "1. แยกข้อกำหนด: ระบุ input, output, constraint, และสิ่งที่ห้ามทำ.\n" | |
| "2. ลงมือแบบแคบ: เลือกขั้นตอนหรือโค้ดที่เล็กที่สุดซึ่งพิสูจน์สมมติฐานได้.\n" | |
| "3. ตรวจผล: ใช้ test, schema, log, hash, หรือ manifest แทนความรู้สึกว่า 'น่าจะถูก'.\n" | |
| "4. ซ่อม: ถ้าผิด ให้แก้จากหลักฐานล่าสุด ไม่เขียนทับทั้งระบบโดยไม่มีเหตุผล.\n" | |
| "5. สรุป: บอกสิ่งที่ทำได้จริง สิ่งที่ยังไม่รู้ และขั้นตอนถัดไป.\n\n" | |
| f"โดเมน `{domain}` รอบ {variant}: ให้รักษาโครงนี้เพื่อฝึก instruction/tool/code reliability ไม่ใช่ท่องจำคำตอบ." | |
| ) | |
| return ( | |
| f"For `{skill}`, start with a verifiable target, then {frame} without guessing.\n\n" | |
| "1. Requirements: identify inputs, outputs, constraints, and forbidden actions.\n" | |
| "2. Minimal action: choose the smallest step or code change that tests the hypothesis.\n" | |
| "3. Verification: use tests, schemas, logs, hashes, or manifests instead of confidence-by-wording.\n" | |
| "4. Repair: if it fails, patch from the newest evidence rather than rewriting unrelated parts.\n" | |
| "5. Summary: state what is proven, what remains unknown, and the next action.\n\n" | |
| f"Domain `{domain}` variant {variant}: this trains instruction/tool/code reliability, not memorized answers." | |
| ) | |
def _record(domain: str, skill: str, frame: str, variant: int) -> dict[str, Any]:
    """Build one chat-format training record.

    The language is taken from ``LANGS`` by ``variant`` modulo the number of
    languages. The record holds a system/user/assistant message triple, a
    fixed ``source`` label, and a metadata block that includes a SHA-256
    fingerprint of ``domain|skill|frame|variant|lang``.
    """
    lang = LANGS[variant % len(LANGS)]

    if lang == "th":
        prompt = f"ช่วย {frame} งาน `{skill}` แบบละเอียดแต่ไม่หลุดข้อกำหนด และต้องมีวิธีตรวจผล"
    else:
        prompt = f"{frame.title()} the task `{skill}` with strict instruction following and explicit verification."

    conversation = [
        {"role": "system", "content": SYSTEM},
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": _assistant(domain, skill, frame, variant, lang)},
    ]

    # Key order is kept stable because records are serialized as-is to JSONL.
    metadata = {
        "schema_version": SCHEMA_VERSION,
        "domain": domain,
        "skill": skill,
        "frame": frame,
        "variant": variant,
        "language": lang,
        "fingerprint_sha256": _sha(f"{domain}|{skill}|{frame}|{variant}|{lang}"),
        "loss_weight": 1.25,
        "quality_tags": [
            "logic_agent_code",
            "instruction_following",
            "tool_grounding",
            "code_reasoning",
            "verification",
            "short_sft",
        ],
    }

    return {
        "messages": conversation,
        "source": "logic_agent_code_surgery",
        "metadata": metadata,
    }
| def _write_jsonl(path: Path, rows) -> int: | |
| count = 0 | |
| path.parent.mkdir(parents=True, exist_ok=True) | |
| with path.open("w", encoding="utf-8", newline="\n") as f: | |
| for row in rows: | |
| f.write(json.dumps(row, ensure_ascii=False) + "\n") | |
| count += 1 | |
| return count | |
| def _file_sha(path: Path) -> str: | |
| h = hashlib.sha256() | |
| with path.open("rb") as f: | |
| for chunk in iter(lambda: f.read(1024 * 1024), b""): | |
| h.update(chunk) | |
| return h.hexdigest() | |
def _rows(target_records: int):
    """Yield ``target_records`` records by cycling through the skill catalogue.

    ``DOMAINS`` is flattened, in insertion order, into (domain, skill) pairs.
    Record ``i`` uses the pair at ``i`` modulo the catalogue size, and the
    frame advances by one (wrapping around ``FRAMES``) after each full pass
    over the catalogue. The record index is passed on as the variant number.

    NOTE(review): the catalogue has an even number of entries (30) and the
    record builder picks the language from the variant's parity, so each
    skill is always rendered in the same language. Confirm this is intended.
    """
    catalogue = [(domain, skill) for domain, group in DOMAINS.items() for skill in group]
    for index in range(target_records):
        completed_passes, slot = divmod(index, len(catalogue))
        domain, skill = catalogue[slot]
        yield _record(domain, skill, FRAMES[completed_passes % len(FRAMES)], index)
def build_logic_agent_code_dataset(
    out_dir: str | Path,
    *,
    target_records: int = 50_000,
    eval_fraction: float = 0.02,
) -> dict[str, Any]:
    """Generate the curriculum, split it into train/eval JSONL files, and write a manifest.

    Args:
        out_dir: Directory that receives ``logic_agent_code_train.jsonl``,
            ``logic_agent_code_eval.jsonl`` and ``logic_agent_code_manifest.json``.
        target_records: Total number of records to generate. Values <= 0
            produce two empty JSONL files.
        eval_fraction: Approximate share of records routed to the eval file.
            It is clamped to ``[0.001, 0.5]`` and turned into an integer
            modulus ``round(1 / fraction)``, so the realized share is close
            to, but not exactly, the requested value.

    Returns:
        The manifest dictionary (also written to disk) with record counts,
        per-domain and per-language tallies, output paths, file SHA-256
        digests, and the claim gate.
    """
    out = Path(out_dir)
    train_path = out / "logic_agent_code_train.jsonl"
    eval_path = out / "logic_agent_code_eval.jsonl"
    manifest_path = out / "logic_agent_code_manifest.json"

    eval_mod = max(1, round(1 / max(0.001, min(eval_fraction, 0.5))))

    train_rows = []
    eval_rows = []
    domain_counts: Counter[str] = Counter()
    lang_counts: Counter[str] = Counter()

    for row in _rows(target_records):
        meta = row["metadata"]
        domain_counts[meta["domain"]] += 1
        lang_counts[meta["language"]] += 1

        # Route by the record's own fingerprint instead of its position.
        # The generator rotates through 30 skills and alternates language on
        # every index, so taking every 50th index (the default stride) only
        # ever hits even indices and skill slots 0, 10 and 20. That made the
        # eval file English-only and limited to three skills. A hash bucket
        # is still deterministic but independent of that rotation.
        bucket = int(meta["fingerprint_sha256"][:16], 16) % eval_mod
        if bucket == 0:
            eval_rows.append(row)
        else:
            train_rows.append(row)

    # Keep the earlier guarantee that a non-empty dataset always yields at
    # least one eval record, even when no fingerprint fell into bucket 0.
    if train_rows and not eval_rows:
        eval_rows.append(train_rows.pop(0))

    train_count = _write_jsonl(train_path, train_rows)
    eval_count = _write_jsonl(eval_path, eval_rows)

    report: dict[str, Any] = {
        "schema_version": SCHEMA_VERSION,
        "created_at": datetime.now(timezone.utc).isoformat(),
        "summary": {
            "records_written": train_count + eval_count,
            "train_records": train_count,
            "eval_records": eval_count,
            "domain_counts": dict(sorted(domain_counts.items())),
            "language_counts": dict(sorted(lang_counts.items())),
            "loss_weight": 1.25,
        },
        "outputs": {
            "train_jsonl": str(train_path),
            "eval_jsonl": str(eval_path),
            # Digests are taken from the files on disk, after writing.
            "train_sha256": _file_sha(train_path),
            "eval_sha256": _file_sha(eval_path),
        },
        "claim_gate": {
            "logic_agent_code_ready": target_records >= 100,
            "world_best_claim_allowed": False,
            "reason": "This is a balancing SFT curriculum for logic/agent/code behavior, not a capability claim.",
        },
    }
    report["manifest_path"] = str(manifest_path)
    manifest_path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True), encoding="utf-8")
    return report
Xet Storage Details
- Size: 8.87 kB
- Xet hash: c8eed85efdf5545fef7356f967e9eb60e20e4c27ee1e3c198c53542f728d7b66
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.