Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

Files

xet

bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /data /tinymind_native_code_forge.py

bbkdevops

29 days ago

download

raw

12.1 kB

	from __future__ import annotations

	from dataclasses import dataclass
	from datetime import datetime, timezone
	import hashlib
	import json
	from pathlib import Path
	from typing import Any


	# Languages for which a code sample is produced.
	LANGUAGES: list[str] = [
	    "python",
	    "typescript",
	    "rust",
	    "go",
	    "cpp",
	    "sql",
	    "powershell",
	    "lua",
	    "cuda",
	    "bash",
	]

	# (task_id, human-readable task description) pairs.
	TASKS: list[tuple[str, str]] = [
	    ("schema_validator", "validate a nested JSON object against required fields and types"),
	    ("streaming_dedupe", "deduplicate a streaming record source without loading everything into memory"),
	    ("leak_detector", "detect latency and memory drift from periodic telemetry samples"),
	    ("retry_backoff", "wrap an unreliable operation with bounded exponential backoff"),
	    ("chunk_index", "build a content-addressed chunk index with SHA-256 keys"),
	    ("safe_parser", "parse loosely formatted input without executing embedded code"),
	    ("test_split", "create a train/eval split that prevents semantic leakage"),
	    ("loss_weights", "assign domain loss weights with caps for dominant sources"),
	    ("tool_call", "emit a strict tool-call JSON object and verify the arguments"),
	    ("kv_ledger", "store exact chunk metadata while keeping model KV storage bounded"),
	    ("config_merge", "merge layered configs with deterministic conflict resolution"),
	    ("benchmark_report", "summarize benchmark rows without converting local evidence into official claims"),
	]

	# Prompt/answer modes; "explain_tradeoffs" yields a prose-only answer.
	FRAMES: list[str] = ["implement", "debug", "verify", "refactor", "explain_tradeoffs"]


	@dataclass(frozen=True)
	class NativeCodeForgePolicy:
	    """Immutable settings for one forge run: record count, eval share, seed."""

	    # Number of records to generate before the train/eval split.
	    target_records: int = 20_000
	    # Fraction of the generated records requested for the eval split.
	    eval_fraction: float = 0.02
	    # Seed value; no code visible in this module reads it -- TODO confirm external use.
	    seed: int = 20260527


	def _sha(text: str) -> str:
	return hashlib.sha256(text.encode("utf-8")).hexdigest()


	def _code_for(language: str, task_id: str, variant: int) -> str:
	if language == "python":
	return (
	"def solve(records):\n"
	" seen = set()\n"
	" out = []\n"
	" for row in records:\n"
	" key = row.get('id') or row.get('sha256')\n"
	" if key in seen:\n"
	" continue\n"
	" seen.add(key)\n"
	" out.append(row)\n"
	" return out\n"
	)
	if language == "typescript":
	return (
	"export function solve<T extends { id?: string; sha256?: string }>(rows: T[]): T[] {\n"
	" const seen = new Set<string>();\n"
	" return rows.filter((row) => {\n"
	" const key = row.id ?? row.sha256 ?? JSON.stringify(row);\n"
	" if (seen.has(key)) return false;\n"
	" seen.add(key);\n"
	" return true;\n"
	" });\n"
	"}\n"
	)
	if language == "rust":
	return (
	"use std::collections::HashSet;\n\n"
	"pub fn solve(keys: &[String]) -> Vec<String> {\n"
	" let mut seen = HashSet::new();\n"
	" let mut out = Vec::new();\n"
	" for key in keys {\n"
	" if seen.insert(key.clone()) {\n"
	" out.push(key.clone());\n"
	" }\n"
	" }\n"
	" out\n"
	"}\n"
	)
	if language == "go":
	return (
	"func Solve(keys []string) []string {\n"
	"\tseen := map[string]bool{}\n"
	"\tout := []string{}\n"
	"\tfor _, key := range keys {\n"
	"\t\tif seen[key] { continue }\n"
	"\t\tseen[key] = true\n"
	"\t\tout = append(out, key)\n"
	"\t}\n"
	"\treturn out\n"
	"}\n"
	)
	if language == "cpp":
	return (
	"#include <string>\n#include <unordered_set>\n#include <vector>\n\n"
	"std::vector<std::string> solve(const std::vector<std::string>& keys) {\n"
	" std::unordered_set<std::string> seen;\n"
	" std::vector<std::string> out;\n"
	" for (const auto& key : keys) {\n"
	" if (seen.insert(key).second) out.push_back(key);\n"
	" }\n"
	" return out;\n"
	"}\n"
	)
	if language == "sql":
	return (
	"WITH ranked AS (\n"
	" SELECT *, ROW_NUMBER() OVER (PARTITION BY semantic_sha256 ORDER BY quality_score DESC) AS rn\n"
	" FROM training_rows\n"
	")\n"
	"SELECT * FROM ranked WHERE rn = 1;\n"
	)
	if language == "powershell":
	return (
	"$seen = @{}\n"
	"$output = foreach ($row in $Rows) {\n"
	" $key = $row.semantic_sha256\n"
	" if (-not $seen.ContainsKey($key)) { $seen[$key] = $true; $row }\n"
	"}\n"
	)
	if language == "lua":
	return (
	"function solve(rows)\n"
	" local seen, out = {}, {}\n"
	" for _, row in ipairs(rows) do\n"
	" local key = row.id or row.sha256\n"
	" if not seen[key] then\n"
	" seen[key] = true\n"
	" table.insert(out, row)\n"
	" end\n"
	" end\n"
	" return out\n"
	"end\n"
	)
	if language == "cuda":
	return (
	"__global__ void mark_unique(const unsigned long long* keys, int* keep, int n) {\n"
	" int i = blockIdx.x * blockDim.x + threadIdx.x;\n"
	" if (i >= n) return;\n"
	" keep[i] = (i == 0 \|\| keys[i] != keys[i - 1]) ? 1 : 0;\n"
	"}\n"
	)
	return (
	"declare -A seen\n"
	"while IFS= read -r key; do\n"
	" if [[ -z \"${seen[$key]}\" ]]; then\n"
	" seen[$key]=1\n"
	" printf '%s\\n' \"$key\"\n"
	" fi\n"
	"done\n"
	)


	def _answer(language: str, task_id: str, task: str, frame: str, variant: int) -> str:
	    """Compose the assistant answer text for one training record.

	    For the ``explain_tradeoffs`` frame the answer is a bullet list without
	    code. For every other frame it is a requirements list, the language's
	    code sample inside a fenced block, and a verification checklist.

	    Args:
	        language: Language name; used in the text and as the fence label.
	        task_id: Short task identifier echoed in the heading line.
	        task: Task description interpolated into the answer.
	        frame: Answer mode; selects which of the two layouts is produced.
	        variant: Forwarded to ``_code_for``.

	    Returns:
	        The assembled answer string.
	    """
	    snippet = _code_for(language, task_id, variant)

	    if frame != "explain_tradeoffs":
	        sections = [
	            f"งาน `{task_id}` ภาษา `{language}` เฟรม `{frame}` ต้องทำแบบตรวจซ้ำได้ ไม่ใช่จำ template.\n\n",
	            "ข้อกำหนด:\n",
	            f"1. เป้าหมาย: {task}.\n",
	            "2. ต้อง deterministic, bounded memory, และบันทึกหลักฐาน hash ได้.\n",
	            "3. ห้ามใช้ข้อมูล benchmark/eval เป็นคำตอบฝึก.\n\n",
	            # The sample already ends with a newline, so the closing fence
	            # lands on its own line.
	            f"```{language}\n{snippet}```\n\n",
	            "Verification:\n",
	            "- สร้าง input ที่มี key ซ้ำ, key หาย, และลำดับต่างกัน.\n",
	            "- ตรวจว่า output stable เมื่อรันซ้ำ.\n",
	            "- วัด memory/latency drift หากใช้กับ stream ยาว.\n",
	            "- บันทึก manifest: source, semantic_sha256, loss_weight, และผล test.\n",
	        ]
	        return "".join(sections)

	    bullets = [
	        f"แนวทางสำหรับ `{task_id}` ในภาษา `{language}` ต้องแยกเป้าหมาย ความเสี่ยง และหลักฐานตรวจผลให้ชัด.\n\n",
	        f"- Core idea: {task} โดยไม่เพิ่ม state ที่ไม่จำเป็นและไม่ปนข้อมูล eval.\n",
	        "- Tradeoff: ความเร็วต้องไม่แลกกับการเสีย determinism, ส่วน memory ต้องวัดด้วย sample drift ไม่ใช่เดา.\n",
	        "- Verification: ตรวจ duplicate, schema, edge case ว่าง/null, และบันทึก hash ของ input/output.\n",
	        "- Failure mode: ถ้า source license ไม่ชัดหรือพบ secret ให้ quarantine ทันที.\n",
	    ]
	    return "".join(bullets)


	def _record(language: str, task_id: str, task: str, frame: str, variant: int) -> dict[str, Any]:
	    """Build one chat-style training record with its metadata.

	    Args:
	        language: Language name for the sample and the metadata.
	        task_id: Short task identifier.
	        task: Task description passed on to the answer text.
	        frame: Answer mode (for example ``implement`` or ``explain_tradeoffs``).
	        variant: Integer stored in the metadata and mixed into the fingerprint.

	    Returns:
	        A dict with ``messages`` (system, user, assistant), ``source`` and
	        ``metadata`` keys.
	    """
	    # rust/cpp/cuda share one domain label and the higher loss weight.
	    is_systems_language = language in {"rust", "cpp", "cuda"}
	    if is_systems_language:
	        domain = "coding_cpp_rust"
	    elif language == "python":
	        domain = "coding_python"
	    else:
	        domain = "data_tooling"

	    prompt = (
	        f"สร้างคำตอบโค้ดระดับ production สำหรับ `{task_id}` ด้วยภาษา `{language}` "
	        f"ในโหมด `{frame}` โดยต้องมี verification และไม่ใช้ข้อมูล benchmark เป็นคำตอบจำ"
	    )
	    answer = _answer(language, task_id, task, frame, variant)
	    # Fields are joined with a plain "|". The previous "\|" was an invalid
	    # escape sequence and put a stray backslash into the hashed text.
	    fingerprint = _sha(f"{language}|{task_id}|{frame}|{variant}|tinymind-native-code")

	    return {
	        "messages": [
	            {
	                "role": "system",
	                "content": "You are TinyMind Native Code Forge. Produce compact, verifiable, defensive, high-density code training examples.",
	            },
	            {"role": "user", "content": prompt},
	            {"role": "assistant", "content": answer},
	        ],
	        "source": "tinymind_native_code_forge",
	        "metadata": {
	            "schema_version": "tinymind-native-code-forge-v1",
	            "domain": domain,
	            "language": language,
	            "task_id": task_id,
	            "frame": frame,
	            "variant": variant,
	            "fingerprint_sha256": fingerprint,
	            "loss_weight": 1.42 if is_systems_language else 1.35,
	            "quality_tags": [
	                "tinymind_created",
	                "code_reasoning",
	                "verification",
	                "bounded_memory",
	                "anti_contamination",
	                "response_only_loss_target",
	            ],
	        },
	    }


	def _split_train_eval(
	    records: list[dict[str, Any]],
	    eval_fraction: float,
	) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
	    """Partition ``records`` into ``(train_rows, eval_rows)`` with a fixed stride."""
	    eval_count = max(1, int(len(records) * eval_fraction))
	    eval_every = max(1, len(records) // eval_count)
	    train_rows: list[dict[str, Any]] = []
	    eval_rows: list[dict[str, Any]] = []
	    for position, row in enumerate(records):
	        # Every eval_every-th row goes to eval until the quota is filled.
	        take_for_eval = position % eval_every == 0 and len(eval_rows) < eval_count
	        (eval_rows if take_for_eval else train_rows).append(row)
	    return train_rows, eval_rows


	def _write_jsonl(path: Path, rows: list[dict[str, Any]]) -> None:
	    """Write ``rows`` to ``path`` as UTF-8 JSON Lines, one object per line."""
	    # Terminating each row individually means an empty split produces an
	    # empty file instead of a single blank (unparsable) line.
	    payload = "".join(json.dumps(row, ensure_ascii=False) + "\n" for row in rows)
	    path.write_text(payload, encoding="utf-8")


	def build_tinymind_native_code_forge(
	    out_dir: str | Path,
	    *,
	    policy: NativeCodeForgePolicy | None = None,
	) -> dict[str, Any]:
	    """Generate the templated code dataset and write it under ``out_dir``.

	    Three files are written: ``tinymind_native_code_train.jsonl``,
	    ``tinymind_native_code_eval.jsonl`` and
	    ``tinymind_native_code_forge_manifest.json``.

	    Args:
	        out_dir: Output directory; created with parents if missing.
	        policy: Generation settings; a default ``NativeCodeForgePolicy`` is
	            used when omitted.

	    Returns:
	        The manifest dict that was also written to the manifest file.
	    """
	    if policy is None:
	        policy = NativeCodeForgePolicy()
	    out = Path(out_dir)
	    out.mkdir(parents=True, exist_ok=True)
	    train_path = out / "tinymind_native_code_train.jsonl"
	    eval_path = out / "tinymind_native_code_eval.jsonl"

	    # Enumerate combinations with language varying fastest, then task, then
	    # frame; the running index doubles as the record's variant number.
	    language_count = len(LANGUAGES)
	    task_count = len(TASKS)
	    records: list[dict[str, Any]] = []
	    for idx in range(policy.target_records):
	        language = LANGUAGES[idx % language_count]
	        task_id, task = TASKS[(idx // language_count) % task_count]
	        frame = FRAMES[(idx // (language_count * task_count)) % len(FRAMES)]
	        records.append(_record(language, task_id, task, frame, idx))

	    # NOTE(review): with the default policy the eval stride is 50, a multiple
	    # of the 10 languages, so every eval row is a "python" record -- confirm
	    # whether that is intended.
	    train_rows, eval_rows = _split_train_eval(records, policy.eval_fraction)
	    _write_jsonl(train_path, train_rows)
	    _write_jsonl(eval_path, eval_rows)

	    manifest = {
	        "schema": "tinymind.native_code_forge.v1",
	        "created_at": datetime.now(timezone.utc).isoformat(),
	        "outputs": {"train_jsonl": str(train_path), "eval_jsonl": str(eval_path)},
	        "summary": {
	            "target_records": policy.target_records,
	            "train_records": len(train_rows),
	            "eval_records": len(eval_rows),
	            "languages": LANGUAGES,
	            "tasks": [task_id for task_id, _task in TASKS],
	            "frames": FRAMES,
	        },
	        "claim_gate": {
	            "tinymind_created_code_data_ready": True,
	            "external_code_copied": False,
	            "world_best_code_data_claim_allowed": False,
	            "reason": "Dataset is generated by TinyMind templates for targeted training. It is not proof of world-best code skill without eval.",
	        },
	    }
	    manifest_path = out / "tinymind_native_code_forge_manifest.json"
	    manifest["manifest_path"] = str(manifest_path)
	    manifest_path.write_text(
	        json.dumps(manifest, ensure_ascii=False, indent=2, sort_keys=True) + "\n",
	        encoding="utf-8",
	    )
	    return manifest

Xet Storage Details

Size: 12.1 kB
Xet hash: c0d25ddc1b1eb01c6bf3eb5dccf734647f1fdd4f13bd70598881ac85584bc8eb

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.