bbkdevops's picture
download
raw
12.1 kB
from __future__ import annotations

import hashlib
import json
import random
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
# Languages for which a code snippet template exists.
LANGUAGES: list[str] = [
    "python", "typescript", "rust", "go", "cpp",
    "sql", "powershell", "lua", "cuda", "bash",
]

# (task_id, human-readable task description) pairs.
TASKS: list[tuple[str, str]] = [
    (
        "schema_validator",
        "validate a nested JSON object against required fields and types",
    ),
    (
        "streaming_dedupe",
        "deduplicate a streaming record source without loading everything into memory",
    ),
    (
        "leak_detector",
        "detect latency and memory drift from periodic telemetry samples",
    ),
    (
        "retry_backoff",
        "wrap an unreliable operation with bounded exponential backoff",
    ),
    (
        "chunk_index",
        "build a content-addressed chunk index with SHA-256 keys",
    ),
    (
        "safe_parser",
        "parse loosely formatted input without executing embedded code",
    ),
    (
        "test_split",
        "create a train/eval split that prevents semantic leakage",
    ),
    (
        "loss_weights",
        "assign domain loss weights with caps for dominant sources",
    ),
    (
        "tool_call",
        "emit a strict tool-call JSON object and verify the arguments",
    ),
    (
        "kv_ledger",
        "store exact chunk metadata while keeping model KV storage bounded",
    ),
    (
        "config_merge",
        "merge layered configs with deterministic conflict resolution",
    ),
    (
        "benchmark_report",
        "summarize benchmark rows without converting local evidence into official claims",
    ),
]

# Prompt framings applied to each language/task pair.
FRAMES: list[str] = [
    "implement",
    "debug",
    "verify",
    "refactor",
    "explain_tradeoffs",
]
@dataclass(frozen=True)
class NativeCodeForgePolicy:
    """Immutable settings for one dataset-generation run."""

    # How many records to generate in total (train + eval).
    target_records: int = 20000
    # Fraction of the generated records intended for the eval split.
    eval_fraction: float = 0.02
    # Integer seed carried with the policy for reproducibility.
    seed: int = 20_260_527
def _sha(text: str) -> str:
    """Return the hex SHA-256 digest of ``text`` encoded as UTF-8."""
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()
# Language used when the requested language has no dedicated template.
_FALLBACK_LANGUAGE = "bash"

# One de-duplication snippet per supported language, keyed by language name.
_CODE_TEMPLATES: dict[str, str] = {
    # Python is indentation-sensitive, so the nesting must be spelled out
    # with real indentation or the emitted snippet does not parse.
    "python": (
        "def solve(records):\n"
        "    seen = set()\n"
        "    out = []\n"
        "    for row in records:\n"
        "        key = row.get('id') or row.get('sha256')\n"
        "        if key in seen:\n"
        "            continue\n"
        "        seen.add(key)\n"
        "        out.append(row)\n"
        "    return out\n"
    ),
    "typescript": (
        "export function solve<T extends { id?: string; sha256?: string }>(rows: T[]): T[] {\n"
        " const seen = new Set<string>();\n"
        " return rows.filter((row) => {\n"
        " const key = row.id ?? row.sha256 ?? JSON.stringify(row);\n"
        " if (seen.has(key)) return false;\n"
        " seen.add(key);\n"
        " return true;\n"
        " });\n"
        "}\n"
    ),
    "rust": (
        "use std::collections::HashSet;\n\n"
        "pub fn solve(keys: &[String]) -> Vec<String> {\n"
        " let mut seen = HashSet::new();\n"
        " let mut out = Vec::new();\n"
        " for key in keys {\n"
        " if seen.insert(key.clone()) {\n"
        " out.push(key.clone());\n"
        " }\n"
        " }\n"
        " out\n"
        "}\n"
    ),
    "go": (
        "func Solve(keys []string) []string {\n"
        "\tseen := map[string]bool{}\n"
        "\tout := []string{}\n"
        "\tfor _, key := range keys {\n"
        "\t\tif seen[key] { continue }\n"
        "\t\tseen[key] = true\n"
        "\t\tout = append(out, key)\n"
        "\t}\n"
        "\treturn out\n"
        "}\n"
    ),
    "cpp": (
        "#include <string>\n#include <unordered_set>\n#include <vector>\n\n"
        "std::vector<std::string> solve(const std::vector<std::string>& keys) {\n"
        " std::unordered_set<std::string> seen;\n"
        " std::vector<std::string> out;\n"
        " for (const auto& key : keys) {\n"
        " if (seen.insert(key).second) out.push_back(key);\n"
        " }\n"
        " return out;\n"
        "}\n"
    ),
    "sql": (
        "WITH ranked AS (\n"
        " SELECT *, ROW_NUMBER() OVER (PARTITION BY semantic_sha256 ORDER BY quality_score DESC) AS rn\n"
        " FROM training_rows\n"
        ")\n"
        "SELECT * FROM ranked WHERE rn = 1;\n"
    ),
    "powershell": (
        "$seen = @{}\n"
        "$output = foreach ($row in $Rows) {\n"
        " $key = $row.semantic_sha256\n"
        " if (-not $seen.ContainsKey($key)) { $seen[$key] = $true; $row }\n"
        "}\n"
    ),
    "lua": (
        "function solve(rows)\n"
        " local seen, out = {}, {}\n"
        " for _, row in ipairs(rows) do\n"
        " local key = row.id or row.sha256\n"
        " if not seen[key] then\n"
        " seen[key] = true\n"
        " table.insert(out, row)\n"
        " end\n"
        " end\n"
        " return out\n"
        "end\n"
    ),
    "cuda": (
        "__global__ void mark_unique(const unsigned long long* keys, int* keep, int n) {\n"
        " int i = blockIdx.x * blockDim.x + threadIdx.x;\n"
        " if (i >= n) return;\n"
        " keep[i] = (i == 0 || keys[i] != keys[i - 1]) ? 1 : 0;\n"
        "}\n"
    ),
    "bash": (
        "declare -A seen\n"
        "while IFS= read -r key; do\n"
        " if [[ -z \"${seen[$key]}\" ]]; then\n"
        " seen[$key]=1\n"
        " printf '%s\\n' \"$key\"\n"
        " fi\n"
        "done\n"
    ),
}


def _code_for(language: str, task_id: str, variant: int) -> str:
    """Return the de-duplication code snippet for ``language``.

    Args:
        language: Language name; any name without a dedicated template
            yields the bash snippet.
        task_id: Accepted for signature compatibility; it does not
            currently influence the returned snippet.
        variant: Accepted for signature compatibility; it does not
            currently influence the returned snippet.

    Returns:
        The snippet source text, terminated by a newline.
    """
    return _CODE_TEMPLATES.get(language, _CODE_TEMPLATES[_FALLBACK_LANGUAGE])
def _answer(language: str, task_id: str, task: str, frame: str, variant: int) -> str:
    """Compose the assistant reply text for one training record.

    The ``explain_tradeoffs`` frame produces a prose-only bullet list; every
    other frame produces requirements, a fenced code snippet for ``language``
    and a verification checklist.
    """
    if frame == "explain_tradeoffs":
        bullets = [
            f"แนวทางสำหรับ `{task_id}` ในภาษา `{language}` ต้องแยกเป้าหมาย ความเสี่ยง และหลักฐานตรวจผลให้ชัด.\n\n",
            f"- Core idea: {task} โดยไม่เพิ่ม state ที่ไม่จำเป็นและไม่ปนข้อมูล eval.\n",
            "- Tradeoff: ความเร็วต้องไม่แลกกับการเสีย determinism, ส่วน memory ต้องวัดด้วย sample drift ไม่ใช่เดา.\n",
            "- Verification: ตรวจ duplicate, schema, edge case ว่าง/null, และบันทึก hash ของ input/output.\n",
            "- Failure mode: ถ้า source license ไม่ชัดหรือพบ secret ให้ quarantine ทันที.\n",
        ]
        return "".join(bullets)

    # Only the code-bearing frames need the snippet.
    snippet = _code_for(language, task_id, variant)
    sections = [
        f"งาน `{task_id}` ภาษา `{language}` เฟรม `{frame}` ต้องทำแบบตรวจซ้ำได้ ไม่ใช่จำ template.\n\n",
        "ข้อกำหนด:\n",
        f"1. เป้าหมาย: {task}.\n",
        "2. ต้อง deterministic, bounded memory, และบันทึกหลักฐาน hash ได้.\n",
        "3. ห้ามใช้ข้อมูล benchmark/eval เป็นคำตอบฝึก.\n\n",
        # The snippet already ends with a newline, so the closing fence
        # lands on its own line.
        f"```{language}\n{snippet}```\n\n",
        "Verification:\n",
        "- สร้าง input ที่มี key ซ้ำ, key หาย, และลำดับต่างกัน.\n",
        "- ตรวจว่า output stable เมื่อรันซ้ำ.\n",
        "- วัด memory/latency drift หากใช้กับ stream ยาว.\n",
        "- บันทึก manifest: source, semantic_sha256, loss_weight, และผล test.\n",
    ]
    return "".join(sections)
def _record(language: str, task_id: str, task: str, frame: str, variant: int) -> dict[str, Any]:
    """Assemble one chat-style training record with its metadata.

    Returns a dict with three keys: ``messages`` (system, user and assistant
    turns), ``source`` and ``metadata`` (language, task, frame, variant,
    fingerprint, loss weight and quality tags).
    """
    is_systems_language = language in {"rust", "cpp", "cuda"}
    if is_systems_language:
        domain = "coding_cpp_rust"
    elif language == "python":
        domain = "coding_python"
    else:
        domain = "data_tooling"

    user_prompt = (
        f"สร้างคำตอบโค้ดระดับ production สำหรับ `{task_id}` ด้วยภาษา `{language}` "
        f"ในโหมด `{frame}` โดยต้องมี verification และไม่ใช้ข้อมูล benchmark เป็นคำตอบจำ"
    )
    reply = _answer(language, task_id, task, frame, variant)
    # The fingerprint hashes the identifying fields plus a fixed suffix.
    digest = _sha(f"{language}|{task_id}|{frame}|{variant}|tinymind-native-code")

    system_turn = {
        "role": "system",
        "content": "You are TinyMind Native Code Forge. Produce compact, verifiable, defensive, high-density code training examples.",
    }
    user_turn = {"role": "user", "content": user_prompt}
    assistant_turn = {"role": "assistant", "content": reply}

    metadata = {
        "schema_version": "tinymind-native-code-forge-v1",
        "domain": domain,
        "language": language,
        "task_id": task_id,
        "frame": frame,
        "variant": variant,
        "fingerprint_sha256": digest,
        "loss_weight": 1.42 if is_systems_language else 1.35,
        "quality_tags": [
            "tinymind_created",
            "code_reasoning",
            "verification",
            "bounded_memory",
            "anti_contamination",
            "response_only_loss_target",
        ],
    }
    return {
        "messages": [system_turn, user_turn, assistant_turn],
        "source": "tinymind_native_code_forge",
        "metadata": metadata,
    }
def _write_jsonl(path: Path, rows: list[dict[str, Any]]) -> None:
    """Write ``rows`` to ``path`` as UTF-8 JSON Lines, one object per line."""
    # Terminating each row individually means an empty split yields an empty
    # file rather than a single blank line.
    with path.open("w", encoding="utf-8") as handle:
        handle.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in rows)


def _pick_eval_indices(total: int, eval_fraction: float, seed: int) -> set[int]:
    """Pick the record positions that belong to the eval split."""
    if total <= 0:
        return set()
    # At least one eval record whenever any record exists, never more than
    # the number of records available.
    eval_count = min(total, max(1, int(total * eval_fraction)))
    # A fixed stride can line up with the language cycle: 20,000 records at
    # 2% gives a stride of 50, a multiple of the 10 languages, so every eval
    # row would be the first language. A seeded sample avoids that aliasing
    # while staying repeatable for a given seed.
    return set(random.Random(seed).sample(range(total), eval_count))


def build_tinymind_native_code_forge(
    out_dir: str | Path,
    *,
    policy: NativeCodeForgePolicy | None = None,
) -> dict[str, Any]:
    """Generate the templated code dataset and write it under ``out_dir``.

    Three files are written: a train JSONL, an eval JSONL and a JSON
    manifest. ``out_dir`` is created if it does not exist.

    Args:
        out_dir: Destination directory for the generated files.
        policy: Generation settings; defaults to ``NativeCodeForgePolicy()``.

    Returns:
        The manifest dict that is also written to disk, including the output
        paths, record counts and the claim gate.
    """
    if policy is None:
        policy = NativeCodeForgePolicy()

    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    train_path = out / "tinymind_native_code_train.jsonl"
    eval_path = out / "tinymind_native_code_eval.jsonl"

    # Languages cycle fastest, then tasks, then frames; the running index is
    # also used as the variant so every record gets a distinct fingerprint.
    combos_per_frame = len(LANGUAGES) * len(TASKS)
    records: list[dict[str, Any]] = []
    while len(records) < policy.target_records:
        idx = len(records)
        language = LANGUAGES[idx % len(LANGUAGES)]
        task_id, task = TASKS[(idx // len(LANGUAGES)) % len(TASKS)]
        frame = FRAMES[(idx // combos_per_frame) % len(FRAMES)]
        records.append(_record(language, task_id, task, frame, idx))

    eval_indices = _pick_eval_indices(len(records), policy.eval_fraction, policy.seed)
    # Split in generation order so both files keep a stable row order.
    train_rows = [row for i, row in enumerate(records) if i not in eval_indices]
    eval_rows = [row for i, row in enumerate(records) if i in eval_indices]

    _write_jsonl(train_path, train_rows)
    _write_jsonl(eval_path, eval_rows)

    manifest_path = out / "tinymind_native_code_forge_manifest.json"
    manifest: dict[str, Any] = {
        "schema": "tinymind.native_code_forge.v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "outputs": {"train_jsonl": str(train_path), "eval_jsonl": str(eval_path)},
        "summary": {
            "target_records": policy.target_records,
            "train_records": len(train_rows),
            "eval_records": len(eval_rows),
            # Recorded so the split can be reproduced from the manifest.
            "eval_fraction": policy.eval_fraction,
            "seed": policy.seed,
            "languages": LANGUAGES,
            "tasks": [task_id for task_id, _task in TASKS],
            "frames": FRAMES,
        },
        "claim_gate": {
            "tinymind_created_code_data_ready": True,
            "external_code_copied": False,
            "world_best_code_data_claim_allowed": False,
            "reason": "Dataset is generated by TinyMind templates for targeted training. It is not proof of world-best code skill without eval.",
        },
        "manifest_path": str(manifest_path),
    }
    manifest_path.write_text(
        json.dumps(manifest, ensure_ascii=False, indent=2, sort_keys=True) + "\n",
        encoding="utf-8",
    )
    return manifest

Xet Storage Details

Size:
12.1 kB
·
Xet hash:
c0d25ddc1b1eb01c6bf3eb5dccf734647f1fdd4f13bd70598881ac85584bc8eb

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.