Buckets:
bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /data /tinymind_native_code_forge.py
| from __future__ import annotations | |
| from dataclasses import dataclass | |
| from datetime import datetime, timezone | |
| import hashlib | |
| import json | |
| from pathlib import Path | |
| from typing import Any | |
# Target languages. Record generation cycles through this list fastest, so the
# order determines which language each generated index receives.
LANGUAGES = [
    "python", "typescript", "rust", "go", "cpp",
    "sql", "powershell", "lua", "cuda", "bash",
]

# (task_id, human-readable task description) pairs. The description is
# interpolated into the generated answer text.
TASKS = [
    (
        "schema_validator",
        "validate a nested JSON object against required fields and types",
    ),
    (
        "streaming_dedupe",
        "deduplicate a streaming record source without loading everything into memory",
    ),
    (
        "leak_detector",
        "detect latency and memory drift from periodic telemetry samples",
    ),
    (
        "retry_backoff",
        "wrap an unreliable operation with bounded exponential backoff",
    ),
    (
        "chunk_index",
        "build a content-addressed chunk index with SHA-256 keys",
    ),
    (
        "safe_parser",
        "parse loosely formatted input without executing embedded code",
    ),
    (
        "test_split",
        "create a train/eval split that prevents semantic leakage",
    ),
    (
        "loss_weights",
        "assign domain loss weights with caps for dominant sources",
    ),
    (
        "tool_call",
        "emit a strict tool-call JSON object and verify the arguments",
    ),
    (
        "kv_ledger",
        "store exact chunk metadata while keeping model KV storage bounded",
    ),
    (
        "config_merge",
        "merge layered configs with deterministic conflict resolution",
    ),
    (
        "benchmark_report",
        "summarize benchmark rows without converting local evidence into official claims",
    ),
]

# Response framings; "explain_tradeoffs" yields a prose-only answer, every
# other frame yields an answer that embeds a code snippet.
FRAMES = [
    "implement",
    "debug",
    "verify",
    "refactor",
    "explain_tradeoffs",
]
@dataclass
class NativeCodeForgePolicy:
    """Tunable knobs for one dataset generation run.

    Declared as a dataclass so callers can override individual fields through
    keyword arguments, e.g. ``NativeCodeForgePolicy(target_records=500)``,
    while ``NativeCodeForgePolicy()`` keeps the defaults below.
    """

    # Total number of records to generate.
    target_records: int = 20_000
    # Share of the generated records reserved for the eval file.
    eval_fraction: float = 0.02
    # Integer seed intended for deterministic choices made by policy consumers.
    seed: int = 20260527
| def _sha(text: str) -> str: | |
| return hashlib.sha256(text.encode("utf-8")).hexdigest() | |
| def _code_for(language: str, task_id: str, variant: int) -> str: | |
| if language == "python": | |
| return ( | |
| "def solve(records):\n" | |
| " seen = set()\n" | |
| " out = []\n" | |
| " for row in records:\n" | |
| " key = row.get('id') or row.get('sha256')\n" | |
| " if key in seen:\n" | |
| " continue\n" | |
| " seen.add(key)\n" | |
| " out.append(row)\n" | |
| " return out\n" | |
| ) | |
| if language == "typescript": | |
| return ( | |
| "export function solve<T extends { id?: string; sha256?: string }>(rows: T[]): T[] {\n" | |
| " const seen = new Set<string>();\n" | |
| " return rows.filter((row) => {\n" | |
| " const key = row.id ?? row.sha256 ?? JSON.stringify(row);\n" | |
| " if (seen.has(key)) return false;\n" | |
| " seen.add(key);\n" | |
| " return true;\n" | |
| " });\n" | |
| "}\n" | |
| ) | |
| if language == "rust": | |
| return ( | |
| "use std::collections::HashSet;\n\n" | |
| "pub fn solve(keys: &[String]) -> Vec<String> {\n" | |
| " let mut seen = HashSet::new();\n" | |
| " let mut out = Vec::new();\n" | |
| " for key in keys {\n" | |
| " if seen.insert(key.clone()) {\n" | |
| " out.push(key.clone());\n" | |
| " }\n" | |
| " }\n" | |
| " out\n" | |
| "}\n" | |
| ) | |
| if language == "go": | |
| return ( | |
| "func Solve(keys []string) []string {\n" | |
| "\tseen := map[string]bool{}\n" | |
| "\tout := []string{}\n" | |
| "\tfor _, key := range keys {\n" | |
| "\t\tif seen[key] { continue }\n" | |
| "\t\tseen[key] = true\n" | |
| "\t\tout = append(out, key)\n" | |
| "\t}\n" | |
| "\treturn out\n" | |
| "}\n" | |
| ) | |
| if language == "cpp": | |
| return ( | |
| "#include <string>\n#include <unordered_set>\n#include <vector>\n\n" | |
| "std::vector<std::string> solve(const std::vector<std::string>& keys) {\n" | |
| " std::unordered_set<std::string> seen;\n" | |
| " std::vector<std::string> out;\n" | |
| " for (const auto& key : keys) {\n" | |
| " if (seen.insert(key).second) out.push_back(key);\n" | |
| " }\n" | |
| " return out;\n" | |
| "}\n" | |
| ) | |
| if language == "sql": | |
| return ( | |
| "WITH ranked AS (\n" | |
| " SELECT *, ROW_NUMBER() OVER (PARTITION BY semantic_sha256 ORDER BY quality_score DESC) AS rn\n" | |
| " FROM training_rows\n" | |
| ")\n" | |
| "SELECT * FROM ranked WHERE rn = 1;\n" | |
| ) | |
| if language == "powershell": | |
| return ( | |
| "$seen = @{}\n" | |
| "$output = foreach ($row in $Rows) {\n" | |
| " $key = $row.semantic_sha256\n" | |
| " if (-not $seen.ContainsKey($key)) { $seen[$key] = $true; $row }\n" | |
| "}\n" | |
| ) | |
| if language == "lua": | |
| return ( | |
| "function solve(rows)\n" | |
| " local seen, out = {}, {}\n" | |
| " for _, row in ipairs(rows) do\n" | |
| " local key = row.id or row.sha256\n" | |
| " if not seen[key] then\n" | |
| " seen[key] = true\n" | |
| " table.insert(out, row)\n" | |
| " end\n" | |
| " end\n" | |
| " return out\n" | |
| "end\n" | |
| ) | |
| if language == "cuda": | |
| return ( | |
| "__global__ void mark_unique(const unsigned long long* keys, int* keep, int n) {\n" | |
| " int i = blockIdx.x * blockDim.x + threadIdx.x;\n" | |
| " if (i >= n) return;\n" | |
| " keep[i] = (i == 0 || keys[i] != keys[i - 1]) ? 1 : 0;\n" | |
| "}\n" | |
| ) | |
| return ( | |
| "declare -A seen\n" | |
| "while IFS= read -r key; do\n" | |
| " if [[ -z \"${seen[$key]}\" ]]; then\n" | |
| " seen[$key]=1\n" | |
| " printf '%s\\n' \"$key\"\n" | |
| " fi\n" | |
| "done\n" | |
| ) | |
def _answer(language: str, task_id: str, task: str, frame: str, variant: int) -> str:
    """Compose the assistant answer text for one generated record.

    The "explain_tradeoffs" frame produces a prose-only bullet list; every
    other frame produces requirements, a fenced code snippet obtained from
    ``_code_for``, and a verification checklist.

    Args:
        language: Language name, used in the text and as the code-fence tag.
        task_id: Short task identifier shown in the heading line.
        task: Human-readable task description interpolated into the text.
        frame: Response framing that selects the answer layout.
        variant: Forwarded to ``_code_for``.

    Returns:
        The complete answer string.
    """
    # Looked up before branching, exactly as the snippet is always requested.
    snippet = _code_for(language, task_id, variant)

    if frame != "explain_tradeoffs":
        code_answer = [
            f"งาน `{task_id}` ภาษา `{language}` เฟรม `{frame}` ต้องทำแบบตรวจซ้ำได้ ไม่ใช่จำ template.\n\n",
            "ข้อกำหนด:\n",
            f"1. เป้าหมาย: {task}.\n",
            "2. ต้อง deterministic, bounded memory, และบันทึกหลักฐาน hash ได้.\n",
            "3. ห้ามใช้ข้อมูล benchmark/eval เป็นคำตอบฝึก.\n\n",
            f"```{language}\n{snippet}```\n\n",
            "Verification:\n",
            "- สร้าง input ที่มี key ซ้ำ, key หาย, และลำดับต่างกัน.\n",
            "- ตรวจว่า output stable เมื่อรันซ้ำ.\n",
            "- วัด memory/latency drift หากใช้กับ stream ยาว.\n",
            "- บันทึก manifest: source, semantic_sha256, loss_weight, และผล test.\n",
        ]
        return "".join(code_answer)

    tradeoff_answer = [
        f"แนวทางสำหรับ `{task_id}` ในภาษา `{language}` ต้องแยกเป้าหมาย ความเสี่ยง และหลักฐานตรวจผลให้ชัด.\n\n",
        f"- Core idea: {task} โดยไม่เพิ่ม state ที่ไม่จำเป็นและไม่ปนข้อมูล eval.\n",
        "- Tradeoff: ความเร็วต้องไม่แลกกับการเสีย determinism, ส่วน memory ต้องวัดด้วย sample drift ไม่ใช่เดา.\n",
        "- Verification: ตรวจ duplicate, schema, edge case ว่าง/null, และบันทึก hash ของ input/output.\n",
        "- Failure mode: ถ้า source license ไม่ชัดหรือพบ secret ให้ quarantine ทันที.\n",
    ]
    return "".join(tradeoff_answer)
def _record(language: str, task_id: str, task: str, frame: str, variant: int) -> dict[str, Any]:
    """Assemble one chat-format training record with its metadata.

    Args:
        language: Language name for the example.
        task_id: Short task identifier.
        task: Human-readable task description, forwarded to ``_answer``.
        frame: Response framing.
        variant: Integer distinguishing otherwise identical combinations; it
            is part of the fingerprint input.

    Returns:
        A dict with ``messages`` (system, user, assistant), ``source`` and
        ``metadata`` keys.
    """
    user_prompt = (
        f"สร้างคำตอบโค้ดระดับ production สำหรับ `{task_id}` ด้วยภาษา `{language}` "
        f"ในโหมด `{frame}` โดยต้องมี verification และไม่ใช้ข้อมูล benchmark เป็นคำตอบจำ"
    )
    assistant_text = _answer(language, task_id, task, frame, variant)
    digest = _sha(f"{language}|{task_id}|{frame}|{variant}|tinymind-native-code")

    # Rust, C++ and CUDA share one domain label and the higher loss weight.
    is_systems_language = language in {"rust", "cpp", "cuda"}
    if is_systems_language:
        domain = "coding_cpp_rust"
        weight = 1.42
    else:
        domain = "coding_python" if language == "python" else "data_tooling"
        weight = 1.35

    system_message = {
        "role": "system",
        "content": "You are TinyMind Native Code Forge. Produce compact, verifiable, defensive, high-density code training examples.",
    }
    metadata = {
        "schema_version": "tinymind-native-code-forge-v1",
        "domain": domain,
        "language": language,
        "task_id": task_id,
        "frame": frame,
        "variant": variant,
        "fingerprint_sha256": digest,
        "loss_weight": weight,
        "quality_tags": [
            "tinymind_created",
            "code_reasoning",
            "verification",
            "bounded_memory",
            "anti_contamination",
            "response_only_loss_target",
        ],
    }
    return {
        "messages": [
            system_message,
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": assistant_text},
        ],
        "source": "tinymind_native_code_forge",
        "metadata": metadata,
    }
def _group_key(row: dict[str, Any]) -> tuple[str, str, str]:
    """Return the (language, task_id, frame) combination a record belongs to."""
    meta = row["metadata"]
    return (meta["language"], meta["task_id"], meta["frame"])


def _split_by_group(
    records: list[dict[str, Any]],
    eval_count: int,
    seed: int,
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    """Partition records into (train, eval) by holding out whole groups."""
    group_sizes: dict[tuple[str, str, str], int] = {}
    for row in records:
        key = _group_key(row)
        group_sizes[key] = group_sizes.get(key, 0) + 1

    # A seeded hash gives a reproducible pseudo-random group order that does
    # not depend on generation order or on the interpreter's RNG details.
    ranked = sorted(group_sizes, key=lambda key: _sha(f"{seed}|{'|'.join(key)}"))

    held_out: set[tuple[str, str, str]] = set()
    held_rows = 0
    for key in ranked:
        if held_rows >= eval_count:
            break
        held_out.add(key)
        held_rows += group_sizes[key]

    train_rows = [row for row in records if _group_key(row) not in held_out]
    eval_rows = [row for row in records if _group_key(row) in held_out]
    return train_rows, eval_rows


def _write_jsonl(path: Path, rows: list[dict[str, Any]]) -> None:
    """Write one JSON document per line; an empty row list yields an empty file."""
    with path.open("w", encoding="utf-8") as handle:
        handle.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in rows)


def build_tinymind_native_code_forge(
    out_dir: str | Path,
    *,
    policy: NativeCodeForgePolicy | None = None,
) -> dict[str, Any]:
    """Generate the train/eval JSONL files and a manifest under *out_dir*.

    Records are produced by cycling through ``LANGUAGES`` (fastest), then
    ``TASKS``, then ``FRAMES`` until ``policy.target_records`` records exist.

    The eval split holds out complete (language, task_id, frame) groups chosen
    by a hash seeded with ``policy.seed``. Prompt and answer text depend only
    on that combination, so splitting row by row would put identical messages
    in both files. A fixed stride over the generation order would also alias
    with the language cycle and yield a single-language eval set. Groups are
    added until at least ``max(1, int(n * eval_fraction))`` rows are held out,
    so the eval row count can exceed that figure by up to one group.

    Args:
        out_dir: Directory for the outputs; created if missing.
        policy: Generation settings; defaults to ``NativeCodeForgePolicy()``.

    Returns:
        The manifest dict that is also written to
        ``tinymind_native_code_forge_manifest.json``.
    """
    policy = policy or NativeCodeForgePolicy()
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    train_path = out / "tinymind_native_code_train.jsonl"
    eval_path = out / "tinymind_native_code_eval.jsonl"

    language_count = len(LANGUAGES)
    combos_per_frame = language_count * len(TASKS)
    records: list[dict[str, Any]] = []
    for idx in range(policy.target_records):
        language = LANGUAGES[idx % language_count]
        task_id, task = TASKS[(idx // language_count) % len(TASKS)]
        frame = FRAMES[(idx // combos_per_frame) % len(FRAMES)]
        # The running index doubles as the variant so fingerprints stay unique.
        records.append(_record(language, task_id, task, frame, idx))

    eval_count = max(1, int(len(records) * policy.eval_fraction))
    train_rows, eval_rows = _split_by_group(records, eval_count, policy.seed)

    _write_jsonl(train_path, train_rows)
    _write_jsonl(eval_path, eval_rows)

    manifest = {
        "schema": "tinymind.native_code_forge.v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "outputs": {"train_jsonl": str(train_path), "eval_jsonl": str(eval_path)},
        "summary": {
            "target_records": policy.target_records,
            "train_records": len(train_rows),
            "eval_records": len(eval_rows),
            "languages": LANGUAGES,
            "tasks": [task_id for task_id, _task in TASKS],
            "frames": FRAMES,
        },
        "claim_gate": {
            "tinymind_created_code_data_ready": True,
            "external_code_copied": False,
            "world_best_code_data_claim_allowed": False,
            "reason": "Dataset is generated by TinyMind templates for targeted training. It is not proof of world-best code skill without eval.",
        },
    }
    manifest_path = out / "tinymind_native_code_forge_manifest.json"
    manifest["manifest_path"] = str(manifest_path)
    manifest_path.write_text(
        json.dumps(manifest, ensure_ascii=False, indent=2, sort_keys=True) + "\n",
        encoding="utf-8",
    )
    return manifest
Xet Storage Details
- Size: 12.1 kB
- Xet hash: c0d25ddc1b1eb01c6bf3eb5dccf734647f1fdd4f13bd70598881ac85584bc8eb
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.