bbkdevops's picture
download
raw
19.2 kB
"""Open Pure expert curriculum forge for TinyMind.
The forge filters junk only: malformed rows, duplicates, low-quality text,
missing provenance, weak CEV structure, and unverifiable filler. It does not
drop domains by default; domain breadth is represented through explicit
coverage tags and claim/evidence/verification fields.
"""
from __future__ import annotations
from collections import Counter
from dataclasses import asdict, dataclass
import hashlib
import json
import re
from pathlib import Path
from typing import Iterable
# Format identifier; stamped on every emitted row and on the manifest.
SCHEMA_VERSION = "tinymind-open-pure-expert-curriculum-v1"
# Substrings that flag a record as junk. `_junk_score` lowercases the combined
# record text and does a plain `in` test, so entries must be lowercase and a
# marker also matches when it occurs inside a longer word.
JUNK_MARKERS = (
    "lorem ipsum",
    "todo",
    "fixme",
    "???",
    "ไม่รู้",  # Thai: "don't know"
    "ไม่แน่ใจ",  # Thai: "not sure"
    "as an ai language model",
)
# Seed material for the curriculum: domain name -> list of (question, answer)
# pairs. `ExpertCurriculumForge.build_records` cycles through each list with
# `i % len(templates)` and tags a pair as Thai when either string contains a
# character in U+0E00-U+0E7F, otherwise as English. Every string below is
# emitted verbatim into the training data, so edits here change the dataset.
DOMAIN_TEMPLATES = {
    "thai_advanced_language": [
        (
            "อธิบายการเขียนภาษาไทยเชิงวิเคราะห์ระดับสูงให้คม ชัด และตรวจทานได้",
            "แยกแก่นความหมายก่อน แล้วจัดประโยคเป็นชั้น: ประเด็นหลัก เหตุผล หลักฐาน และข้อจำกัด ใช้คำเชื่อมเพื่อบอกความสัมพันธ์เชิงตรรกะ เช่น เพราะว่า อย่างไรก็ตาม ดังนั้น และหลีกเลี่ยงคำฟุ่มเฟือยที่ไม่เพิ่มสาระ",
        ),
        (
            "สร้างเกณฑ์ตรวจคำตอบภาษาไทยระดับผู้เชี่ยวชาญ",
            "ตรวจ 5 ชั้น: ความถูกต้องของข้อเท็จจริง ความครบถ้วนของเหตุผล ความแม่นของศัพท์เฉพาะ ความต่อเนื่องทางตรรกะ และความกระชับของภาษา ทุกข้ออ้างต้องโยงกับหลักฐานหรือเงื่อนไขที่ระบุได้",
        ),
    ],
    "english_advanced_language": [
        (
            "Explain advanced English technical writing for precise reasoning",
            "Start with the governing claim, define scope, state assumptions, then separate evidence from interpretation. Prefer concrete verbs, preserve modality, and mark uncertainty with calibrated language instead of vague hedging.",
        ),
        (
            "Design an expert rubric for English answers",
            "Score semantic accuracy, causal structure, terminology control, counterexample handling, and revision quality. A strong answer survives paraphrase and keeps the same logical commitments.",
        ),
    ],
    "polyglot_code_projects": [
        (
            "ออกแบบโปรเจกต์ระดับสูงที่ดูแลได้หลายภาษา",
            "เริ่มจาก contract กลาง เช่น schema, API boundary, test fixtures และ build matrix จากนั้นแยก implementation ต่อภาษาโดยให้ทุกภาษาผ่าน golden tests เดียวกัน เพื่อลด drift ระหว่าง Python, TypeScript, Rust, Go, C++, Java และ Swift",
        ),
        (
            "How should an expert project scaffold prove it is maintainable?",
            "It should include reproducible setup, typed boundaries, deterministic tests, lint/format rules, security review notes, benchmark hooks, and a release checklist. The scaffold is incomplete until a new contributor can run verification from a clean checkout.",
        ),
    ],
    "safe_cross_platform_commands": [
        (
            "สรุปหลักการใช้คำสั่งข้ามระบบให้ตรวจซ้ำได้",
            "แยก shell ก่อนเสมอ: PowerShell ใช้ cmdlet และ object pipeline, Bash ใช้ text streams, CMD ใช้ built-ins แบบ legacy, Android ใช้ adb, iOS ใช้ xcrun/simctl เมื่อมีสิทธิ์ถูกต้อง ทุกคำสั่งต้องระบุ working directory, dry-run หรือ read-only mode ก่อนทำงานที่เปลี่ยนสถานะ",
        ),
        (
            "Give a cross-device command verification pattern",
            "Use inspect-before-act: list target, print resolved path, validate it is inside the intended workspace, run a dry check, then apply the smallest command. Record stdout, stderr, exit code, tool version, and timestamp for reproducibility.",
        ),
    ],
    "cev_claim_evidence_verification": [
        (
            "CEV คืออะไรและใช้กันข้อมูลขยะอย่างไร",
            "CEV คือ Claim-Evidence-Verification: claim ระบุสิ่งที่พูด, evidence ระบุแหล่งหรือ artifact, verification ระบุวิธีตรวจซ้ำ ข้อมูลที่ไม่มีหนึ่งในสามส่วนนี้ถือว่าเสี่ยงปนเปื้อนและต้องลดคะแนนคุณภาพ",
        ),
        (
            "Build a deep CEV record for rare expert knowledge",
            "A deep record contains the claim, prerequisites, boundary conditions, failure modes, independent checks, minimal reproducible example, and provenance hash. This lets the model learn knowledge with audit trails instead of memorising unsupported prose.",
        ),
    ],
    "sandbox_rl_lua_os_tools": [
        (
            "ออกแบบ loop ให้โมเดลเรียนรู้จากข้อมูลน้อยแต่แก้โจทย์ด้วยเครื่องมือได้",
            "ใช้ pattern Think-Tool-Verify: วิเคราะห์โจทย์เป็น CEV ก่อน เลือกเครื่องมือ sandbox เช่น Lua สำหรับคำนวณเร็วหรือ file writer สำหรับสร้าง artifact แล้วตรวจผลด้วย ledger ทุก action ต้องมี claim, evidence และ verification เพื่อให้เรียนรู้จากตัวอย่างน้อยแต่ reusable สูง",
        ),
        (
            "How can a model create code and files without memorising every answer?",
            "Teach it tool policies and verification loops instead of fixed outputs: decompose the request, draft code, run a sandbox check, inspect stdout and files, repair failures, then save the final artifact with a manifest. This makes small data teach general procedures.",
        ),
    ],
    "natural_explanation_mastery": [
        (
            "อธิบายแนวคิดยากให้มนุษย์เข้าใจง่ายโดยไม่ทำให้ความหมายผิดเพี้ยน",
            "เริ่มจากภาพรวมหนึ่งประโยค จากนั้นค่อยแยกส่วนสำคัญทีละชั้น: สิ่งนี้คืออะไร ทำไมจึงสำคัญ ทำงานอย่างไร ตัวอย่างที่จับต้องได้คืออะไร และข้อจำกัดอยู่ตรงไหน ภาษาต้องเป็นธรรมชาติ ไม่ท่องศัพท์ ไม่ข้ามเงื่อนไข และไม่แต่งข้อมูลที่ไม่มีหลักฐาน",
        ),
        (
            "How should a model answer a complex question naturally and accurately?",
            "It should first identify the user's real goal, then explain the core idea in plain language, add the important details in a sensible order, mark uncertainty when evidence is incomplete, and end with a useful next action. The answer should feel like a careful expert talking to a person, not a dataset row.",
        ),
    ],
    "deep_learning_to_learn": [
        (
            "ทำอย่างไรให้โมเดลเรียนจากข้อมูลน้อยแต่ต่อยอดเองได้",
            "ให้สอนหลักการที่ถ่ายโอนได้แทนการจำคำตอบ: นิยามปัญหา ตรวจแหล่งข้อมูล สร้างสมมติฐาน ทดลอง ตรวจผล และปรับคำตอบ วงจรนี้ทำให้ตัวอย่างน้อยกลายเป็นวิธีคิดที่ใช้กับโจทย์ใหม่ได้ โดยยังต้องเก็บหลักฐานและข้อจำกัดไว้เสมอ",
        ),
        (
            "Explain self-improving learning without unsupported claims",
            "A small model can improve by using retrieval, tools, tests, and feedback loops around a compact reasoning core. It does not magically know everything; it asks for or retrieves evidence, checks outputs, stores what was verified, and updates future behavior through measured training or adapters.",
        ),
    ],
    "strict_instruction_following": [
        (
            "Return only compact JSON with keys answer and evidence for a verified result",
            '{"answer":"Verified result depends on the attached evidence.","evidence":"Use the provided source hash or say evidence is missing before making a factual claim."}',
        ),
        (
            "Answer with exactly one sentence that starts with Verified:",
            "Verified: the answer is supported only when source evidence, hash, and verification all point to the same claim.",
        ),
        (
            "Give exactly three bullet points about safe benchmark claims",
            "- Measure on a named benchmark before claiming quality.\n- Save JSON or CSV evidence with dates and source links.\n- Block world-best claims until official external ranks exist.",
        ),
    ],
    "deep_logic_reasoning": [
        (
            "If every verified claim has evidence and this claim has no evidence, what follows?",
            "The valid conclusion is that the claim is not verified. This does not prove the claim is false; it only says the verification condition has not been met. The logic separates truth from evidence status.",
        ),
        (
            "Explain the difference between implication, converse, inverse, and contrapositive",
            "For an implication if P then Q, the converse is if Q then P, the inverse is if not P then not Q, and the contrapositive is if not Q then not P. Only the contrapositive is logically equivalent to the original implication.",
        ),
        (
            "A system must answer only when evidence exists. Evidence is missing. What should the system do?",
            "It should refuse to assert a factual answer and state that evidence is missing. This is a policy consequence from the rule, not a guess about the hidden truth of the world.",
        ),
        (
            "How do you solve a contradiction in a reasoning trace?",
            "Locate the smallest pair of statements that cannot both be true, identify which premise lacks evidence or has weaker support, remove or qualify that premise, then recompute the conclusion from the remaining consistent set.",
        ),
    ],
}
# Coverage labels copied verbatim into the manifest by `write_jsonl`.
# There is one tag per DOMAIN_TEMPLATES domain, listed in the same order, but
# no code in this file links a tag to its domain: adding a domain does not add
# a tag automatically.
COVERAGE_TAGS = (
    "thai_advanced_semantics",
    "english_technical_reasoning",
    "python_typescript_rust_go_cpp_java_swift_project_scaffolds",
    "cmd_powershell_bash_android_ios_network_admin_safe",
    "cev_claim_evidence_verification_deep_records",
    "sandbox_rl_lua_workspace_file_project_tools",
    "natural_human_explanation_complex_topics",
    "small_data_learning_source_grounded_adaptation",
    "strict_instruction_following_json_prefix_bullets",
    "formal_logic_implication_contradiction_evidence_policy",
)
@dataclass(frozen=True)
class ExpertRecord:
    """One immutable curriculum row: a Q/A pair plus its CEV and provenance fields.

    Field order is part of the contract: `asdict` output feeds `_stable_id`
    and the emitted JSONL rows.
    """

    # Domain name; `build_records` uses the DOMAIN_TEMPLATES key.
    domain: str
    # Language code; `build_records` assigns "th" or "en".
    lang: str
    # Prompt text; `build_records` appends a " [case N]" suffix.
    question: str
    # Reference answer; `_junk_score` penalises answers under 80 characters.
    answer: str
    # CEV triple: what is asserted, what backs it, and how to re-check it.
    claim: str
    evidence: str
    verification: str
    # Provenance of the row and the licence it is distributed under.
    source: str
    license: str
    # `_junk_score` adds a penalty when this is below 0.95.
    quality_score: float
    # Stored and emitted; no filter visible in this file reads it.
    rarity_score: float
    # Defaults to 0.0; `select` recomputes the score and does not read this field.
    junk_score: float = 0.0
    openness_label: str = "open_pure_knowledge"
def _stable_id(record: ExpertRecord) -> str:
    """Return a deterministic 24-hex-character id derived from all record fields.

    The record is converted to a dict, serialised as key-sorted JSON (non-ASCII
    kept as-is), UTF-8 encoded and hashed with SHA-256; the first 24 hex
    digits of the digest are returned.
    """
    fields = asdict(record)
    canonical = json.dumps(fields, ensure_ascii=False, sort_keys=True)
    digest = hashlib.sha256(canonical.encode("utf-8"))
    return digest.hexdigest()[:24]
def _norm(text: str) -> str:
    """Return *text* trimmed, lowercased, with each whitespace run collapsed to one space."""
    lowered = text.strip().lower()
    return re.sub(r"\s+", " ", lowered)
def _junk_score(record: ExpertRecord) -> float:
    """Return a junk penalty in [0.0, 1.0]; 0.0 means no junk signal fired.

    Penalties are summed and the total is capped at 1.0:

    * +0.7 if any ``JUNK_MARKERS`` entry is a substring of the lowercased
      question/answer/claim/evidence/verification text
    * +0.2 if the stripped answer is shorter than 80 characters
    * +0.5 if claim, evidence or verification is blank
    * +0.5 if source or license is blank
    * +0.2 if ``quality_score`` is below 0.95

    ``rarity_score`` is not consulted.
    """
    text = "\n".join(
        (record.question, record.answer, record.claim, record.evidence, record.verification)
    ).lower()
    score = 0.0
    # Plain substring test: a marker also fires inside a longer word.
    if any(marker in text for marker in JUNK_MARKERS):
        score += 0.7
    if len(record.answer.strip()) < 80:
        score += 0.2
    cev_fields = (record.claim, record.evidence, record.verification)
    if not all(field.strip() for field in cev_fields):
        score += 0.5
    # The module promises to filter rows with missing provenance, so a blank
    # source or license counts as junk too.
    if not (record.source.strip() and record.license.strip()):
        score += 0.5
    if record.quality_score < 0.95:
        score += 0.2
    return min(score, 1.0)
class ExpertCurriculumForge:
    """Build an open, high-purity curriculum across expert domains.

    The forge expands ``DOMAIN_TEMPLATES`` into ``ExpertRecord`` rows, drops
    junk and duplicate rows, holds out part of every domain for evaluation,
    and writes two JSONL files plus a manifest.

    Args:
        records_per_domain: Rows generated per domain; values below 1 are
            raised to 1.
        eval_ratio: Target share of each domain held out for eval; clamped to
            the range [0.05, 0.5].
    """

    # Labels describing the filtering policy; copied verbatim into the manifest.
    purity_policy = (
        "junk_only_filtering",
        "no_domain_censorship_by_default",
        "deduplicated_by_domain_language_question",
        "claim_evidence_verification_required",
        "quality_rarity_thresholded",
        "provenance_and_license_required",
    )

    def __init__(self, records_per_domain: int = 4, eval_ratio: float = 0.2):
        self.records_per_domain = max(1, int(records_per_domain))
        self.eval_ratio = min(max(float(eval_ratio), 0.05), 0.5)

    def build_records(self) -> list[ExpertRecord]:
        """Expand every domain's templates into ``records_per_domain`` records.

        Templates are reused cyclically when a domain has fewer templates than
        ``records_per_domain``; the " [case N]" suffix keeps the questions
        distinct. Domains with no templates are skipped.

        Returns:
            The generated records, grouped by domain in ``DOMAIN_TEMPLATES``
            order.
        """
        thai_chars = re.compile(r"[\u0E00-\u0E7F]")
        verification = "Check CEV fields, run dedupe hash, verify answer length and domain tag coverage."
        rows: list[ExpertRecord] = []
        for domain, templates in DOMAIN_TEMPLATES.items():
            if not templates:
                # Nothing to cycle through; `i % 0` would raise ZeroDivisionError.
                continue
            claim = f"{domain} record teaches a reusable expert method, not a memorized answer."
            for i in range(self.records_per_domain):
                slot = i % len(templates)
                question, answer = templates[slot]
                # Any Thai character in either string marks the pair as Thai.
                lang = "th" if thai_chars.search(question + answer) else "en"
                rows.append(
                    ExpertRecord(
                        domain=domain,
                        lang=lang,
                        question=f"{question} [case {i}]",
                        answer=answer,
                        claim=claim,
                        evidence=f"local_open_pure_seed:{domain}:{slot}",
                        verification=verification,
                        source="local_expert_curriculum_seed",
                        license="internal-clean",
                        quality_score=0.99,
                        rarity_score=0.93,
                    )
                )
        return rows

    def select(self, records: Iterable[ExpertRecord]) -> tuple[list[ExpertRecord], int]:
        """Filter out junk, then deduplicate by domain, language and normalised question.

        Args:
            records: Candidate records, consumed once.

        Returns:
            ``(kept, blocked)``: the surviving records in first-seen order and
            the number rejected for a junk score above 0.03. Duplicates keep
            the first occurrence and are not counted in ``blocked``.
        """
        kept: dict[str, ExpertRecord] = {}
        blocked = 0
        for record in records:
            if _junk_score(record) > 0.03:
                blocked += 1
                continue
            key = f"{record.domain}:{record.lang}:{_norm(record.question)}"
            kept.setdefault(key, record)
        return list(kept.values()), blocked

    def _row(self, record: ExpertRecord) -> dict:
        """Render *record* as a JSON-ready dict with id, schema version and tagged text."""
        row = asdict(record)
        row["id"] = _stable_id(record)
        row["schema_version"] = SCHEMA_VERSION
        # Field values are inserted as-is; nothing is escaped inside the tags.
        row["text"] = (
            f"<domain>{record.domain}</domain>\n"
            f"<claim>{record.claim}</claim>\n"
            f"<evidence>{record.evidence}</evidence>\n"
            f"<verification>{record.verification}</verification>\n"
            f"<user>{record.question}</user>\n"
            f"<assistant>{record.answer}</assistant>"
        )
        return row

    def _split_train_eval(
        self, selected: list[ExpertRecord]
    ) -> tuple[list[ExpertRecord], list[ExpertRecord]]:
        """Hold out the tail of each domain's records for evaluation.

        A domain with a single record goes entirely to train. Otherwise at
        least one record, and about ``eval_ratio`` of the domain, is taken
        from the end of that domain's list. Relative order is preserved, so
        sorted input yields sorted output.
        """
        by_domain: dict[str, list[ExpertRecord]] = {}
        for record in selected:
            by_domain.setdefault(record.domain, []).append(record)
        train: list[ExpertRecord] = []
        held_out: list[ExpertRecord] = []
        for domain_records in by_domain.values():
            if len(domain_records) < 2:
                train.extend(domain_records)
                continue
            n_eval = max(1, round(len(domain_records) * self.eval_ratio))
            train.extend(domain_records[:-n_eval])
            held_out.extend(domain_records[-n_eval:])
        return train, held_out

    def write_jsonl(self, out_dir: str | Path) -> dict:
        """Build, filter, split and write the curriculum under *out_dir*.

        Writes ``expert_curriculum_train.jsonl``, ``expert_curriculum_eval.jsonl``
        and ``expert_curriculum_manifest.json``, creating *out_dir* if needed.

        Returns:
            The manifest dict that was written: paths, counts, per-domain and
            per-language tallies, SHA-256 of both JSONL files, and a preview
            of the first three rows.
        """
        # NOTE(review): when records_per_domain exceeds a domain's template
        # count, build_records repeats the same answer under different
        # "[case N]" questions, so an eval row can share its answer with a
        # train row. Confirm whether that overlap is acceptable.
        out = Path(out_dir)
        out.mkdir(parents=True, exist_ok=True)
        selected, blocked = self.select(self.build_records())
        selected.sort(key=lambda rec: (rec.domain, rec.lang, rec.question))
        # The split preserves order, so both halves stay sorted like `selected`.
        train_records, eval_records = self._split_train_eval(selected)
        train_path = out / "expert_curriculum_train.jsonl"
        eval_path = out / "expert_curriculum_eval.jsonl"
        self._write(train_path, train_records)
        self._write(eval_path, eval_records)
        manifest = {
            "schema_version": SCHEMA_VERSION,
            "train_path": str(train_path),
            "eval_path": str(eval_path),
            "records_written": len(selected),
            "train_records": len(train_records),
            "eval_records": len(eval_records),
            "blocked_records": blocked,
            "domain_counts": dict(Counter(rec.domain for rec in selected)),
            "lang_counts": dict(Counter(rec.lang for rec in selected)),
            "coverage_tags": list(COVERAGE_TAGS),
            "purity_policy": list(self.purity_policy),
            # Hash the bytes on disk so the digests match what consumers read.
            "sha256": {
                "train": hashlib.sha256(train_path.read_bytes()).hexdigest(),
                "eval": hashlib.sha256(eval_path.read_bytes()).hexdigest(),
            },
            # Only the previewed rows are rendered.
            "rows_preview": [self._row(rec) for rec in selected[:3]],
        }
        manifest_path = out / "expert_curriculum_manifest.json"
        manifest["manifest_path"] = str(manifest_path)
        manifest_path.write_text(
            json.dumps(manifest, ensure_ascii=False, indent=2, sort_keys=True),
            encoding="utf-8",
        )
        return manifest

    def _write(self, path: Path, records: list[ExpertRecord]) -> None:
        """Write *records* to *path* as UTF-8 JSONL with "\\n" line endings."""
        with path.open("w", encoding="utf-8", newline="\n") as f:
            for record in records:
                f.write(json.dumps(self._row(record), ensure_ascii=False, sort_keys=True) + "\n")

Xet Storage Details

Size:
19.2 kB
·
Xet hash:
fe07942c6fc98635f0d1fd3b3f8e7eee5e9acc878b3b6d672548c50060db00f3

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.