Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

Files

xet

bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /data /expert_curriculum_forge.py

bbkdevops

about 1 month ago

download

raw

19.2 kB

	"""Open Pure expert curriculum forge for TinyMind.

	The forge filters junk only: malformed rows, duplicates, low-quality text,
	missing provenance, weak CEV structure, and unverifiable filler. It does not
	drop domains by default; domain breadth is represented through explicit
	coverage tags and claim/evidence/verification fields.
	"""

	from __future__ import annotations

	from collections import Counter
	from dataclasses import asdict, dataclass
	import hashlib
	import json
	import re
	from pathlib import Path
	from typing import Iterable


	# Schema identifier stamped onto every emitted JSONL row and onto the manifest.
	SCHEMA_VERSION = "tinymind-open-pure-expert-curriculum-v1"

	# Lower-case substrings that flag a record as junk. `_junk_score` lower-cases
	# the combined question/answer/claim/evidence/verification text and adds a
	# penalty when any marker occurs anywhere in it (plain substring match, not a
	# whole-word match). The two Thai entries translate to "don't know" and
	# "not sure".
	JUNK_MARKERS = (
	"lorem ipsum",
	"todo",
	"fixme",
	"???",
	"ไม่รู้",
	"ไม่แน่ใจ",
	"as an ai language model",
	)

	# Seed material for the curriculum: maps a domain name to a list of
	# (question, answer) pairs. `ExpertCurriculumForge.build_records` cycles
	# through each list (index modulo list length) to generate rows, and labels a
	# row "th" when the question or answer contains Thai-script characters,
	# otherwise "en". These strings are emitted verbatim into the dataset.
	DOMAIN_TEMPLATES = {
	"thai_advanced_language": [
	(
	"อธิบายการเขียนภาษาไทยเชิงวิเคราะห์ระดับสูงให้คม ชัด และตรวจทานได้",
	"แยกแก่นความหมายก่อน แล้วจัดประโยคเป็นชั้น: ประเด็นหลัก เหตุผล หลักฐาน และข้อจำกัด ใช้คำเชื่อมเพื่อบอกความสัมพันธ์เชิงตรรกะ เช่น เพราะว่า อย่างไรก็ตาม ดังนั้น และหลีกเลี่ยงคำฟุ่มเฟือยที่ไม่เพิ่มสาระ",
	),
	(
	"สร้างเกณฑ์ตรวจคำตอบภาษาไทยระดับผู้เชี่ยวชาญ",
	"ตรวจ 5 ชั้น: ความถูกต้องของข้อเท็จจริง ความครบถ้วนของเหตุผล ความแม่นของศัพท์เฉพาะ ความต่อเนื่องทางตรรกะ และความกระชับของภาษา ทุกข้ออ้างต้องโยงกับหลักฐานหรือเงื่อนไขที่ระบุได้",
	),
	],
	"english_advanced_language": [
	(
	"Explain advanced English technical writing for precise reasoning",
	"Start with the governing claim, define scope, state assumptions, then separate evidence from interpretation. Prefer concrete verbs, preserve modality, and mark uncertainty with calibrated language instead of vague hedging.",
	),
	(
	"Design an expert rubric for English answers",
	"Score semantic accuracy, causal structure, terminology control, counterexample handling, and revision quality. A strong answer survives paraphrase and keeps the same logical commitments.",
	),
	],
	"polyglot_code_projects": [
	(
	"ออกแบบโปรเจกต์ระดับสูงที่ดูแลได้หลายภาษา",
	"เริ่มจาก contract กลาง เช่น schema, API boundary, test fixtures และ build matrix จากนั้นแยก implementation ต่อภาษาโดยให้ทุกภาษาผ่าน golden tests เดียวกัน เพื่อลด drift ระหว่าง Python, TypeScript, Rust, Go, C++, Java และ Swift",
	),
	(
	"How should an expert project scaffold prove it is maintainable?",
	"It should include reproducible setup, typed boundaries, deterministic tests, lint/format rules, security review notes, benchmark hooks, and a release checklist. The scaffold is incomplete until a new contributor can run verification from a clean checkout.",
	),
	],
	"safe_cross_platform_commands": [
	(
	"สรุปหลักการใช้คำสั่งข้ามระบบให้ตรวจซ้ำได้",
	"แยก shell ก่อนเสมอ: PowerShell ใช้ cmdlet และ object pipeline, Bash ใช้ text streams, CMD ใช้ built-ins แบบ legacy, Android ใช้ adb, iOS ใช้ xcrun/simctl เมื่อมีสิทธิ์ถูกต้อง ทุกคำสั่งต้องระบุ working directory, dry-run หรือ read-only mode ก่อนทำงานที่เปลี่ยนสถานะ",
	),
	(
	"Give a cross-device command verification pattern",
	"Use inspect-before-act: list target, print resolved path, validate it is inside the intended workspace, run a dry check, then apply the smallest command. Record stdout, stderr, exit code, tool version, and timestamp for reproducibility.",
	),
	],
	"cev_claim_evidence_verification": [
	(
	"CEV คืออะไรและใช้กันข้อมูลขยะอย่างไร",
	"CEV คือ Claim-Evidence-Verification: claim ระบุสิ่งที่พูด, evidence ระบุแหล่งหรือ artifact, verification ระบุวิธีตรวจซ้ำ ข้อมูลที่ไม่มีหนึ่งในสามส่วนนี้ถือว่าเสี่ยงปนเปื้อนและต้องลดคะแนนคุณภาพ",
	),
	(
	"Build a deep CEV record for rare expert knowledge",
	"A deep record contains the claim, prerequisites, boundary conditions, failure modes, independent checks, minimal reproducible example, and provenance hash. This lets the model learn knowledge with audit trails instead of memorising unsupported prose.",
	),
	],
	"sandbox_rl_lua_os_tools": [
	(
	"ออกแบบ loop ให้โมเดลเรียนรู้จากข้อมูลน้อยแต่แก้โจทย์ด้วยเครื่องมือได้",
	"ใช้ pattern Think-Tool-Verify: วิเคราะห์โจทย์เป็น CEV ก่อน เลือกเครื่องมือ sandbox เช่น Lua สำหรับคำนวณเร็วหรือ file writer สำหรับสร้าง artifact แล้วตรวจผลด้วย ledger ทุก action ต้องมี claim, evidence และ verification เพื่อให้เรียนรู้จากตัวอย่างน้อยแต่ reusable สูง",
	),
	(
	"How can a model create code and files without memorising every answer?",
	"Teach it tool policies and verification loops instead of fixed outputs: decompose the request, draft code, run a sandbox check, inspect stdout and files, repair failures, then save the final artifact with a manifest. This makes small data teach general procedures.",
	),
	],
	"natural_explanation_mastery": [
	(
	"อธิบายแนวคิดยากให้มนุษย์เข้าใจง่ายโดยไม่ทำให้ความหมายผิดเพี้ยน",
	"เริ่มจากภาพรวมหนึ่งประโยค จากนั้นค่อยแยกส่วนสำคัญทีละชั้น: สิ่งนี้คืออะไร ทำไมจึงสำคัญ ทำงานอย่างไร ตัวอย่างที่จับต้องได้คืออะไร และข้อจำกัดอยู่ตรงไหน ภาษาต้องเป็นธรรมชาติ ไม่ท่องศัพท์ ไม่ข้ามเงื่อนไข และไม่แต่งข้อมูลที่ไม่มีหลักฐาน",
	),
	(
	"How should a model answer a complex question naturally and accurately?",
	"It should first identify the user's real goal, then explain the core idea in plain language, add the important details in a sensible order, mark uncertainty when evidence is incomplete, and end with a useful next action. The answer should feel like a careful expert talking to a person, not a dataset row.",
	),
	],
	"deep_learning_to_learn": [
	(
	"ทำอย่างไรให้โมเดลเรียนจากข้อมูลน้อยแต่ต่อยอดเองได้",
	"ให้สอนหลักการที่ถ่ายโอนได้แทนการจำคำตอบ: นิยามปัญหา ตรวจแหล่งข้อมูล สร้างสมมติฐาน ทดลอง ตรวจผล และปรับคำตอบ วงจรนี้ทำให้ตัวอย่างน้อยกลายเป็นวิธีคิดที่ใช้กับโจทย์ใหม่ได้ โดยยังต้องเก็บหลักฐานและข้อจำกัดไว้เสมอ",
	),
	(
	"Explain self-improving learning without unsupported claims",
	"A small model can improve by using retrieval, tools, tests, and feedback loops around a compact reasoning core. It does not magically know everything; it asks for or retrieves evidence, checks outputs, stores what was verified, and updates future behavior through measured training or adapters.",
	),
	],
	"strict_instruction_following": [
	(
	"Return only compact JSON with keys answer and evidence for a verified result",
	'{"answer":"Verified result depends on the attached evidence.","evidence":"Use the provided source hash or say evidence is missing before making a factual claim."}',
	),
	(
	"Answer with exactly one sentence that starts with Verified:",
	"Verified: the answer is supported only when source evidence, hash, and verification all point to the same claim.",
	),
	(
	"Give exactly three bullet points about safe benchmark claims",
	"- Measure on a named benchmark before claiming quality.\n- Save JSON or CSV evidence with dates and source links.\n- Block world-best claims until official external ranks exist.",
	),
	],
	"deep_logic_reasoning": [
	(
	"If every verified claim has evidence and this claim has no evidence, what follows?",
	"The valid conclusion is that the claim is not verified. This does not prove the claim is false; it only says the verification condition has not been met. The logic separates truth from evidence status.",
	),
	(
	"Explain the difference between implication, converse, inverse, and contrapositive",
	"For an implication if P then Q, the converse is if Q then P, the inverse is if not P then not Q, and the contrapositive is if not Q then not P. Only the contrapositive is logically equivalent to the original implication.",
	),
	(
	"A system must answer only when evidence exists. Evidence is missing. What should the system do?",
	"It should refuse to assert a factual answer and state that evidence is missing. This is a policy consequence from the rule, not a guess about the hidden truth of the world.",
	),
	(
	"How do you solve a contradiction in a reasoning trace?",
	"Locate the smallest pair of statements that cannot both be true, identify which premise lacks evidence or has weaker support, remove or qualify that premise, then recompute the conclusion from the remaining consistent set.",
	),
	],
	}

	# Breadth labels copied verbatim into the manifest under "coverage_tags".
	# NOTE(review): there are ten tags and ten domains in DOMAIN_TEMPLATES, listed
	# in what looks like matching order, but no visible code links a tag to a
	# domain -- confirm before relying on positional correspondence.
	COVERAGE_TAGS = (
	"thai_advanced_semantics",
	"english_technical_reasoning",
	"python_typescript_rust_go_cpp_java_swift_project_scaffolds",
	"cmd_powershell_bash_android_ios_network_admin_safe",
	"cev_claim_evidence_verification_deep_records",
	"sandbox_rl_lua_workspace_file_project_tools",
	"natural_human_explanation_complex_topics",
	"small_data_learning_source_grounded_adaptation",
	"strict_instruction_following_json_prefix_bullets",
	"formal_logic_implication_contradiction_evidence_policy",
	)


	@dataclass(frozen=True)
	class ExpertRecord:
	    """One immutable curriculum example with its CEV and provenance metadata.

	    Instances are frozen, so they are hashable and cannot be modified after
	    construction. Field order is part of the positional constructor.
	    """

	    # Domain name and language code of the example.
	    domain: str
	    lang: str

	    # The prompt and the reference response.
	    question: str
	    answer: str

	    # Claim / Evidence / Verification triple attached to the example.
	    claim: str
	    evidence: str
	    verification: str

	    # Provenance of the example.
	    source: str
	    license: str

	    # Scores supplied by whoever creates the record.
	    quality_score: float
	    rarity_score: float

	    # Optional fields with defaults.
	    junk_score: float = 0.0
	    openness_label: str = "open_pure_knowledge"


	def _stable_id(record: ExpertRecord) -> str:
	payload = json.dumps(asdict(record), ensure_ascii=False, sort_keys=True)
	return hashlib.sha256(payload.encode("utf-8")).hexdigest()[:24]


	def _norm(text: str) -> str:
	return re.sub(r"\s+", " ", text.strip().lower())


	def _junk_score(record: ExpertRecord) -> float:
	    """Return an additive junk penalty for *record*, capped at 1.0.

	    A score of 0.0 means no junk signal was found. Penalties:

	    * +0.7 when any ``JUNK_MARKERS`` entry occurs as a substring of the
	      lower-cased question/answer/claim/evidence/verification text;
	    * +0.2 when the stripped answer is shorter than 80 characters;
	    * +0.5 when any of claim, evidence, or verification is blank;
	    * +0.5 when ``source`` or ``license`` is blank;
	    * +0.2 when ``quality_score`` is below 0.95.

	    Args:
	        record: The record to score; its text fields must be strings.

	    Returns:
	        The summed penalty, never greater than 1.0.
	    """
	    text = f"{record.question}\n{record.answer}\n{record.claim}\n{record.evidence}\n{record.verification}".lower()
	    score = 0.0
	    if any(marker in text for marker in JUNK_MARKERS):
	        score += 0.7
	    if len(record.answer.strip()) < 80:
	        score += 0.2
	    if not all((record.claim.strip(), record.evidence.strip(), record.verification.strip())):
	        score += 0.5
	    # The module promises to filter rows with missing provenance, so a blank
	    # source or license must count as junk just like a missing CEV field.
	    if not (record.source.strip() and record.license.strip()):
	        score += 0.5
	    if record.quality_score < 0.95:
	        score += 0.2
	    return min(score, 1.0)


	class ExpertCurriculumForge:
	    """Build an open, high-purity curriculum across expert domains.

	    The forge expands the seed pairs in ``DOMAIN_TEMPLATES`` into
	    ``ExpertRecord`` rows, drops rows whose junk score exceeds the threshold,
	    de-duplicates the rest, splits every domain into train and eval
	    partitions, and writes both partitions as JSONL next to a JSON manifest.
	    """

	    # Policy labels copied verbatim into the manifest.
	    purity_policy = (
	        "junk_only_filtering",
	        "no_domain_censorship_by_default",
	        "deduplicated_by_domain_language_question",
	        "claim_evidence_verification_required",
	        "quality_rarity_thresholded",
	        "provenance_and_license_required",
	    )

	    # Records whose junk score is strictly above this value are blocked.
	    junk_threshold = 0.03

	    # Matches any character in the Thai Unicode block; used to label language.
	    _THAI_CHARS = re.compile(r"[\u0E00-\u0E7F]")

	    def __init__(self, records_per_domain: int = 4, eval_ratio: float = 0.2):
	        """Store the generation settings after clamping them to safe ranges.

	        Args:
	            records_per_domain: Rows to generate per domain; values below 1
	                are raised to 1.
	            eval_ratio: Fraction of each domain held out for evaluation;
	                clamped to the range [0.05, 0.5].
	        """
	        self.records_per_domain = max(1, int(records_per_domain))
	        self.eval_ratio = min(max(float(eval_ratio), 0.05), 0.5)

	    def build_records(self) -> list[ExpertRecord]:
	        """Expand every domain's seed pairs into ``records_per_domain`` rows.

	        Seed pairs are reused cyclically when more rows are requested than a
	        domain has pairs; the ``[case i]`` suffix keeps the questions distinct.

	        Returns:
	            The generated records, grouped by domain in template order.
	        """
	        rows: list[ExpertRecord] = []
	        for domain, templates in DOMAIN_TEMPLATES.items():
	            for i in range(self.records_per_domain):
	                slot = i % len(templates)
	                question, answer = templates[slot]
	                lang = "th" if self._THAI_CHARS.search(question + answer) else "en"
	                rows.append(
	                    ExpertRecord(
	                        domain=domain,
	                        lang=lang,
	                        question=f"{question} [case {i}]",
	                        answer=answer,
	                        claim=f"{domain} record teaches a reusable expert method, not a memorized answer.",
	                        evidence=f"local_open_pure_seed:{domain}:{slot}",
	                        verification="Check CEV fields, run dedupe hash, verify answer length and domain tag coverage.",
	                        source="local_expert_curriculum_seed",
	                        license="internal-clean",
	                        quality_score=0.99,
	                        rarity_score=0.93,
	                    )
	                )
	        return rows

	    def select(self, records: Iterable[ExpertRecord]) -> tuple[list[ExpertRecord], int]:
	        """Filter out junk and duplicates.

	        Args:
	            records: Candidate records, consumed once in order.

	        Returns:
	            ``(kept, blocked)`` where ``kept`` holds the first record seen for
	            each domain/language/normalised-question key and ``blocked``
	            counts records rejected for their junk score. Dropped duplicates
	            are not included in ``blocked``.
	        """
	        kept: dict[str, ExpertRecord] = {}
	        blocked = 0
	        for record in records:
	            if _junk_score(record) > self.junk_threshold:
	                blocked += 1
	                continue
	            key = f"{record.domain}:{record.lang}:{_norm(record.question)}"
	            # First occurrence wins; later duplicates are silently ignored.
	            kept.setdefault(key, record)
	        return list(kept.values()), blocked

	    def _row(self, record: ExpertRecord) -> dict:
	        """Return the JSON-ready dict for *record*.

	        The dict holds every record field plus ``id``, ``schema_version``,
	        and a tagged ``text`` rendering of the example.
	        """
	        row = asdict(record)
	        row["id"] = _stable_id(record)
	        row["schema_version"] = SCHEMA_VERSION
	        row["text"] = (
	            f"<domain>{record.domain}</domain>\n"
	            f"<claim>{record.claim}</claim>\n"
	            f"<evidence>{record.evidence}</evidence>\n"
	            f"<verification>{record.verification}</verification>\n"
	            f"<user>{record.question}</user>\n"
	            f"<assistant>{record.answer}</assistant>"
	        )
	        return row

	    def _split_train_eval(
	        self, selected: list[ExpertRecord]
	    ) -> tuple[list[ExpertRecord], list[ExpertRecord]]:
	        """Split *selected* per domain into ``(train, eval)`` lists.

	        Within each domain the trailing records go to eval. A domain with a
	        single record contributes nothing to eval; otherwise at least one
	        record is held out. Input order is preserved inside both outputs.
	        """
	        by_domain: dict[str, list[ExpertRecord]] = {}
	        for record in selected:
	            by_domain.setdefault(record.domain, []).append(record)

	        train_records: list[ExpertRecord] = []
	        eval_records: list[ExpertRecord] = []
	        for domain_records in by_domain.values():
	            total = len(domain_records)
	            n_eval = max(1, int(round(total * self.eval_ratio))) if total > 1 else 0
	            # Slicing at an explicit cut index stays correct when n_eval is 0,
	            # where a negative-index slice would not.
	            cut = total - n_eval
	            train_records.extend(domain_records[:cut])
	            eval_records.extend(domain_records[cut:])
	        return train_records, eval_records

	    def write_jsonl(self, out_dir: str | Path) -> dict:
	        """Generate the curriculum and write it under *out_dir*.

	        Creates the directory if needed, then writes
	        ``expert_curriculum_train.jsonl``, ``expert_curriculum_eval.jsonl``,
	        and ``expert_curriculum_manifest.json``.

	        Args:
	            out_dir: Destination directory.

	        Returns:
	            The manifest dict that was written, including output paths,
	            record counts, SHA-256 digests of both JSONL files, and a preview
	            of the first three rows.
	        """
	        out = Path(out_dir)
	        out.mkdir(parents=True, exist_ok=True)

	        selected, blocked = self.select(self.build_records())
	        selected.sort(key=lambda rec: (rec.domain, rec.lang, rec.question))
	        # Both partitions inherit the sorted order of `selected`, so they need
	        # no further sorting.
	        train_records, eval_records = self._split_train_eval(selected)

	        train_path = out / "expert_curriculum_train.jsonl"
	        eval_path = out / "expert_curriculum_eval.jsonl"
	        self._write(train_path, train_records)
	        self._write(eval_path, eval_records)

	        manifest = {
	            "schema_version": SCHEMA_VERSION,
	            "train_path": str(train_path),
	            "eval_path": str(eval_path),
	            "records_written": len(selected),
	            "train_records": len(train_records),
	            "eval_records": len(eval_records),
	            "blocked_records": blocked,
	            "domain_counts": dict(Counter(rec.domain for rec in selected)),
	            "lang_counts": dict(Counter(rec.lang for rec in selected)),
	            "coverage_tags": list(COVERAGE_TAGS),
	            "purity_policy": list(self.purity_policy),
	            # Digests are taken from the bytes actually on disk.
	            "sha256": {
	                "train": hashlib.sha256(train_path.read_bytes()).hexdigest(),
	                "eval": hashlib.sha256(eval_path.read_bytes()).hexdigest(),
	            },
	            # Only the previewed rows are rendered, not the whole selection.
	            "rows_preview": [self._row(rec) for rec in selected[:3]],
	        }
	        manifest_path = out / "expert_curriculum_manifest.json"
	        manifest["manifest_path"] = str(manifest_path)
	        manifest_path.write_text(
	            json.dumps(manifest, ensure_ascii=False, indent=2, sort_keys=True),
	            encoding="utf-8",
	        )
	        return manifest

	    def _write(self, path: Path, records: list[ExpertRecord]) -> None:
	        """Write *records* to *path* as UTF-8 JSONL, one row per line.

	        Line endings are always ``\\n`` regardless of platform, and keys are
	        sorted so the file content is reproducible.
	        """
	        with path.open("w", encoding="utf-8", newline="\n") as f:
	            f.writelines(
	                json.dumps(self._row(record), ensure_ascii=False, sort_keys=True) + "\n"
	                for record in records
	            )

Xet Storage Details

Size: 19.2 kB
Xet hash: fe07942c6fc98635f0d1fd3b3f8e7eee5e9acc878b3b6d672548c50060db00f3

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.