Buckets:

bbkdevops
/

unicosys-hypergraph-bucket

bbkdevops/unicosys-hypergraph-bucket/tinymind-native-colab-handoff/bundle/model/tinymind-apex/train_tinymind_apex.py

bbkdevops

about 1 month ago

download

raw

5.13 kB

	from __future__ import annotations

	import argparse
	import hashlib
	import json
	import time
	from pathlib import Path
	from typing import Any

	import joblib
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.neighbors import NearestNeighbors


	# Default value of the --root command-line option; main() writes its outputs
	# into the "artifacts" sub-directory of whichever root is in effect.
	DEFAULT_ROOT = Path(r"D:\ad\tinymind\model\tinymind-apex")
	# Directory build_records() reads the source JSONL datasets from
	# (distill/jsonl/... and toolcall/jsonl/...).  It is not affected by --root.
	DATA_ROOT = Path(r"D:\ad\tinymind\data")


	def read_jsonl(path: Path):
	    """Lazily yield one decoded JSON value per non-blank line of *path*.

	    A missing file is treated as an empty dataset: the generator finishes
	    without yielding anything.  Lines containing only whitespace are skipped.

	    The file is decoded as ``utf-8-sig`` so that a leading byte-order mark is
	    dropped instead of making ``json.loads`` reject the first record; UTF-8
	    files without a BOM decode exactly as they would with ``utf-8``.

	    Args:
	        path: Location of the JSON Lines file.

	    Yields:
	        The object decoded from each non-blank line, in file order.

	    Raises:
	        json.JSONDecodeError: If a non-blank line is not valid JSON.
	    """
	    if not path.exists():
	        return
	    with path.open("r", encoding="utf-8-sig") as f:
	        for line in f:
	            if line.strip():
	                yield json.loads(line)


	def text_from_apex(item: dict[str, Any]) -> str:
	    """Join the textual fields of an apexdistill record into one string.

	    The fields are taken in this order: ``id``, ``domain``, ``task``, the
	    JSON-encoded ``inputs``, ``synthesis.final_answer``,
	    ``synthesis.reasoning_summary`` and the JSON-encoded ``quality.rubric``.
	    Falsy fields are dropped; the remainder are stringified and joined with
	    newlines.  The JSON-encoded fields are never dropped for being empty,
	    because ``"{}"`` is a non-empty string.

	    Args:
	        item: One decoded apexdistill record.

	    Returns:
	        The newline-separated text for the record.
	    """
	    # ``or {}`` instead of a ``.get`` default: the default only applies when
	    # the key is absent, so an explicit ``null`` would otherwise reach
	    # ``.get`` as ``None`` and raise AttributeError.
	    synthesis = item.get("synthesis") or {}
	    quality = item.get("quality") or {}
	    parts = [
	        item.get("id", ""),
	        item.get("domain", ""),
	        item.get("task", ""),
	        json.dumps(item.get("inputs", {}), ensure_ascii=False),
	        synthesis.get("final_answer", ""),
	        synthesis.get("reasoning_summary", ""),
	        json.dumps(quality.get("rubric", {}), ensure_ascii=False),
	    ]
	    return "\n".join(str(p) for p in parts if p)


	def text_from_toolcall(item: dict[str, Any]) -> str:
	    """Join the textual fields of a tool-call record into one string.

	    The fields are taken in this order: ``id``, ``domain``, ``difficulty``,
	    ``risk``, the space-joined message contents, the JSON-encoded
	    ``expected_tool_calls``, the JSON-encoded ``validation`` and the
	    space-joined ``tags``.  Falsy fields are dropped; the remainder are
	    stringified and joined with newlines.

	    Args:
	        item: One decoded tool-call record.

	    Returns:
	        The newline-separated text for the record.
	    """
	    # A message whose content is missing or null contributes an empty string,
	    # and any other non-string content is stringified, so ``str.join`` never
	    # sees a non-str element.  ``or []`` likewise covers a null ``messages``.
	    messages = " ".join(
	        str(m.get("content") or "") for m in item.get("messages") or []
	    )
	    parts = [
	        item.get("id", ""),
	        item.get("domain", ""),
	        item.get("difficulty", ""),
	        item.get("risk", ""),
	        messages,
	        json.dumps(item.get("expected_tool_calls", []), ensure_ascii=False),
	        json.dumps(item.get("validation", {}), ensure_ascii=False),
	        # Tolerate a null tag list and non-string tags.
	        " ".join(str(tag) for tag in item.get("tags") or []),
	    ]
	    return "\n".join(str(p) for p in parts if p)


	def build_records() -> list[dict[str, Any]]:
	    """Load both source datasets under DATA_ROOT into one list of records.

	    Every record has the keys ``id``, ``kind``, ``domain``, ``task``,
	    ``text``, ``answer``, ``tool_calls`` and ``quality``.  Records from the
	    apexdistill file come first, followed by records from the tool-call file;
	    a source file that does not exist contributes nothing.

	    Returns:
	        The combined records, in file order.

	    Raises:
	        KeyError: If a source item has no ``"id"`` key.
	    """
	    records: list[dict[str, Any]] = []
	    apex_path = DATA_ROOT / "distill" / "jsonl" / "apexdistill_gold_10000d.jsonl"
	    tool_path = DATA_ROOT / "toolcall" / "jsonl" / "toolcall_gold.jsonl"

	    # read_jsonl() is a generator function, so its result is always truthy and
	    # an ``or []`` fallback can never take effect; a missing file already
	    # yields nothing, so the result is iterated directly.
	    for item in read_jsonl(apex_path):
	        # ``or {}`` also covers a "synthesis" key that is present but null.
	        synthesis = item.get("synthesis") or {}
	        records.append(
	            {
	                "id": item["id"],
	                "kind": "apexdistill",
	                "domain": item.get("domain", ""),
	                "task": item.get("task", ""),
	                "text": text_from_apex(item),
	                "answer": synthesis.get("final_answer", ""),
	                "tool_calls": synthesis.get("recommended_tool_calls", []),
	                "quality": item.get("quality", {}),
	            }
	        )

	    for item in read_jsonl(tool_path):
	        # The task of a tool-call record is the content of its first user
	        # message, or "" when there is none.
	        user = next(
	            (
	                m.get("content", "")
	                for m in item.get("messages") or []
	                if m.get("role") == "user"
	            ),
	            "",
	        )
	        records.append(
	            {
	                "id": item["id"],
	                "kind": "toolcall_gold",
	                "domain": item.get("domain", ""),
	                "task": user,
	                "text": text_from_toolcall(item),
	                "answer": "Use the expected structured tool call exactly as specified by the schema.",
	                "tool_calls": item.get("expected_tool_calls", []),
	                "quality": {"risk": item.get("risk"), "difficulty": item.get("difficulty")},
	            }
	        )

	    return records


	def main() -> int:
	    """Fit the TF-IDF nearest-neighbour index and write its artifacts.

	    Command line:
	        --root  Directory under which ``artifacts/`` is created
	                (default: DEFAULT_ROOT).

	    Files written to ``<root>/artifacts``:
	        tinymind_apex_model.joblib  dict holding the fitted vectorizer, the
	                                    fitted NearestNeighbors object and the
	                                    TF-IDF matrix.
	        records.jsonl               one JSON object per record, in the same
	                                    order as the rows of the matrix.
	        manifest.json               metadata, including the SHA-256 of
	                                    records.jsonl.

	    The manifest is also printed to stdout.

	    Returns:
	        0 on success.

	    Raises:
	        SystemExit: If build_records() returns no records.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--root", default=str(DEFAULT_ROOT))
	    args = parser.parse_args()
	    root = Path(args.root)
	    artifact_dir = root / "artifacts"
	    artifact_dir.mkdir(parents=True, exist_ok=True)

	    records = build_records()
	    if not records:
	        raise SystemExit("No records found. Build datasets first.")

	    corpus = [r["text"] for r in records]
	    vectorizer = TfidfVectorizer(
	        analyzer="char_wb",
	        ngram_range=(3, 6),
	        min_df=1,
	        max_features=250_000,
	        sublinear_tf=True,
	        norm="l2",
	    )
	    matrix = vectorizer.fit_transform(corpus)
	    # Never request more neighbours than there are indexed records.
	    nn = NearestNeighbors(n_neighbors=min(12, len(records)), metric="cosine")
	    nn.fit(matrix)

	    model_path = artifact_dir / "tinymind_apex_model.joblib"
	    joblib.dump({"vectorizer": vectorizer, "nn": nn, "matrix": matrix}, model_path)

	    records_path = artifact_dir / "records.jsonl"
	    # newline="\n" switches off platform newline translation, so the bytes on
	    # disk (and therefore records_sha256) are the same on every operating
	    # system for the same records.
	    with records_path.open("w", encoding="utf-8", newline="\n") as f:
	        for record in records:
	            f.write(json.dumps(record, ensure_ascii=False) + "\n")

	    # Hash the bytes actually written so the manifest can be checked against
	    # the file itself.
	    digest = hashlib.sha256(records_path.read_bytes()).hexdigest()
	    manifest = {
	        "name": "tinymind-apex-runtime",
	        "version": "1.0.0",
	        "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
	        "records": len(records),
	        "sources": {
	            "apexdistill_10000d": str(DATA_ROOT / "distill" / "jsonl" / "apexdistill_gold_10000d.jsonl"),
	            "toolcall_gold": str(DATA_ROOT / "toolcall" / "jsonl" / "toolcall_gold.jsonl"),
	        },
	        "model_type": "tfidf_char_ngram_nearest_neighbor_tool_policy",
	        "records_sha256": digest,
	        "artifact": str(model_path),
	        "records_path": str(records_path),
	    }
	    (artifact_dir / "manifest.json").write_text(json.dumps(manifest, indent=2, ensure_ascii=False), encoding="utf-8")
	    print(json.dumps(manifest, indent=2, ensure_ascii=False))
	    return 0


	if __name__ == "__main__":
	    # Script entry point: run main() and use its return value as the
	    # process exit status.
	    exit_code = main()
	    raise SystemExit(exit_code)

Xet Storage Details

Size: 5.13 kB
Xet hash: 1edbc517c0e0c0cee9f472ef1a2a87d3f4c04023e23f8d6d5cea3d690ec714c6

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.