Buckets:
bbkdevops/unicosys-hypergraph-bucket/tinymind-native-colab-handoff/bundle/model/tinymind-apex/train_tinymind_apex.py
| from __future__ import annotations | |
| import argparse | |
| import hashlib | |
| import json | |
| import time | |
| from pathlib import Path | |
| from typing import Any | |
| import joblib | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.neighbors import NearestNeighbors | |
# Default value of the --root command-line option; main() writes its outputs
# into an "artifacts" directory beneath this root.
# NOTE(review): absolute Windows path -- presumably the author's workstation
# layout; confirm before running on another machine.
DEFAULT_ROOT: Path = Path(r"D:\ad\tinymind\model\tinymind-apex")
# Root under which build_records() looks for the input JSONL datasets
# ("distill/jsonl/..." and "toolcall/jsonl/...").
DATA_ROOT: Path = Path(r"D:\ad\tinymind\data")
def read_jsonl(path: Path):
    """Lazily yield one decoded JSON value per non-blank line of *path*.

    A missing file is treated as an empty dataset: the generator yields
    nothing instead of raising.

    Args:
        path: Location of a UTF-8 encoded JSON Lines file.

    Yields:
        The value decoded from each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if not path.exists():
        return
    # "utf-8-sig" decodes plain UTF-8 identically but also strips a leading
    # byte-order mark, which json.loads would otherwise reject on line one.
    with path.open("r", encoding="utf-8-sig") as f:
        for line in f:
            if line.strip():
                yield json.loads(line)
def text_from_apex(item: dict[str, Any]) -> str:
    """Flatten an apexdistill record into one newline-separated text blob.

    The blob is built, in order, from the record's id, domain, task,
    JSON-encoded inputs, final answer, reasoning summary and JSON-encoded
    quality rubric. Falsy parts (missing or empty strings) are dropped; the
    JSON-encoded parts are never empty strings, so they always appear.

    Args:
        item: One decoded apexdistill record.

    Returns:
        The non-empty parts, each converted with ``str`` and joined by
        newlines.
    """
    # `or {}` (rather than a .get default) also covers an explicit JSON null,
    # for which the default would not apply and the chained .get would raise.
    synthesis = item.get("synthesis") or {}
    quality = item.get("quality") or {}
    parts = [
        item.get("id", ""),
        item.get("domain", ""),
        item.get("task", ""),
        json.dumps(item.get("inputs", {}), ensure_ascii=False),
        synthesis.get("final_answer", ""),
        synthesis.get("reasoning_summary", ""),
        json.dumps(quality.get("rubric", {}), ensure_ascii=False),
    ]
    return "\n".join(str(p) for p in parts if p)
def text_from_toolcall(item: dict[str, Any]) -> str:
    """Flatten a tool-call record into one newline-separated text blob.

    The blob is built, in order, from the record's id, domain, difficulty,
    risk, the space-joined message contents, the JSON-encoded expected tool
    calls, the JSON-encoded validation block and the space-joined tags.
    Falsy parts (missing or empty strings) are dropped.

    Args:
        item: One decoded tool-call record.

    Returns:
        The non-empty parts, each converted with ``str`` and joined by
        newlines.
    """
    # A message may carry `"content": null`; str.join would raise TypeError
    # on None, so a missing or null content contributes an empty string and
    # any other non-string content is stringified.
    contents = (m.get("content") for m in item.get("messages") or [])
    messages = " ".join("" if c is None else str(c) for c in contents)
    parts = [
        item.get("id", ""),
        item.get("domain", ""),
        item.get("difficulty", ""),
        item.get("risk", ""),
        messages,
        json.dumps(item.get("expected_tool_calls", []), ensure_ascii=False),
        json.dumps(item.get("validation", {}), ensure_ascii=False),
        # Same guard for tags: tolerate a null list and non-string entries.
        " ".join(str(tag) for tag in item.get("tags") or []),
    ]
    return "\n".join(str(p) for p in parts if p)
def build_records(data_root: Path | None = None) -> list[dict[str, Any]]:
    """Load both gold datasets and normalise them into retrieval records.

    Reads ``distill/jsonl/apexdistill_gold_10000d.jsonl`` and
    ``toolcall/jsonl/toolcall_gold.jsonl`` beneath the data root. A missing
    file contributes no records.

    Args:
        data_root: Directory containing the ``distill`` and ``toolcall``
            trees. Defaults to the module-level ``DATA_ROOT``.

    Returns:
        Apexdistill records followed by tool-call records. Each record has
        the keys ``id``, ``kind``, ``domain``, ``task``, ``text``,
        ``answer``, ``tool_calls`` and ``quality``.

    Raises:
        KeyError: If a source item has no ``"id"`` field.
    """
    base = DATA_ROOT if data_root is None else Path(data_root)
    apex_path = base / "distill" / "jsonl" / "apexdistill_gold_10000d.jsonl"
    tool_path = base / "toolcall" / "jsonl" / "toolcall_gold.jsonl"

    records: list[dict[str, Any]] = []

    # read_jsonl is a generator function, so a missing file already yields
    # nothing; an `or []` fallback would never trigger because a generator
    # object is always truthy.
    for item in read_jsonl(apex_path):
        # `or {}` also covers an explicit JSON null for "synthesis".
        synthesis = item.get("synthesis") or {}
        records.append(
            {
                "id": item["id"],
                "kind": "apexdistill",
                "domain": item.get("domain", ""),
                "task": item.get("task", ""),
                "text": text_from_apex(item),
                "answer": synthesis.get("final_answer", ""),
                "tool_calls": synthesis.get("recommended_tool_calls", []),
                "quality": item.get("quality", {}),
            }
        )

    for item in read_jsonl(tool_path):
        # The task of a tool-call record is its first user message.
        user = next(
            (
                m.get("content", "")
                for m in item.get("messages") or []
                if m.get("role") == "user"
            ),
            "",
        )
        records.append(
            {
                "id": item["id"],
                "kind": "toolcall_gold",
                "domain": item.get("domain", ""),
                "task": user,
                "text": text_from_toolcall(item),
                "answer": "Use the expected structured tool call exactly as specified by the schema.",
                "tool_calls": item.get("expected_tool_calls", []),
                "quality": {"risk": item.get("risk"), "difficulty": item.get("difficulty")},
            }
        )

    return records
def main(argv: list[str] | None = None) -> int:
    """Fit the TF-IDF nearest-neighbour index and write its artifacts.

    Writes three files into ``<root>/artifacts``:
    ``tinymind_apex_model.joblib`` (vectorizer, fitted neighbour index and
    TF-IDF matrix), ``records.jsonl`` (one record per line) and
    ``manifest.json`` (metadata including the SHA-256 of ``records.jsonl``).
    The manifest is also printed to stdout.

    Args:
        argv: Command-line arguments to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        ``0`` on success.

    Raises:
        SystemExit: If no records could be loaded.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--root", default=str(DEFAULT_ROOT))
    args = parser.parse_args(argv)

    records = build_records()
    if not records:
        raise SystemExit("No records found. Build datasets first.")

    # Create the output directory only once there is something to write, so
    # a failed run does not leave an empty artifacts folder behind.
    artifact_dir = Path(args.root) / "artifacts"
    artifact_dir.mkdir(parents=True, exist_ok=True)
    model_path = artifact_dir / "tinymind_apex_model.joblib"
    records_path = artifact_dir / "records.jsonl"

    corpus = [r["text"] for r in records]
    vectorizer = TfidfVectorizer(
        analyzer="char_wb",
        ngram_range=(3, 6),
        min_df=1,
        max_features=250_000,
        sublinear_tf=True,
        norm="l2",
    )
    matrix = vectorizer.fit_transform(corpus)
    # n_neighbors may not exceed the number of fitted samples.
    nn = NearestNeighbors(n_neighbors=min(12, len(records)), metric="cosine")
    nn.fit(matrix)
    joblib.dump({"vectorizer": vectorizer, "nn": nn, "matrix": matrix}, model_path)

    # newline="\n" disables platform newline translation so the file bytes,
    # and therefore records_sha256, are identical on Windows and POSIX.
    with records_path.open("w", encoding="utf-8", newline="\n") as f:
        for record in records:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
    digest = hashlib.sha256(records_path.read_bytes()).hexdigest()

    manifest = {
        "name": "tinymind-apex-runtime",
        "version": "1.0.0",
        "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "records": len(records),
        "sources": {
            "apexdistill_10000d": str(DATA_ROOT / "distill" / "jsonl" / "apexdistill_gold_10000d.jsonl"),
            "toolcall_gold": str(DATA_ROOT / "toolcall" / "jsonl" / "toolcall_gold.jsonl"),
        },
        "model_type": "tfidf_char_ngram_nearest_neighbor_tool_policy",
        "records_sha256": digest,
        "artifact": str(model_path),
        "records_path": str(records_path),
    }
    manifest_text = json.dumps(manifest, indent=2, ensure_ascii=False)
    (artifact_dir / "manifest.json").write_text(manifest_text, encoding="utf-8")
    print(manifest_text)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
Xet Storage Details
- Size: 5.13 kB
- Xet hash: 1edbc517c0e0c0cee9f472ef1a2a87d3f4c04023e23f8d6d5cea3d690ec714c6
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.