bbkdevops's picture
download
raw
5.13 kB
from __future__ import annotations
import argparse
import hashlib
import json
import time
from pathlib import Path
from typing import Any
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
# Default value of the --root CLI option; main() writes its outputs under
# <root>/artifacts. Hard-coded absolute path on a D: drive.
DEFAULT_ROOT = Path(r"D:\ad\tinymind\model\tinymind-apex")
# Base directory of the input datasets; build_records() reads the
# distill/jsonl and toolcall/jsonl files below it. Not configurable via CLI.
DATA_ROOT = Path(r"D:\ad\tinymind\data")
def read_jsonl(path: Path):
    """Lazily yield one decoded JSON value per non-blank line of *path*.

    A missing file is treated as an empty dataset: the generator finishes
    without yielding anything. Lines consisting only of whitespace are
    skipped; every other line is passed to ``json.loads`` as-is, so a
    malformed line raises ``json.JSONDecodeError`` when it is reached.

    Args:
        path: Location of a UTF-8 encoded JSON Lines file.

    Yields:
        The decoded value of each non-blank line, in file order.
    """
    if path.exists():
        with path.open("r", encoding="utf-8") as handle:
            for raw_line in handle:
                if not raw_line.strip():
                    # Blank / whitespace-only separator line.
                    continue
                yield json.loads(raw_line)
def text_from_apex(item: dict[str, Any]) -> str:
    """Flatten an apexdistill record into one newline-separated text blob.

    The blob is assembled, in order, from ``id``, ``domain``, ``task``, the
    JSON-encoded ``inputs``, ``synthesis.final_answer``,
    ``synthesis.reasoning_summary`` and the JSON-encoded ``quality.rubric``.
    Falsy parts are dropped; the rest are stringified and joined with
    ``"\\n"``.

    ``synthesis`` and ``quality`` are treated as empty when they are
    missing, ``null`` or not a JSON object, so a partially filled record
    does not raise ``AttributeError``.

    Args:
        item: One decoded apexdistill record.

    Returns:
        The concatenated text for the record.
    """
    synthesis = item.get("synthesis")
    if not isinstance(synthesis, dict):
        # dict.get's default only covers a *missing* key; an explicit null
        # (or any non-object) would otherwise fail on the nested .get().
        synthesis = {}
    quality = item.get("quality")
    if not isinstance(quality, dict):
        quality = {}

    parts = [
        item.get("id", ""),
        item.get("domain", ""),
        item.get("task", ""),
        json.dumps(item.get("inputs", {}), ensure_ascii=False),
        synthesis.get("final_answer", ""),
        synthesis.get("reasoning_summary", ""),
        json.dumps(quality.get("rubric", {}), ensure_ascii=False),
    ]
    # The two json.dumps parts are never empty strings ("{}" at minimum),
    # so they always survive the truthiness filter.
    return "\n".join(str(p) for p in parts if p)
def text_from_toolcall(item: dict[str, Any]) -> str:
    """Flatten a tool-call gold record into one newline-separated text blob.

    The blob is assembled, in order, from ``id``, ``domain``,
    ``difficulty``, ``risk``, the space-joined message contents, the
    JSON-encoded ``expected_tool_calls``, the JSON-encoded ``validation``
    and the space-joined ``tags``. Falsy parts are dropped; the rest are
    stringified and joined with ``"\\n"``.

    Message contents and tags are normalised before joining so that
    ``str.join`` never sees a non-string:

    * ``content`` that is ``None`` contributes an empty string;
    * ``content`` that is not a string is JSON-encoded;
    * ``messages`` / ``tags`` that are ``null`` are treated as empty;
    * non-string tags are passed through ``str``.

    Args:
        item: One decoded tool-call record.

    Returns:
        The concatenated text for the record.
    """
    contents = []
    for message in item.get("messages") or []:
        content = message.get("content", "")
        if content is None:
            # Keep the slot (as "") so spacing matches a message whose
            # content is an empty string.
            content = ""
        elif not isinstance(content, str):
            content = json.dumps(content, ensure_ascii=False)
        contents.append(content)
    messages = " ".join(contents)

    parts = [
        item.get("id", ""),
        item.get("domain", ""),
        item.get("difficulty", ""),
        item.get("risk", ""),
        messages,
        json.dumps(item.get("expected_tool_calls", []), ensure_ascii=False),
        json.dumps(item.get("validation", {}), ensure_ascii=False),
        " ".join(str(tag) for tag in item.get("tags") or []),
    ]
    return "\n".join(str(p) for p in parts if p)
def build_records() -> list[dict[str, Any]]:
    """Load both gold datasets and normalise them into retrieval records.

    Reads ``distill/jsonl/apexdistill_gold_10000d.jsonl`` and
    ``toolcall/jsonl/toolcall_gold.jsonl`` below ``DATA_ROOT``. A missing
    file contributes no records. Apexdistill rows come first, then
    tool-call rows, each in file order.

    Every record has the keys ``id``, ``kind``, ``domain``, ``task``,
    ``text``, ``answer``, ``tool_calls`` and ``quality``.

    Returns:
        The combined list of records (possibly empty).

    Raises:
        KeyError: If a source row has no ``id`` field.
    """
    records: list[dict[str, Any]] = []
    apex_path = DATA_ROOT / "distill" / "jsonl" / "apexdistill_gold_10000d.jsonl"
    tool_path = DATA_ROOT / "toolcall" / "jsonl" / "toolcall_gold.jsonl"

    # read_jsonl is a generator function: its result is always truthy and
    # simply yields nothing for a missing file, so no "or []" fallback is
    # needed around it.
    for item in read_jsonl(apex_path):
        synthesis = item.get("synthesis")
        if not isinstance(synthesis, dict):
            # An explicit null (or non-object) must not break the nested
            # lookups below.
            synthesis = {}
        records.append(
            {
                "id": item["id"],
                "kind": "apexdistill",
                "domain": item.get("domain", ""),
                "task": item.get("task", ""),
                "text": text_from_apex(item),
                "answer": synthesis.get("final_answer", ""),
                "tool_calls": synthesis.get("recommended_tool_calls", []),
                "quality": item.get("quality", {}),
            }
        )

    for item in read_jsonl(tool_path):
        # The first user-role message doubles as the task description;
        # "messages": null is treated like an empty conversation.
        user = next(
            (
                m.get("content", "")
                for m in item.get("messages") or []
                if m.get("role") == "user"
            ),
            "",
        )
        records.append(
            {
                "id": item["id"],
                "kind": "toolcall_gold",
                "domain": item.get("domain", ""),
                "task": user,
                "text": text_from_toolcall(item),
                "answer": "Use the expected structured tool call exactly as specified by the schema.",
                "tool_calls": item.get("expected_tool_calls", []),
                "quality": {"risk": item.get("risk"), "difficulty": item.get("difficulty")},
            }
        )
    return records
def main() -> int:
    """Build the TF-IDF nearest-neighbour artifacts and write them to disk.

    Parses ``--root`` (default ``DEFAULT_ROOT``), makes sure
    ``<root>/artifacts`` exists, collects the records via
    ``build_records()``, fits a character n-gram ``TfidfVectorizer`` and a
    cosine ``NearestNeighbors`` index on the record texts, and then writes
    three files into the artifact directory:

    * ``tinymind_apex_model.joblib`` - dict with the vectorizer, the fitted
      index and the TF-IDF matrix;
    * ``records.jsonl`` - one JSON record per line;
    * ``manifest.json`` - metadata including the SHA-256 of
      ``records.jsonl``.

    The manifest is also printed to stdout.

    Returns:
        ``0`` on success.

    Raises:
        SystemExit: If no records were found.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--root", default=str(DEFAULT_ROOT))
    options = cli.parse_args()

    # The artifact directory is created before the datasets are read, so it
    # exists even when the run aborts for lack of records.
    artifact_dir = Path(options.root) / "artifacts"
    artifact_dir.mkdir(parents=True, exist_ok=True)
    model_path = artifact_dir / "tinymind_apex_model.joblib"
    records_path = artifact_dir / "records.jsonl"
    manifest_path = artifact_dir / "manifest.json"

    records = build_records()
    if not records:
        raise SystemExit("No records found. Build datasets first.")

    tfidf_options = {
        "analyzer": "char_wb",
        "ngram_range": (3, 6),
        "min_df": 1,
        "max_features": 250_000,
        "sublinear_tf": True,
        "norm": "l2",
    }
    vectorizer = TfidfVectorizer(**tfidf_options)
    matrix = vectorizer.fit_transform([record["text"] for record in records])

    # Never ask for more neighbours than there are records.
    neighbor_count = min(12, len(records))
    nn = NearestNeighbors(n_neighbors=neighbor_count, metric="cosine")
    nn.fit(matrix)
    bundle = {"vectorizer": vectorizer, "nn": nn, "matrix": matrix}
    joblib.dump(bundle, model_path)

    with records_path.open("w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in records
        )
    # Hash the bytes actually on disk so the digest matches the file exactly.
    digest = hashlib.sha256(records_path.read_bytes()).hexdigest()

    manifest = {
        "name": "tinymind-apex-runtime",
        "version": "1.0.0",
        "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "records": len(records),
        "sources": {
            "apexdistill_10000d": str(DATA_ROOT / "distill" / "jsonl" / "apexdistill_gold_10000d.jsonl"),
            "toolcall_gold": str(DATA_ROOT / "toolcall" / "jsonl" / "toolcall_gold.jsonl"),
        },
        "model_type": "tfidf_char_ngram_nearest_neighbor_tool_policy",
        "records_sha256": digest,
        "artifact": str(model_path),
        "records_path": str(records_path),
    }
    rendered = json.dumps(manifest, indent=2, ensure_ascii=False)
    manifest_path.write_text(rendered, encoding="utf-8")
    print(rendered)
    return 0


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())

Xet Storage Details

Size:
5.13 kB
·
Xet hash:
1edbc517c0e0c0cee9f472ef1a2a87d3f4c04023e23f8d6d5cea3d690ec714c6

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.