Buckets:
bbkdevops/unicosys-hypergraph-bucket / tinymind-native-8b-remote-handoff /bundle /train /native_micro_train.py
| from __future__ import annotations | |
| from datetime import datetime, timezone | |
| import json | |
| import math | |
| from pathlib import Path | |
| from typing import Any | |
| import torch | |
| from model.architecture import OmegaModel | |
| from model.config import OmegaConfig | |
| def _render(row: dict[str, Any]) -> str: | |
| parts = [] | |
| for message in row.get("messages", []): | |
| role = str(message.get("role", "user")).upper() | |
| content = str(message.get("content", "")).strip() | |
| if content: | |
| parts.append(f"{role}: {content}") | |
| return "\n".join(parts) | |
| def _load_rows(path: str | Path, limit_records: int | None = None) -> list[dict[str, Any]]: | |
| rows: list[dict[str, Any]] = [] | |
| with Path(path).open("r", encoding="utf-8") as handle: | |
| for line in handle: | |
| line = line.strip() | |
| if not line: | |
| continue | |
| rows.append(json.loads(line)) | |
| if limit_records is not None and len(rows) >= limit_records: | |
| break | |
| return rows | |
| def _encode(text: str, seq_len: int, vocab_size: int) -> torch.Tensor: | |
| # Byte-level deterministic tokenizer: compact, self-contained, and enough | |
| # for smoke proof without depending on HF tokenizers. | |
| ids = [1] | |
| ids.extend(4 + (byte % max(1, vocab_size - 4)) for byte in text.encode("utf-8", errors="replace")) | |
| ids.append(2) | |
| ids = ids[:seq_len] | |
| if len(ids) < seq_len: | |
| ids.extend([0] * (seq_len - len(ids))) | |
| return torch.tensor(ids, dtype=torch.long) | |
def _batch(rows: list[dict[str, Any]], seq_len: int, vocab_size: int) -> torch.Tensor:
    """Render and encode each record, then stack them on a new first axis.

    Args:
        rows: Records to convert, one output row per record.
        seq_len: Sequence length forwarded to ``_encode``.
        vocab_size: Vocabulary size forwarded to ``_encode``.

    Returns:
        The per-record id tensors stacked along dimension 0.
    """
    encoded = []
    for record in rows:
        rendered = _render(record)
        encoded.append(_encode(rendered, seq_len, vocab_size))
    return torch.stack(encoded, dim=0)
def _config(dim: int, layers: int, seq_len: int, vocab_size: int) -> OmegaConfig:
    """Build the ``OmegaConfig`` used by the micro-train run.

    Args:
        dim: Model width; head count and memory ranks are derived from it.
        layers: Number of layers; also drives the residual scale and the
            self-assessment frequency.
        seq_len: Maximum sequence length; also caps the local window at 32.
        vocab_size: Vocabulary size, passed through unchanged.

    Returns:
        A configuration combining the derived values below with the fixed
        settings hard-coded in this function.
    """
    # One head per 32 channels, clamped to the range 1..8.
    head_count = min(8, dim // 32)
    if head_count < 1:
        head_count = 1
    per_head = dim // head_count
    rank_count = max(8, dim // 8)
    window = min(32, seq_len)
    alpha = layers ** -0.5
    assess_every = max(1, layers)

    return OmegaConfig(
        vocab_size=vocab_size,
        dim=dim,
        n_layers=layers,
        n_heads=head_count,
        head_dim=per_head,
        ffn_mult=2,
        layer_pattern="P",
        memory_ranks=rank_count,
        local_window=window,
        timescale_count=4,
        low_rank=4,
        residual_alpha=alpha,
        max_seq_len=seq_len,
        dropout=0.0,
        architecture_mode="purefield",
        cnn_core_enabled=True,
        self_assessment_enabled=True,
        self_assessment_frequency=assess_every,
        self_assessment_steps=2,
        regen_kv_enabled=False,
    )
| def _eval_loss(model: OmegaModel, ids: torch.Tensor) -> float: | |
| model.eval() | |
| out = model(ids, labels=ids) | |
| return float(out["loss"].detach().cpu()) | |
def run_native_micro_train(
    out_dir: str | Path,
    *,
    dataset: str | Path,
    max_steps: int = 8,
    eval_records: int = 16,
    limit_records: int | None = None,
    dim: int = 128,
    layers: int = 3,
    seq_len: int = 128,
    vocab_size: int = 512,
    learning_rate: float = 3e-4,
    seed: int = 20260527,
) -> dict[str, Any]:
    """Run a few optimisation steps on a small model and write a JSON report.

    The leading records of the dataset form the evaluation split and the
    rest form the training split. Loss on the evaluation split is measured
    before and after training, one training record per step.

    Args:
        out_dir: Directory for the report; created if missing.
        dataset: Path to the JSONL dataset.
        max_steps: Requested number of optimiser steps; at least one runs.
        eval_records: Upper bound on the evaluation split size, which is
            also capped at a third of the loaded records and is at least 1.
        limit_records: Optional cap on the number of records loaded.
        dim: Model width handed to ``_config``.
        layers: Layer count handed to ``_config``.
        seq_len: Token sequence length for encoding and the model config.
        vocab_size: Vocabulary size for encoding and the model config.
        learning_rate: AdamW learning rate.
        seed: Value passed to ``torch.manual_seed`` before anything else.

    Returns:
        The report dictionary that was also written to
        ``native_micro_train_report.json`` inside ``out_dir``; its
        ``"json_path"`` entry holds that file's path.

    Raises:
        ValueError: If fewer than 4 records are loaded.
    """
    torch.manual_seed(seed)

    report_dir = Path(out_dir)
    report_dir.mkdir(parents=True, exist_ok=True)

    records = _load_rows(dataset, limit_records=limit_records)
    if len(records) < 4:
        raise ValueError("native micro-train requires at least 4 rows")

    # Evaluation takes the head of the file, the remainder is for training.
    holdout = max(1, min(eval_records, len(records) // 3))
    eval_rows, train_rows = records[:holdout], records[holdout:]

    model = OmegaModel(_config(dim, layers, seq_len, vocab_size))
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.0)
    eval_ids = _batch(eval_rows, seq_len, vocab_size)
    train_ids = _batch(train_rows, seq_len, vocab_size)

    loss_before = _eval_loss(model, eval_ids)

    # _eval_loss leaves the model in eval mode, so switch back explicitly.
    model.train()
    planned_steps = max(1, int(max_steps))
    train_count = train_ids.shape[0]
    history: list[float] = []
    for step in range(planned_steps):
        # Cycle through the training rows, one row per step.
        cursor = step % train_count
        example = train_ids[cursor : cursor + 1]
        optimizer.zero_grad(set_to_none=True)
        step_loss = model(example, labels=example)["loss"]
        step_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        history.append(float(step_loss.detach().cpu()))

    loss_after = _eval_loss(model, eval_ids)

    before_finite = math.isfinite(loss_before)
    after_finite = math.isfinite(loss_after)

    summary = {
        "records_loaded": len(records),
        "train_records": len(train_rows),
        "eval_records": len(eval_rows),
        "dim": dim,
        "layers": layers,
        "seq_len": seq_len,
        "vocab_size": vocab_size,
    }
    metrics = {
        "pre_eval_loss": loss_before,
        "post_eval_loss": loss_after,
        "pre_eval_loss_finite": before_finite,
        "post_eval_loss_finite": after_finite,
        "train_loss_first": history[0],
        "train_loss_last": history[-1],
        "train_steps_completed": len(history),
        "post_minus_pre_eval_loss": loss_after - loss_before,
    }
    claim_gate = {
        "native_training_proven": before_finite and after_finite and len(history) == planned_steps,
        "tier0_claim_allowed": False,
        "world_best_claim_allowed": False,
        "reason": "This is a micro-train smoke proof for TinyMind-native learning, not broad capability evidence.",
    }
    report = {
        "schema": "tinymind.native_micro_train.v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "dataset": str(dataset),
        "summary": summary,
        "metrics": metrics,
        "claim_gate": claim_gate,
    }

    report_path = report_dir / "native_micro_train_report.json"
    report["json_path"] = str(report_path)
    serialized = json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True)
    report_path.write_text(serialized + "\n", encoding="utf-8")
    return report
Xet Storage Details
- Size: 5.65 kB
- Xet hash: cc8b8487046e7661d425f042da21a1a81d0bee869bb891d50320e26bc2596507
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.