# bbkdevops's picture
# download
# raw
# 7.67 kB
from __future__ import annotations
from dataclasses import asdict, dataclass
from datetime import datetime, timezone
import json
import math
from pathlib import Path
from typing import Any
@dataclass(frozen=True)
class Native8BProfile:
    """One named model/training configuration used by the 8B target report.

    Instances are immutable (``frozen=True``). Field order is significant:
    it defines the positional constructor and the key order produced by
    ``dataclasses.asdict``.
    """

    # --- identity ---------------------------------------------------------
    # Profile identifier; also used as the per-profile output sub-directory
    # in the generated training command.
    name: str

    # --- model shape (forwarded as CLI flags; most feed the size estimate) -
    layers: int
    dim: int
    lanes: int
    seq_len: int
    vocab_size: int
    # Forwarded as --virtual-dim; not read by the parameter estimate.
    virtual_dim: int
    basis_rank: int
    facets: int

    # --- optimisation settings (forwarded as CLI flags) --------------------
    train_batch_size: int
    learning_rate: float
    max_steps: int

    # --- reporting metadata --------------------------------------------------
    # Free-form label; the built-in profiles use "local", "local_ceiling"
    # and "remote_required".
    tier: str
    # Human-readable statement of what the profile is for.
    purpose: str
def _estimate_axiom_regenesis_params(profile: Native8BProfile) -> dict[str, Any]:
d = profile.dim
layers = profile.layers
vocab = profile.vocab_size
lanes = profile.lanes
memory_slots = max(4, min(16, lanes))
memory_rank = max(8, min(64, d // 4))
regen_rank = 4
regen_top_k = 4
top_level = (vocab * d) + (profile.seq_len * d) + (d * vocab) + (d * 4)
axiom = (2 * d * profile.basis_rank) + (profile.facets * profile.basis_rank)
local_exact = 4 * d * d
memory = (
2 * (d * memory_slots * memory_rank + memory_slots * memory_rank)
+ (d * memory_slots + memory_slots)
+ (memory_rank * d)
)
router = (d * lanes + lanes) + (lanes * d)
regen = (
vocab * d
+ 2 * d
+ 2 * (d * regen_rank * d)
+ (2 * d * d)
+ (d * d + d)
)
ffn = (d * (2 * d) + (2 * d)) + ((2 * d) * d + d)
norms = 3 * d
per_layer = axiom + local_exact + memory + router + regen + ffn + norms
self_assess = (d * (2 * d) + (2 * d)) + ((2 * d) * d + d) + (d * 4 + 4)
total = top_level + layers * per_layer + self_assess
int4_gib = total * 0.5 / (1024**3)
bf16_gib = total * 2 / (1024**3)
return {
"estimated_parameters": int(total),
"estimated_parameters_b": total / 1_000_000_000,
"per_layer_parameters": int(per_layer),
"memory_slots": memory_slots,
"memory_rank": memory_rank,
"regen_top_k": regen_top_k,
"regen_rank": regen_rank,
"estimated_weight_gib": {
"int4": int4_gib,
"bf16": bf16_gib,
},
"training_feasibility": {
"rtx_3090_full_train": False,
"rtx_3090_inference_int4_possible_after_runtime_export": int4_gib < 18.0,
"remote_gpu_training_required": total > 1_000_000_000,
},
}
def _profiles() -> list[Native8BProfile]:
    """Return the built-in profiles, ordered from smallest to largest.

    The list always contains, in this order: the small local profile, the
    local-ceiling profile, and the remote-only 8B target profile.
    """
    # Smallest configuration, tier "local".
    local_bridge = Native8BProfile(
        name="local_bridge_3090_proven",
        tier="local",
        purpose="Continue the proven 25.96M native checkpoint until raw language stops breaking.",
        layers=24,
        dim=192,
        lanes=12,
        seq_len=192,
        vocab_size=512,
        virtual_dim=32_768,
        basis_rank=48,
        facets=12,
        train_batch_size=4,
        learning_rate=5e-5,
        max_steps=1_800,
    )

    # Mid-size configuration, tier "local_ceiling".
    local_ceiling = Native8BProfile(
        name="local_ceiling_3090",
        tier="local_ceiling",
        purpose="Find the largest local native capacity that still trains on a 24GB 3090 without fake claims.",
        layers=36,
        dim=512,
        lanes=32,
        seq_len=384,
        vocab_size=512,
        virtual_dim=262_144,
        basis_rank=96,
        facets=24,
        train_batch_size=1,
        learning_rate=3e-5,
        max_steps=800,
    )

    # Largest configuration, tier "remote_required".
    remote_target = Native8BProfile(
        name="axiom_regenesis_8b_target",
        tier="remote_required",
        purpose="Actual 8B-class TinyMind-native target. Train on Colab/HF multi-GPU before any quality claim.",
        layers=48,
        dim=2816,
        lanes=64,
        seq_len=1024,
        vocab_size=4096,
        virtual_dim=1_048_576,
        basis_rank=256,
        facets=64,
        train_batch_size=1,
        learning_rate=1.2e-5,
        max_steps=50_000,
    )

    return [local_bridge, local_ceiling, remote_target]
def _train_command(profile: Native8BProfile, dataset: str, out_dir: str) -> str:
return (
"python -m train.cli native-axiom-regenesis-train "
f"--dataset {dataset} "
f"--out-dir {out_dir}/{profile.name} "
f"--max-steps {profile.max_steps} --eval-records 256 --limit-records 10000 "
f"--dim {profile.dim} --layers {profile.layers} --lanes {profile.lanes} "
f"--seq-len {profile.seq_len} --vocab-size {profile.vocab_size} --tokenizer-mode char_v1 "
f"--virtual-dim {profile.virtual_dim} --basis-rank {profile.basis_rank} --facets {profile.facets} "
f"--learning-rate {profile.learning_rate} --train-batch-size {profile.train_batch_size} --device cuda"
)
def build_native_8b_target_report(
    out_dir: str | Path,
    *,
    dataset: str = "reports/omni_round_curriculum_xl_latest/omni_round_curriculum.jsonl",
) -> dict[str, Any]:
    """Write the native 8B target artifacts to *out_dir* and return the report.

    Three files are written (UTF-8) inside ``out_dir``, which is created if
    missing:

    * ``axiom_regenesis_8b_target_config.json`` - the enriched target profile.
    * ``native_8b_training_commands.ps1`` - one training command per profile.
    * ``native_8b_target_report.json`` - the full report returned here.

    Args:
        out_dir: Directory that receives the artifacts.
        dataset: Dataset path recorded in the report and passed to every
            generated training command.

    Returns:
        The report dict, including a ``json_path`` entry pointing at the
        written report file.

    Raises:
        KeyError: If one of the expected built-in profiles is missing.
    """
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)

    # Enrich every profile with its size estimate and training command.
    enriched: list[dict[str, Any]] = []
    for profile in _profiles():
        payload = asdict(profile)
        payload["estimate"] = _estimate_axiom_regenesis_params(profile)
        payload["train_command"] = _train_command(profile, dataset, "reports/native_8b_target_runs")
        enriched.append(payload)

    by_name = {item["name"]: item for item in enriched}
    target = by_name["axiom_regenesis_8b_target"]

    def shape_label(profile_name: str) -> str:
        # "<layers>x<dim>" taken from the profile itself, so the strategy text
        # cannot drift away from the configuration that is actually emitted.
        entry = by_name[profile_name]
        return f"{entry['layers']}x{entry['dim']}"

    target_config_path = out / "axiom_regenesis_8b_target_config.json"
    target_config_path.write_text(
        json.dumps(target, ensure_ascii=False, indent=2, sort_keys=True) + "\n",
        encoding="utf-8",
    )

    commands_path = out / "native_8b_training_commands.ps1"
    script_lines = [
        "# TinyMind native 8B target commands. Run local profiles first; remote_required needs Colab/HF GPU.",
        *(item["train_command"] for item in enriched),
        "",  # trailing empty entry so the file ends with a newline
    ]
    commands_path.write_text("\n".join(script_lines), encoding="utf-8")

    target_params = target["estimate"]["estimated_parameters_b"]
    # Previously this string hard-coded "48x3456" for the remote target, which
    # contradicted the target profile's real dimensions.
    capacity_expansion = (
        f"{shape_label('local_bridge_3090_proven')} proven -> "
        f"{shape_label('local_ceiling_3090')} local ceiling -> "
        f"{shape_label('axiom_regenesis_8b_target')} 8B remote target"
    )

    report: dict[str, Any] = {
        "schema": "tinymind.native_8b_target.v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "dataset": dataset,
        "strategy": {
            "not_data_only": True,
            "capacity_expansion": capacity_expansion,
            "decoder_objective": "repeat unlikelihood + entropy floor + constrained deterministic generation",
            "memory": "ReGenesis/Evidence Ledger path keeps historical KV from scaling with context",
            "teacher_use": "Codex-like behavior can be distilled from operational traces; no Codex weights are copied.",
        },
        "profiles": enriched,
        "artifacts": {
            "target_config_path": str(target_config_path),
            "commands_path": str(commands_path),
        },
        "claim_gate": {
            "native_8b_target_config_created": True,
            # Only the estimated size is checked; the remaining gates are fixed.
            "estimated_8b_class": bool(7.5 <= target_params <= 8.8),
            "actual_8b_checkpoint_exists": False,
            "rtx3090_full_train_claim_allowed": False,
            "quality_above_larger_models_claim_allowed": False,
            "leaderboard_safe_claim_allowed": False,
            "world_best_claim_allowed": False,
            "reason": "This creates the real 8B target path and commands. Quality claims require completed remote training plus external/raw benchmark evidence.",
        },
    }

    path = out / "native_8b_target_report.json"
    # Record the report's own location before serialising it.
    report["json_path"] = str(path)
    path.write_text(
        json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True) + "\n",
        encoding="utf-8",
    )
    return report

# Xet Storage Details
#
# Size: 7.67 kB
# Xet hash: 021933b07423e5a873c99bbb3c0ad30f988d10079574ef4d65772360dd3f3dfb
#
# Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.