bbkdevops's picture
download
raw
5.93 kB
from __future__ import annotations
from datetime import datetime, timezone
import json
import math
from pathlib import Path
from typing import Any
import torch
from torch import nn
class FactorizedVirtualWidthBridge(nn.Module):
    """Residual low-rank bridge that operates entirely at ``physical_dim`` width.

    The input is projected down to ``rank`` features, scaled by a learned
    per-rank gate, squashed with ``tanh``, projected back up to
    ``physical_dim`` and added to the input.  ``virtual_dim`` is kept purely
    as metadata: no layer or tensor created by this module has that width.

    Args:
        physical_dim: Size of the last dimension of the input and output.
        virtual_dim: Symbolic width recorded on the module; unused in compute.
        rank: Width of the low-rank bottleneck.
        lanes: Number of rows in ``lane_gate``; the rows are averaged into a
            single per-rank bias during the forward pass.
    """

    def __init__(self, physical_dim: int, virtual_dim: int, rank: int, lanes: int = 64):
        super().__init__()
        self.physical_dim, self.virtual_dim = int(physical_dim), int(virtual_dim)
        self.rank, self.lanes = int(rank), int(lanes)

        # Registration order is kept stable: it determines both the order in
        # which the random initialisers consume the RNG and the state_dict layout.
        self.down = nn.Linear(self.physical_dim, self.rank, bias=False)
        # Both gates start at zero, so the initial gate value is sigmoid(0) = 0.5.
        self.rank_gate = nn.Parameter(torch.zeros(self.rank))
        self.lane_gate = nn.Parameter(torch.zeros(self.lanes, self.rank))
        self.up = nn.Linear(self.rank, self.physical_dim, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` plus the gated low-rank correction computed from ``x``."""
        projected = self.down(x)
        # Collapse the lane axis so every lane contributes equally to the gate.
        gate_logits = self.rank_gate + self.lane_gate.mean(dim=0)
        gate = torch.sigmoid(gate_logits)
        activated = torch.tanh(projected * gate)
        return x + self.up(activated)

    @property
    def parameter_count(self) -> int:
        """Total number of scalar parameters registered on this module."""
        total = 0
        for tensor in self.parameters():
            total += tensor.numel()
        return total
def _dense_virtual_layer_params(virtual_dim: int) -> int:
    """Estimate the parameter count of one dense layer at the virtual width.

    The budget is the rough ``8 * d * d`` figure the author uses for a
    transformer/PureField dense layer (projections plus FFN).
    """
    width = int(virtual_dim)
    return 8 * width * width
def _factorized_params(physical_dim: int, rank: int, lanes: int) -> int:
    """Count the parameters of one factorized bridge layer.

    The total is the two ``physical_dim x rank`` projection matrices, one
    gate value per rank, and one gate value per (lane, rank) pair.
    """
    width, bottleneck, lane_count = int(physical_dim), int(rank), int(lanes)
    projection_weights = 2 * width * bottleneck
    gate_weights = bottleneck + lane_count * bottleneck
    return projection_weights + gate_weights
def _smoke_candidate(physical_dim: int, virtual_dim: int, rank: int, lanes: int) -> dict[str, Any]:
    """Run one seeded forward/backward pass through a bridge and report its health.

    A ``FactorizedVirtualWidthBridge`` is built from the given sizes and fed a
    random input of shape ``(2, 8, physical_dim)``.  The loss is the mean of
    the squared output, and gradients are checked on both the bridge
    parameters and the input.

    Args:
        physical_dim: Feature width of the random input and of the bridge.
        virtual_dim: Symbolic width forwarded to the bridge constructor.
        rank: Bottleneck width of the bridge.
        lanes: Lane count of the bridge.

    Returns:
        A dict with ``forward_finite``, ``backward_finite``, ``loss``,
        ``output_shape`` and ``bridge_params``.
    """
    # The fixed seed makes the smoke result reproducible.  It is applied inside
    # fork_rng so the caller's CPU random stream is restored afterwards instead
    # of being silently reseeded by every call.  devices=[] limits the
    # save/restore to the CPU generator and avoids touching any accelerator.
    with torch.random.fork_rng(devices=[]):
        torch.manual_seed(20260527)
        bridge = FactorizedVirtualWidthBridge(physical_dim, virtual_dim, rank, lanes)
        x = torch.randn(2, 8, physical_dim, requires_grad=True)

    y = bridge(x)
    loss = y.float().pow(2).mean()
    loss.backward()

    grad_values = [param.grad for param in bridge.parameters() if param.grad is not None]
    # An empty gradient list counts as a failure rather than a vacuous pass.
    grad_finite = bool(grad_values) and all(torch.isfinite(grad).all().item() for grad in grad_values)
    return {
        "forward_finite": bool(torch.isfinite(y).all().item()),
        "backward_finite": grad_finite and bool(torch.isfinite(x.grad).all().item()),
        "loss": float(loss.detach().cpu()),
        "output_shape": list(y.shape),
        "bridge_params": bridge.parameter_count,
    }
def _candidate(virtual_dim: int, physical_dim: int, layers: int, rank: int, lanes: int) -> dict[str, Any]:
    """Assemble the report entry for one (physical_dim, layers, rank) configuration.

    The entry compares the estimated parameter count of a dense stack at the
    virtual width with that of a factorized bridge stack, attaches the result
    of a smoke run, and computes a ranking score.

    Returns:
        A dict holding the configuration, both parameter estimates, their
        ratio, the smoke result and the score.
    """
    per_layer_dense = _dense_virtual_layer_params(virtual_dim)
    per_layer_bridge = _factorized_params(physical_dim, rank, lanes)
    depth = int(layers)
    dense_estimate = per_layer_dense * depth
    bridge_estimate = per_layer_bridge * depth

    # Guard the division: a non-positive bridge estimate is treated as 1.
    divisor = bridge_estimate if bridge_estimate > 1 else 1
    ratio = dense_estimate / divisor

    smoke_result = _smoke_candidate(physical_dim, virtual_dim, rank, lanes)

    # Accumulate left to right so the floating-point result matches a single
    # chained addition of the four log terms.
    ranking = math.log1p(ratio)
    ranking = ranking + math.log1p(physical_dim)
    ranking = ranking + math.log1p(layers)
    ranking = ranking + math.log1p(rank)

    return {
        "virtual_dim": virtual_dim,
        "physical_dim": physical_dim,
        "layers": layers,
        "rank": rank,
        "lanes": lanes,
        "materializes_virtual_activations": False,
        "dense_virtual_params_estimate": dense_estimate,
        "factorized_bridge_params_estimate": bridge_estimate,
        "compression_vs_dense_virtual": ratio,
        "smoke": smoke_result,
        "score": ranking,
    }
def build_native_virtual_width_report(
    out_dir: str | Path,
    *,
    virtual_dim: int = 20_480,
    physical_dims: list[int] | None = None,
    layers: list[int] | None = None,
    ranks: list[int] | None = None,
    lanes: int = 64,
) -> dict[str, Any]:
    """Sweep bridge configurations, rank them, and write a JSON report.

    One candidate is evaluated for every combination of physical width, layer
    count and rank.  The report is written to
    ``<out_dir>/native_virtual_width_report.json`` (the directory is created
    if needed) and also returned.

    Args:
        out_dir: Directory that receives the JSON report.
        virtual_dim: Symbolic virtual width being budgeted; must be positive.
        physical_dims: Physical widths to sweep; ``None`` selects
            ``[512, 768, 1024]``.
        layers: Layer counts to sweep; ``None`` selects ``[6, 12, 24]``.
        ranks: Bottleneck ranks to sweep; ``None`` selects
            ``[64, 96, 128, 192]``.
        lanes: Lane count passed to every candidate.

    Returns:
        The report dict, including ``json_path`` with the written file path.

    Raises:
        ValueError: If ``virtual_dim`` is not positive, or if any of
            ``physical_dims``, ``layers`` or ``ranks`` is given but empty.
    """
    if virtual_dim <= 0:
        raise ValueError("virtual_dim must be positive")

    # Only ``None`` selects the defaults.  Using ``or`` here would silently
    # replace an explicitly empty list with the defaults and make the
    # emptiness check below unreachable.
    physical_values = [512, 768, 1024] if physical_dims is None else list(physical_dims)
    layer_values = [6, 12, 24] if layers is None else list(layers)
    rank_values = [64, 96, 128, 192] if ranks is None else list(ranks)
    if not physical_values or not layer_values or not rank_values:
        raise ValueError("physical_dims, layers, and ranks must not be empty")

    candidates = [
        _candidate(virtual_dim, physical_dim, layer_count, rank, lanes)
        for physical_dim in physical_values
        for layer_count in layer_values
        for rank in rank_values
    ]
    # sorted() is stable even with reverse=True, so ranked[0] is the first
    # candidate with the highest score -- the same pick max() would make.
    ranked = sorted(candidates, key=lambda item: item["score"], reverse=True)
    best = ranked[0]

    report = {
        "schema": "tinymind.native_virtual_width.v1",
        "created_at": datetime.now(timezone.utc).isoformat(),
        "target": {
            "virtual_dim": virtual_dim,
            "method": "factorized_low_rank_virtual_width",
            "materializes_virtual_activations": False,
        },
        "summary": {
            "candidate_count": len(candidates),
            "physical_dims": physical_values,
            "layers": layer_values,
            "ranks": rank_values,
            "lanes": lanes,
        },
        "best_candidate": best,
        "top_candidates": ranked[:8],
        "claim_gate": {
            "virtual_20480_candidate_ready": virtual_dim >= 20_480 and best["smoke"]["forward_finite"] and best["smoke"]["backward_finite"],
            "dense_20480_claim_allowed": False,
            "tier0_claim_allowed": False,
            "world_best_claim_allowed": False,
            "reason": "This proves a small factorized virtual-width bridge, not a dense 20480-wide trained frontier model.",
        },
    }

    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    path = out / "native_virtual_width_report.json"
    # Record the path before serialising so the file on disk contains it too.
    report["json_path"] = str(path)
    path.write_text(json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True) + "\n", encoding="utf-8")
    return report

Xet Storage Details

Size:
5.93 kB
·
Xet hash:
60992a8bc61e0aea12aefd5156e30d0261f541356be84f8ec7404797b9eaebc2

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.