cy0307's picture
Publish Xperience-10M minimal and neural task baseline cards
d0b3765 verified
#!/usr/bin/env python3
"""Small PyTorch heads used by the episode task suite neural baseline."""
from __future__ import annotations
from dataclasses import dataclass
import numpy as np
@dataclass
class NeuralConfig:
    """Hyper-parameters shared by the neural baseline training routines."""

    # Number of passes over the training indices.
    epochs: int
    # Step size handed to the AdamW optimizer.
    learning_rate: float
    # Weight-decay coefficient handed to the AdamW optimizer.
    weight_decay: float
    # Width of the two hidden layers of the MLP.
    hidden_dim: int
    # Number of rows per optimisation step.
    batch_size: int
    # Dropout probability applied after each hidden activation.
    dropout: float
    # Device spec: "auto", or a string understood by torch.device.
    device: str
    # Seed for torch's global RNG and for the per-epoch shuffles.
    seed: int
def _import_torch():
    """Import PyTorch lazily and return ``(torch, torch.nn, torch.nn.functional)``.

    The import happens inside the function so that this module can be loaded
    without PyTorch installed.

    Raises:
        RuntimeError: if PyTorch cannot be imported; the original
            ``ImportError`` is chained as the cause.
    """
    try:
        import torch
        from torch import nn
        from torch.nn import functional as F
    except ImportError as exc:
        message = (
            "PyTorch is required for --include-neural. "
            "Install requirements-omni.txt or add torch to the environment."
        )
        raise RuntimeError(message) from exc
    return torch, nn, F
def _resolve_device(torch, device_spec: str):
    """Turn a device spec into a ``torch.device``, falling back to CPU.

    Args:
        torch: the imported ``torch`` module.
        device_spec: ``"auto"`` to pick CUDA when available and CPU
            otherwise, or any spec accepted by ``torch.device``
            (for example ``"cpu"``, ``"cuda"``, ``"cuda:0"``).

    Returns:
        The resolved ``torch.device``. Any CUDA request, with or without a
        device index, resolves to CPU when CUDA is not available.
    """
    cuda_available = torch.cuda.is_available()
    if device_spec == "auto":
        return torch.device("cuda" if cuda_available else "cpu")

    requested = torch.device(device_spec)
    # Compare the parsed device type rather than the raw string so that
    # indexed specs such as "cuda:0" get the same CPU fallback as "cuda".
    if requested.type == "cuda" and not cuda_available:
        return torch.device("cpu")
    return requested
def _standardize(X: np.ndarray, train_idx: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Z-score the columns of ``X`` using statistics of the training rows only.

    Args:
        X: array to scale; statistics are taken along axis 0.
        train_idx: index selecting the rows used to estimate mean and std.

    Returns:
        ``(scaled, mean, std)``, all float32. Columns whose training std is
        below ``1e-6`` use a std of 1 so they are centred but not rescaled.
    """
    train_rows = X[train_idx]
    mean = train_rows.mean(axis=0).astype(np.float32)
    raw_std = train_rows.std(axis=0).astype(np.float32)
    # Guard against division by (near) zero for constant columns.
    nearly_constant = raw_std < 1e-6
    std = np.where(nearly_constant, 1.0, raw_std).astype(np.float32)
    scaled = (X - mean) / std
    return scaled.astype(np.float32), mean, std
def _batch_indices(torch, train_idx: np.ndarray, batch_size: int, seed: int, epoch: int):
    """Return the training indices in a shuffled order for one epoch.

    The shuffle uses a private generator seeded with ``seed + epoch``, so the
    order is reproducible per epoch and does not consume torch's global RNG.

    Args:
        torch: the imported ``torch`` module.
        train_idx: indices to shuffle.
        batch_size: not used here; the caller slices the result into batches.
        seed: base seed.
        epoch: epoch number, added to ``seed``.

    Returns:
        An int64 tensor holding a permutation of ``train_idx``.
    """
    rng = torch.Generator()
    rng.manual_seed(seed + epoch)
    pool = torch.from_numpy(train_idx.astype(np.int64))
    shuffled_positions = torch.randperm(len(pool), generator=rng)
    return pool[shuffled_positions]
def _history_epoch(epoch: int, epochs: int) -> bool:
    """Decide whether metrics for ``epoch`` should be recorded in the history.

    The first and last epochs are always recorded; in between, every
    ``max(1, epochs // 5)``-th epoch is recorded.
    """
    if epoch in (1, epochs):
        return True
    stride = max(1, epochs // 5)
    return epoch % stride == 0
def _make_mlp(nn, input_dim: int, output_dim: int, hidden_dim: int, dropout: float):
    """Build the MLP head used by all neural baselines.

    Layout: LayerNorm on the input, two hidden blocks of
    ``Linear -> GELU -> Dropout`` with ``hidden_dim`` units each, then a final
    ``Linear`` projecting to ``output_dim``.

    Args:
        nn: the imported ``torch.nn`` module.
        input_dim: number of input features.
        output_dim: number of outputs.
        hidden_dim: width of both hidden layers.
        dropout: dropout probability for both hidden blocks.

    Returns:
        An ``nn.Sequential`` holding the eight layers in order.
    """
    layers = [nn.LayerNorm(input_dim)]
    width = input_dim
    # Two identical hidden blocks; only the first one sees the raw input width.
    for _ in range(2):
        layers.extend([nn.Linear(width, hidden_dim), nn.GELU(), nn.Dropout(dropout)])
        width = hidden_dim
    layers.append(nn.Linear(width, output_dim))
    return nn.Sequential(*layers)
def train_classifier(
    X: np.ndarray,
    y: np.ndarray,
    train_idx: np.ndarray,
    test_idx: np.ndarray,
    n_classes: int,
    config: NeuralConfig,
    use_class_weights: bool = True,
) -> dict:
    """Train an MLP classifier on the training rows and score the test rows.

    Features are standardised with training-row statistics, the model is
    optimised with AdamW on a (optionally class-weighted) cross-entropy loss,
    and softmax probabilities are produced for ``test_idx``.

    Args:
        X: feature matrix; ``X.shape[1]`` is the model input width.
        y: class labels, one per row of ``X``; cast to int64 before use.
        train_idx: row indices used for fitting.
        test_idx: row indices used for prediction.
        n_classes: number of output classes.
        config: training hyper-parameters.
        use_class_weights: weight the loss by inverse class frequency
            computed on the training rows.

    Returns:
        A dict with ``pred`` (int64 argmax labels for the test rows), ``prob``
        (float32 softmax probabilities), ``history`` (loss / train accuracy at
        the reported epochs), ``mean`` and ``std`` (feature scaling
        statistics), ``state_dict`` (model weights on CPU) and ``device``.

    Raises:
        RuntimeError: if PyTorch is not installed.
    """
    torch, nn, F = _import_torch()
    device = _resolve_device(torch, config.device)
    torch.manual_seed(config.seed)
    Xs, mean, std = _standardize(X.astype(np.float32), train_idx)
    # Cast the labels once and reuse the result everywhere: np.bincount
    # rejects float arrays, so counting on the raw labels would fail for
    # inputs that the training targets themselves accept.
    labels = y.astype(np.int64)
    x_tensor = torch.from_numpy(Xs)
    y_tensor = torch.from_numpy(labels)
    model = _make_mlp(nn, X.shape[1], n_classes, config.hidden_dim, config.dropout).to(device)
    class_weights = None
    if use_class_weights:
        counts = np.bincount(labels[train_idx], minlength=n_classes).astype(np.float32)
        # Inverse-frequency weights; classes with no training rows are
        # treated as having one row to avoid dividing by zero.
        weights = counts.sum() / np.maximum(counts, 1.0) / max(n_classes, 1)
        # Rescale so the weights average to one.
        weights = weights / max(float(weights.mean()), 1e-6)
        class_weights = torch.from_numpy(weights.astype(np.float32)).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)
    history = []
    for epoch in range(1, config.epochs + 1):
        model.train()
        perm = _batch_indices(torch, train_idx, config.batch_size, config.seed, epoch)
        total_loss = 0.0
        total_correct = 0
        total_seen = 0
        for start in range(0, len(perm), config.batch_size):
            idx = perm[start : start + config.batch_size]
            xb = x_tensor[idx].to(device)
            yb = y_tensor[idx].to(device)
            logits = model(xb)
            loss = F.cross_entropy(logits, yb, weight=class_weights)
            opt.zero_grad(set_to_none=True)
            loss.backward()
            opt.step()
            # The loss is a batch average; weight it by batch size so the
            # epoch figure is a per-row average.
            total_loss += float(loss.detach().cpu()) * len(idx)
            total_correct += int((logits.argmax(dim=1) == yb).sum().detach().cpu())
            total_seen += len(idx)
        if _history_epoch(epoch, config.epochs):
            history.append({
                "epoch": epoch,
                "loss": total_loss / max(total_seen, 1),
                "train_accuracy": total_correct / max(total_seen, 1),
            })
    model.eval()
    with torch.no_grad():
        logits = model(x_tensor[test_idx].to(device))
        probs = F.softmax(logits, dim=1).cpu().numpy().astype(np.float32)
    return {
        "pred": np.argmax(probs, axis=1).astype(np.int64),
        "prob": probs,
        "history": history,
        "mean": mean,
        "std": std,
        "state_dict": {k: v.detach().cpu() for k, v in model.state_dict().items()},
        "device": str(device),
    }
def train_multilabel(
    X: np.ndarray,
    Y: np.ndarray,
    train_idx: np.ndarray,
    test_idx: np.ndarray,
    config: NeuralConfig,
) -> dict:
    """Train an MLP with one sigmoid output per label and score the test rows.

    Features are standardised with training-row statistics. The loss is binary
    cross-entropy on logits with a per-label positive weight equal to the
    negative/positive ratio on the training rows, clipped to ``[1, 20]``.

    Args:
        X: feature matrix; ``X.shape[1]`` is the model input width.
        Y: label matrix; ``Y.shape[1]`` is the number of labels.
        train_idx: row indices used for fitting.
        test_idx: row indices used for prediction.
        config: training hyper-parameters.

    Returns:
        A dict with ``prob`` (float32 sigmoid outputs for the test rows),
        ``pred`` (``prob >= 0.5`` as float32), ``history`` (loss at the
        reported epochs), ``mean`` and ``std`` (feature scaling statistics),
        ``state_dict`` (model weights on CPU) and ``device``.
    """
    torch, nn, F = _import_torch()
    device = _resolve_device(torch, config.device)
    torch.manual_seed(config.seed)

    features, mean, std = _standardize(X.astype(np.float32), train_idx)
    inputs = torch.from_numpy(features)
    targets = torch.from_numpy(Y.astype(np.float32))

    n_features, n_labels = X.shape[1], Y.shape[1]
    net = _make_mlp(nn, n_features, n_labels, config.hidden_dim, config.dropout).to(device)

    # Up-weight positives of rare labels; labels with no positives are
    # treated as having one so the ratio stays finite.
    positives = Y[train_idx].sum(axis=0).astype(np.float32)
    negatives = len(train_idx) - positives
    ratio = np.clip(negatives / np.maximum(positives, 1.0), 1.0, 20.0)
    pos_weight = torch.from_numpy(ratio.astype(np.float32)).to(device)

    optimizer = torch.optim.AdamW(
        net.parameters(),
        lr=config.learning_rate,
        weight_decay=config.weight_decay,
    )

    history = []
    for epoch in range(1, config.epochs + 1):
        net.train()
        order = _batch_indices(torch, train_idx, config.batch_size, config.seed, epoch)
        loss_sum, rows_seen = 0.0, 0
        for lo in range(0, len(order), config.batch_size):
            rows = order[lo : lo + config.batch_size]
            batch_x = inputs[rows].to(device)
            batch_y = targets[rows].to(device)

            optimizer.zero_grad(set_to_none=True)
            loss = F.binary_cross_entropy_with_logits(net(batch_x), batch_y, pos_weight=pos_weight)
            loss.backward()
            optimizer.step()

            n_rows = len(rows)
            # Batch-average loss scaled back up so the epoch value is per row.
            loss_sum += loss.item() * n_rows
            rows_seen += n_rows
        if _history_epoch(epoch, config.epochs):
            history.append({"epoch": epoch, "loss": loss_sum / max(rows_seen, 1)})

    net.eval()
    with torch.no_grad():
        test_logits = net(inputs[test_idx].to(device))
        prob = torch.sigmoid(test_logits).cpu().numpy().astype(np.float32)

    hard_labels = (prob >= 0.5).astype(np.float32)
    cpu_weights = {name: tensor.detach().cpu() for name, tensor in net.state_dict().items()}
    return {
        "prob": prob,
        "pred": hard_labels,
        "history": history,
        "mean": mean,
        "std": std,
        "state_dict": cpu_weights,
        "device": str(device),
    }
def train_regressor(
    X: np.ndarray,
    Y: np.ndarray,
    train_idx: np.ndarray,
    test_idx: np.ndarray,
    config: NeuralConfig,
) -> dict:
    """Train an MLP regressor on standardised targets and predict the test rows.

    Both features and targets are z-scored with training-row statistics. The
    model is fitted with AdamW on mean-squared error in the scaled target
    space, and test predictions are mapped back to the original target scale.

    Args:
        X: feature matrix; ``X.shape[1]`` is the model input width.
        Y: target matrix; ``Y.shape[1]`` is the number of outputs.
        train_idx: row indices used for fitting.
        test_idx: row indices used for prediction.
        config: training hyper-parameters.

    Returns:
        A dict with ``pred`` (float32 predictions in the original target
        scale), ``history`` (scaled-space loss at the reported epochs),
        ``x_mean``/``x_std`` and ``y_mean``/``y_std`` (scaling statistics),
        ``state_dict`` (model weights on CPU) and ``device``.
    """
    torch, nn, F = _import_torch()
    device = _resolve_device(torch, config.device)
    torch.manual_seed(config.seed)

    features, x_mean, x_std = _standardize(X.astype(np.float32), train_idx)
    # Targets get the same train-only z-scoring as the features.
    scaled_targets, y_mean, y_std = _standardize(Y, train_idx)
    inputs = torch.from_numpy(features)
    targets = torch.from_numpy(scaled_targets)

    n_features, n_outputs = X.shape[1], Y.shape[1]
    net = _make_mlp(nn, n_features, n_outputs, config.hidden_dim, config.dropout).to(device)
    optimizer = torch.optim.AdamW(
        net.parameters(),
        lr=config.learning_rate,
        weight_decay=config.weight_decay,
    )

    history = []
    for epoch in range(1, config.epochs + 1):
        net.train()
        order = _batch_indices(torch, train_idx, config.batch_size, config.seed, epoch)
        loss_sum, rows_seen = 0.0, 0
        for lo in range(0, len(order), config.batch_size):
            rows = order[lo : lo + config.batch_size]
            batch_x = inputs[rows].to(device)
            batch_y = targets[rows].to(device)

            optimizer.zero_grad(set_to_none=True)
            loss = F.mse_loss(net(batch_x), batch_y)
            loss.backward()
            optimizer.step()

            n_rows = len(rows)
            # Batch-average loss scaled back up so the epoch value is per row.
            loss_sum += loss.item() * n_rows
            rows_seen += n_rows
        if _history_epoch(epoch, config.epochs):
            history.append({"epoch": epoch, "loss": loss_sum / max(rows_seen, 1)})

    net.eval()
    with torch.no_grad():
        scaled_pred = net(inputs[test_idx].to(device)).cpu().numpy().astype(np.float32)

    # Undo the target scaling so predictions are in the original units.
    restored = scaled_pred * y_std + y_mean
    cpu_weights = {name: tensor.detach().cpu() for name, tensor in net.state_dict().items()}
    return {
        "pred": restored.astype(np.float32),
        "history": history,
        "x_mean": x_mean,
        "x_std": x_std,
        "y_mean": y_mean,
        "y_std": y_std,
        "state_dict": cpu_weights,
        "device": str(device),
    }
def save_torch_model(path, payload: dict) -> None:
    """Serialise ``payload`` to ``path`` with ``torch.save``.

    Missing parent directories are created first.

    Args:
        path: destination file, given as a ``pathlib.Path`` or a plain string.
        payload: object to serialise.

    Raises:
        RuntimeError: if PyTorch is not installed.
    """
    from pathlib import Path

    torch, _nn, _F = _import_torch()
    # Coerce to Path so string paths work too; a bare str has no ``.parent``.
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    torch.save(payload, target)