# ARBS / testing / test_200_step_smoke.py
# Uploaded by CLIWorks with huggingface_hub (commit d8bc908, verified).
import torch
import sys
import os
import math
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
def _cuda_available():
if not torch.cuda.is_available():
return False
free, total = torch.cuda.mem_get_info()
if total < 10e9:
return False
return True
def test_200_step_smoke():
    """Run 200 update steps on a byte-level corpus and check the loss stays finite.

    The test prints a SKIP line and returns early when CUDA is unavailable or
    when the device reports less than 7.5e9 bytes of total memory. Otherwise it
    builds an ``ARBModel`` on the GPU, reads ``training/data/tinyshakespeare.txt``
    as raw bytes (the path is relative to the current working directory), and
    for each of 200 steps feeds two single-sequence batches of 64 bytes through
    the model before calling ``model._ternary_update_memory``.

    Raises:
        AssertionError: if any per-micro-batch loss or any recorded step loss
            is non-finite.
        FileNotFoundError: if the corpus file is missing (raised by ``open``).
    """
    if not torch.cuda.is_available():
        print(" SKIP test_200_step_smoke (no CUDA)")
        return
    # Only the total device memory matters for the size gate.
    _, total = torch.cuda.mem_get_info()
    if total < 7.5e9:
        print(f" SKIP test_200_step_smoke (GPU {total/1e9:.1f}GB < 7.5GB)")
        return

    # Imported only after the skip checks, so the skip paths never load the
    # project package.
    from arbitor.main import ARBModel
    from arbitor.kernel.ternary_scale import TScaleType
    from arbitor.config import VOCAB  # noqa: F401 -- not referenced below

    model = ARBModel(
        tscale_type=TScaleType.T32,
        enable_image=False,
        enable_audio=False,
        enable_vq=True,
        enable_graph=True,
        enable_memory_modules=True,
        enable_moe=False,
    ).cuda()

    # Read the corpus inside a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open("training/data/tinyshakespeare.txt", "rb") as corpus:
        raw_bytes = corpus.read()
    data = torch.tensor(list(raw_bytes), dtype=torch.long)
    # First 90% of the byte stream is used for training batches.
    train_data = data[:int(0.9 * data.numel())]

    def get_batch(data, bs, ctx):
        """Sample ``bs`` random windows of ``ctx`` bytes and move them to the GPU."""
        ix = torch.randint(0, data.numel() - ctx - 1, (bs,))
        x = torch.stack([data[i:i+ctx] for i in ix]).cuda()
        # NOTE(review): targets are the input with its first 3 positions
        # dropped (not the usual shift-by-one) -- confirm this offset is what
        # ARBModel expects.
        return x, x[:, 3:].contiguous()

    losses = []
    for step in range(200):
        model.zero_grad(set_to_none=True)
        accum_loss = 0.0
        for _ in range(2):
            x, t = get_batch(train_data, 1, 64)
            _, lc, _, _ = model(x, targets=t)
            # The scaled loss is only checked for finiteness here; no
            # backward() call is made in this test.
            loss = lc.total / 2
            assert torch.isfinite(loss).all(), f"Non-finite loss at step {step}"
            accum_loss += lc.total.item()
        # NOTE(review): the update is applied once per step, after both
        # micro-batches, using the loss components of the last micro-batch --
        # confirm this matches the intended placement.
        model._ternary_update_memory(loss_components=lc)
        # Record the mean unscaled loss over the two micro-batches.
        losses.append(accum_loss / 2)

    assert all(math.isfinite(value) for value in losses), "Non-finite loss detected"
    print(f" PASS test_200_step_smoke: {losses[0]:.2f} -> {losses[-1]:.2f} (min={min(losses):.2f}, max={max(losses):.2f})")