| import torch |
| import sys |
| import os |
| import math |
|
|
# Put the directory two levels above this file at the front of the import
# search path. NOTE(review): presumably this is the repository root, so that
# the ``arbitor`` package imported by the tests resolves without being
# installed -- confirm against the repo layout.
sys.path[:0] = [os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)]
|
|
|
|
def _cuda_available():
    """Report whether a sufficiently large CUDA device is usable.

    Returns:
        bool: ``False`` when ``torch.cuda.is_available()`` is false, or when
        the total memory reported by ``torch.cuda.mem_get_info()`` is below
        ``10e9`` bytes; ``True`` otherwise.
    """
    min_total_bytes = 10e9
    if torch.cuda.is_available():
        # mem_get_info() returns (free, total); only the total is compared.
        _, total_bytes = torch.cuda.mem_get_info()
        return total_bytes >= min_total_bytes
    return False
|
|
|
|
def test_200_step_smoke():
    """Run 200 update steps of ``ARBModel`` and check every loss is finite.

    The test is skipped (a ``SKIP`` line is printed and the function returns
    ``None``) when CUDA is unavailable or the GPU reports less than 7.5e9
    bytes of total memory.

    Otherwise it builds an ``ARBModel`` on the GPU, reads
    ``training/data/tinyshakespeare.txt`` as raw bytes (one token per byte),
    and for each of the 200 steps evaluates two micro-batches of one
    64-token sequence before calling ``model._ternary_update_memory``.

    Raises:
        AssertionError: if any per-micro-batch or per-step loss is not finite.
        OSError: if the corpus file cannot be opened (path is relative to the
            current working directory).
    """
    if not torch.cuda.is_available():
        print(" SKIP test_200_step_smoke (no CUDA)")
        return
    _, total = torch.cuda.mem_get_info()
    if total < 7.5e9:
        print(f" SKIP test_200_step_smoke (GPU {total/1e9:.1f}GB < 7.5GB)")
        return

    # Project imports are deferred so the skip paths above work even when the
    # package (or its CUDA kernels) cannot be imported.
    from arbitor.main import ARBModel
    from arbitor.kernel.ternary_scale import TScaleType
    # Not used below; retained so the import itself is still exercised.
    from arbitor.config import VOCAB  # noqa: F401

    num_steps = 200
    micro_batches = 2
    batch_size = 1
    context_len = 64

    model = ARBModel(
        tscale_type=TScaleType.T32,
        enable_image=False,
        enable_audio=False,
        enable_vq=True,
        enable_graph=True,
        enable_memory_modules=True,
        enable_moe=False,
    ).cuda()

    # Close the corpus file deterministically instead of leaking the handle.
    with open("training/data/tinyshakespeare.txt", "rb") as corpus_file:
        raw_bytes = corpus_file.read()
    # bytearray gives frombuffer a writable buffer; .to(torch.long) copies, so
    # the resulting tensor does not alias the temporary buffer.
    data = torch.frombuffer(bytearray(raw_bytes), dtype=torch.uint8).to(torch.long)
    train_data = data[:int(0.9 * data.numel())]

    def get_batch(data, bs, ctx):
        """Sample ``bs`` random windows of ``ctx`` tokens and move them to the GPU.

        Returns the window batch and the same batch with its first three
        positions dropped as targets.
        """
        ix = torch.randint(0, data.numel() - ctx - 1, (bs,))
        x = torch.stack([data[i:i+ctx] for i in ix]).cuda()
        # NOTE(review): targets are offset by 3, not 1 -- presumably tied to
        # how the model groups bytes; confirm against ARBModel.
        return x, x[:, 3:].contiguous()

    losses = []
    for step in range(num_steps):
        model.zero_grad(set_to_none=True)
        accum_loss = 0.0
        for _ in range(micro_batches):
            x, t = get_batch(train_data, batch_size, context_len)
            _, lc, _, _ = model(x, targets=t)
            # NOTE(review): the scaled loss is only checked for finiteness;
            # no backward() is called here -- presumably the update below
            # works from the loss components directly. Confirm.
            loss = lc.total / micro_batches
            assert torch.isfinite(loss).all(), f"Non-finite loss at step {step}"
            accum_loss += lc.total.item()
        # NOTE(review): applied once per step with the last micro-batch's
        # components -- confirm this matches the intended update schedule.
        model._ternary_update_memory(loss_components=lc)
        losses.append(accum_loss / micro_batches)

    assert all(math.isfinite(step_loss) for step_loss in losses), "Non-finite loss detected"
    print(f" PASS test_200_step_smoke: {losses[0]:.2f} -> {losses[-1]:.2f} (min={min(losses):.2f}, max={max(losses):.2f})")
|
|