CLIWorks
/

ARBS

Model card Files Files and versions

Metrics Training metrics Community

ARBS / testing /test_200_step_smoke.py

CLIWorks's picture

Upload folder using huggingface_hub

d8bc908 verified 1 day ago

history blame contribute delete

2.03 kB

	import torch
	import sys
	import os
	import math

	sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))


	def _cuda_available(min_total_bytes: float = 10e9) -> bool:
	    """Report whether a CUDA device with enough total memory is usable.

	    Args:
	        min_total_bytes: Minimum total device memory, in bytes, that the
	            current CUDA device must report. Defaults to 10e9.

	    Returns:
	        True when CUDA is available and the total memory reported by
	        ``torch.cuda.mem_get_info()`` is at least ``min_total_bytes``;
	        False otherwise.
	    """
	    if not torch.cuda.is_available():
	        return False
	    # mem_get_info() yields (free, total); only the total capacity matters
	    # here, so the free figure is deliberately discarded.
	    _, total = torch.cuda.mem_get_info()
	    return total >= min_total_bytes


	def test_200_step_smoke():
	    """Run a 200-step smoke test of ARBModel on byte-level tinyshakespeare.

	    Prints a SKIP line and returns early when CUDA is unavailable or when
	    the current GPU reports less than 7.5e9 bytes of total memory. Otherwise
	    it builds an ARBModel on the GPU, runs 200 steps of two micro-batches
	    each (batch size 1, context 64), and checks that every loss is finite.
	    On success a PASS line with first/last/min/max loss is printed.

	    Raises:
	        AssertionError: If any micro-batch or per-step loss is non-finite.
	        OSError: If the corpus file cannot be opened; the path is relative
	            to the current working directory.
	    """
	    if not torch.cuda.is_available():
	        print(" SKIP test_200_step_smoke (no CUDA)")
	        return
	    # mem_get_info() yields (free, total); only total capacity gates the run.
	    _, total = torch.cuda.mem_get_info()
	    if total < 7.5e9:
	        print(f" SKIP test_200_step_smoke (GPU {total/1e9:.1f}GB < 7.5GB)")
	        return

	    # Project imports are deferred so the skip paths above do not need them.
	    from arbitor.main import ARBModel
	    from arbitor.kernel.ternary_scale import TScaleType

	    num_steps = 200
	    accum_steps = 2
	    batch_size = 1
	    context_len = 64

	    model = ARBModel(
	        tscale_type=TScaleType.T32,
	        enable_image=False,
	        enable_audio=False,
	        enable_vq=True,
	        enable_graph=True,
	        enable_memory_modules=True,
	        enable_moe=False,
	    ).cuda()

	    # Read the corpus as raw bytes; the context manager guarantees the file
	    # handle is closed instead of being left to the garbage collector.
	    with open("training/data/tinyshakespeare.txt", "rb") as corpus:
	        raw = corpus.read()
	    data = torch.tensor(list(raw), dtype=torch.long)
	    train_data = data[:int(0.9 * data.numel())]

	    def get_batch(data, bs, ctx):
	        """Sample ``bs`` random windows of ``ctx`` byte ids and move them to the GPU."""
	        ix = torch.randint(0, data.numel() - ctx - 1, (bs,))
	        x = torch.stack([data[i:i + ctx] for i in ix]).cuda()
	        # NOTE(review): targets are the inputs with the first 3 positions
	        # dropped, not a next-byte shift of the corpus. Presumably the model
	        # expects this offset; confirm against ARBModel's forward contract.
	        return x, x[:, 3:].contiguous()

	    losses = []
	    for step in range(num_steps):
	        model.zero_grad(set_to_none=True)
	        accum_loss = 0.0
	        for _ in range(accum_steps):
	            x, t = get_batch(train_data, batch_size, context_len)
	            _, lc, _, _ = model(x, targets=t)
	            # The scaled loss is only used for the finiteness check below.
	            loss = lc.total / accum_steps
	            assert torch.isfinite(loss).all(), f"Non-finite loss at step {step}"
	            accum_loss += lc.total.item()
	        # NOTE(review): assumed to run once per step, after accumulation,
	        # using the last micro-batch's loss components; confirm the nesting.
	        # No backward()/optimizer call is visible in this test; presumably
	        # this method performs the parameter update; verify.
	        model._ternary_update_memory(loss_components=lc)
	        losses.append(accum_loss / accum_steps)

	    assert all(math.isfinite(l) for l in losses), "Non-finite loss detected"
	    print(f" PASS test_200_step_smoke: {losses[0]:.2f} -> {losses[-1]:.2f} (min={min(losses):.2f}, max={max(losses):.2f})")