Vedisasi
/

UltraThinking-LLM-Training

Model card Files Files and versions

UltraThinking-LLM-Training / tests /smoke_test.py

Vedisasi's picture

Upload folder using huggingface_hub

54c5666 verified 6 months ago

history blame contribute delete

2.44 kB

	import torch

	from src.models.ultrathink import UltraThinkModel, UltraThinkConfig
	from src.models.architecture import ModelConfig


	def build_tiny_ultrathink():
	    """Construct a miniature UltraThinkModel for a quick CPU smoke test.

	    The backbone is a 2-layer, 128-wide configuration with every dropout
	    rate set to zero and flash attention disabled. All optional
	    UltraThink features (DRE, constitutional, MoE, multimodal, RLHF) are
	    switched off.

	    Returns:
	        An ``UltraThinkModel`` instance built from the tiny configuration.
	    """
	    hidden_size = 128
	    num_heads = 8
	    # rotary_dim must match the per-head width (128 // 8 == 16) for RoPE
	    # broadcasting.
	    per_head_width = hidden_size // num_heads

	    shape_kwargs = dict(
	        vocab_size=50257,
	        n_positions=128,
	        n_embd=hidden_size,
	        n_layer=2,
	        n_head=num_heads,
	        n_kv_head=4,
	        rotary_dim=per_head_width,
	        intermediate_size=512,
	    )
	    block_kwargs = dict(
	        activation="swiglu",
	        norm_type="rmsnorm",
	        norm_eps=1e-5,
	    )
	    # Every dropout knob is zeroed.
	    dropout_kwargs = dict.fromkeys(
	        ("dropout", "attention_dropout", "residual_dropout", "embed_dropout"),
	        0.0,
	    )
	    runtime_kwargs = dict(
	        tie_word_embeddings=True,
	        use_cache=True,
	        attention_bias=False,
	        mlp_bias=False,
	        flash_attention=False,  # CPU safe
	        sliding_window=None,
	        gradient_checkpointing=False,
	        max_position_embeddings=128,
	    )
	    backbone = ModelConfig(
	        **shape_kwargs,
	        **block_kwargs,
	        **dropout_kwargs,
	        **runtime_kwargs,
	    )

	    # All optional subsystems are disabled for the smoke test.
	    feature_flags = dict.fromkeys(
	        (
	            "enable_dre",
	            "enable_constitutional",
	            "enable_moe",
	            "enable_multimodal",
	            "enable_rlhf",
	        ),
	        False,
	    )
	    training_kwargs = dict(
	        batch_size=2,
	        gradient_accumulation=1,
	        learning_rate=1e-4,
	        warmup_steps=0,
	        max_steps=10,
	        gradient_checkpointing=False,
	        mixed_precision="fp32",
	        compile_model=False,
	    )
	    sampling_kwargs = dict(
	        max_new_tokens=32,
	        temperature=0.7,
	        top_p=0.9,
	        top_k=20,
	        repetition_penalty=1.0,
	    )
	    tiny_cfg = UltraThinkConfig(
	        model_config=backbone,
	        **feature_flags,
	        **training_kwargs,
	        **sampling_kwargs,
	    )
	    return UltraThinkModel(tiny_cfg)


	def run_forward_pass():
	    """Run one gradient-free forward pass of the tiny model on CPU.

	    Random token ids (drawn from the model's configured vocabulary size)
	    are used as both inputs and labels. On success the loss, if the model
	    returned one, and the logits shape are printed.

	    Raises:
	        RuntimeError: If the model returns a loss that is NaN or infinite.
	    """
	    device = torch.device("cpu")
	    model = build_tiny_ultrathink().to(device)

	    batch_size = 2
	    seq_len = 16
	    vocab_size = model.config.model_config.vocab_size

	    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=device)
	    labels = torch.randint(0, vocab_size, (batch_size, seq_len), device=device)

	    with torch.no_grad():
	        outputs = model(input_ids=input_ids, labels=labels)

	    loss_tensor = outputs["loss"]
	    logits_shape = tuple(outputs["logits"].shape)

	    # A NaN/inf loss means the forward pass is numerically broken. Fail
	    # loudly here instead of reporting success further down.
	    if loss_tensor is not None and not torch.isfinite(loss_tensor).all():
	        raise RuntimeError(
	            f"Smoke test failed: non-finite loss {loss_tensor.item()}"
	        )
	    loss = loss_tensor.item() if loss_tensor is not None else None

	    print("Smoke test OK")
	    print(f"Loss: {loss}")
	    print(f"Logits shape: {logits_shape}")


	# Script entry point: run the smoke test only when executed directly,
	# not when this module is imported.
	if __name__ == "__main__":
	    run_forward_pass()