import torch

from src.models.ultrathink import UltraThinkModel, UltraThinkConfig
from src.models.architecture import ModelConfig

def tiny_model():
    """Build a tiny UltraThink model small enough for fast CPU smoke tests."""
    cfg = UltraThinkConfig(
        model_config=ModelConfig(
            vocab_size=256,
            n_positions=64,
            n_embd=64,
            n_layer=2,
            n_head=4,
            n_kv_head=4,
            intermediate_size=128,
            activation="relu",
            dropout=0.0,
            attention_dropout=0.0,
            flash_attention=False,
            gradient_checkpointing=False,
        )
    )
    return UltraThinkModel(cfg)

def test_forward_smoke():
    """Forward pass with labels should return a finite loss."""
    model = tiny_model()
    model.eval()
    input_ids = torch.randint(0, 256, (2, 16))
    attn = torch.ones_like(input_ids)
    with torch.no_grad():
        out = model(input_ids=input_ids, attention_mask=attn, labels=input_ids)
    assert "loss" in out and torch.isfinite(out["loss"])
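
# A minimal companion sketch (assumption, not part of the original suite): a
# backward-pass smoke test that reuses the same tiny model and the forward
# signature exercised above, checking only that the loss backpropagates to
# finite gradients.
def test_backward_smoke():
    model = tiny_model()
    model.train()
    input_ids = torch.randint(0, 256, (2, 16))
    attn = torch.ones_like(input_ids)
    out = model(input_ids=input_ids, attention_mask=attn, labels=input_ids)
    out["loss"].backward()
    grads = [p.grad for p in model.parameters() if p.grad is not None]
    assert grads and all(torch.isfinite(g).all() for g in grads)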