abersbail's picture
Add improved aber small model Space
f907cd1 verified
import random
import torch
from .data import build_training_text
from .model import AberLanguageModel
from .tokenizer import WordTokenizer
def set_seed(seed: int):
    """Seed Python's ``random`` module and PyTorch with the same value.

    Args:
        seed: Integer passed unchanged to ``random.seed`` and then to
            ``torch.manual_seed``.
    """
    # Same order as before: the stdlib generator first, then torch.
    for seeder in (random.seed, torch.manual_seed):
        seeder(seed)
def create_model_and_tokenizer(config, extra_text=""):
    """Build the tokenizer, the encoded training corpus and a fresh model.

    Args:
        config: Object providing ``embed_dim``, ``hidden_dim``,
            ``num_layers`` and ``dropout`` attributes, which are forwarded
            to ``AberLanguageModel``.
        extra_text: Additional text handed to ``build_training_text``.

    Returns:
        A ``(model, tokenizer, encoded)`` tuple, where ``encoded`` is the
        training text encoded with BOS/EOS requested and converted to a
        ``torch.long`` tensor.
    """
    corpus = build_training_text(extra_text)

    # The tokenizer is fitted on the very text it then encodes.
    tokenizer = WordTokenizer().fit(corpus)
    token_ids = tokenizer.encode(corpus, add_bos=True, add_eos=True)
    encoded = torch.tensor(token_ids, dtype=torch.long)

    model_kwargs = {
        "vocab_size": tokenizer.vocab_size,
        "embed_dim": config.embed_dim,
        "hidden_dim": config.hidden_dim,
        "num_layers": config.num_layers,
        "dropout": config.dropout,
    }
    model = AberLanguageModel(**model_kwargs)

    return model, tokenizer, encoded
def build_batch(encoded, seq_len, batch_size):
    """Sample a random batch of next-token training pairs from ``encoded``.

    Each row of ``x`` is a contiguous window of ``encoded`` and the matching
    row of ``y`` is the same window shifted one position to the right.

    Args:
        encoded: Tensor of token ids, sliced along its first dimension.
        seq_len: Requested window length. If ``encoded`` is too short to
            provide ``seq_len + 1`` tokens, the window is clamped to
            ``len(encoded) - 1`` so that ``x`` and ``y`` stay aligned.
        batch_size: Number of windows to sample (with replacement).

    Returns:
        A tuple ``(x, y)`` of tensors, each with ``batch_size`` rows of the
        effective window length.

    Raises:
        ValueError: If no input/target pair can be formed, i.e. ``seq_len``
            is smaller than 1 or ``encoded`` has fewer than 2 tokens.
    """
    total = len(encoded)

    # Clamp the window so short corpora still give equally sized x and y
    # instead of two silently truncated, mismatched slices.
    window = min(seq_len, total - 1)
    if window < 1:
        raise ValueError(
            "build_batch needs seq_len >= 1 and at least 2 encoded tokens "
            f"(got seq_len={seq_len}, len(encoded)={total})"
        )

    # Valid starts are 0 .. total - window - 1 inclusive. torch.randint's
    # upper bound is exclusive, so pass total - window; otherwise the final
    # token could never show up as a target.
    num_starts = total - window
    starts = torch.randint(0, num_starts, (batch_size,)).tolist()

    x = torch.stack([encoded[s : s + window] for s in starts])
    y = torch.stack([encoded[s + 1 : s + window + 1] for s in starts])
    return x, y
def train_model(model, encoded, config, steps):
    """Run ``steps`` AdamW updates on ``model`` and return the loss history.

    Args:
        model: Module called as ``model(xb, targets=yb)``; it must return a
            3-tuple whose last element is the loss tensor.
        encoded: Encoded token sequence handed to ``build_batch``.
        config: Object providing ``learning_rate``, ``seq_len`` and
            ``batch_size`` attributes.
        steps: Number of optimisation steps to perform.

    Returns:
        A list with one Python float per step, in the order computed.
    """
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=config.learning_rate,
    )
    model.train()

    history = []
    for _step in range(steps):
        inputs, targets = build_batch(
            encoded, config.seq_len, config.batch_size
        )
        # Only the third element of the model output (the loss) is used.
        _, _, step_loss = model(inputs, targets=targets)

        # Gradients are cleared after the forward pass and just before
        # backpropagation, as in the original ordering.
        optimizer.zero_grad()
        step_loss.backward()
        optimizer.step()

        history.append(float(step_loss.item()))

    return history