# Commit f907cd1 (verified) by abersbail: "Add improved aber small model Space"
from pathlib import Path
import shutil
import torch
from .config import AberConfig
from .model import AberLanguageModel
from .tokenizer import WordTokenizer
from .trainer import create_model_and_tokenizer, set_seed, train_model
class AberLLMService:
    """Facade that lazily loads, trains, resets, and samples from the aber model.

    The model and tokenizer are held in memory once loaded and are persisted
    to ``config.checkpoint_path`` after every training run.
    """

    def __init__(self, config: "AberConfig"):
        """Store *config* and cap torch's CPU thread count (minimum 1).

        The model and tokenizer are not built here; they are created or
        loaded on the first ``generate``/``train`` call.
        """
        self.config = config
        torch.set_num_threads(max(1, self.config.cpu_threads))
        self.model = None
        self.tokenizer = None

    def generate(self, prompt: str, max_new_tokens: int, temperature: float, top_k: int):
        """Sample a continuation of *prompt* from the model.

        Args:
            prompt: Text to continue. An empty/falsy prompt falls back to a
                built-in greeting prompt.
            max_new_tokens: Forwarded unchanged to ``model.generate``.
            temperature: Forwarded unchanged to ``model.generate``.
            top_k: Forwarded unchanged to ``model.generate``.

        Returns:
            A ``(text, status)`` tuple: the decoded output sequence and a
            one-line human-readable description of the model.
        """
        clean_prompt = prompt or "User: hello\naber:"
        self._ensure_ready()

        encoded = self.tokenizer.encode(clean_prompt, add_bos=True)
        # Add a leading batch dimension of size 1.
        idx = torch.tensor(encoded, dtype=torch.long).unsqueeze(0)

        self.model.eval()
        with torch.inference_mode():
            output = self.model.generate(
                idx=idx,
                max_new_tokens=max_new_tokens,
                eos_id=self.tokenizer.eos_id,
                temperature=temperature,
                top_k=top_k,
            )

        text = self.tokenizer.decode(output[0].tolist())
        status = (
            "Generated with aber. "
            f"Architecture=word-level GRU, Vocab={self.tokenizer.vocab_size}, Hidden={self.config.hidden_dim}."
        )
        return text, status

    def train(self, extra_text: str, steps: int) -> str:
        """Train for *steps* steps (at least 1) and save a checkpoint.

        A fresh model and tokenizer are always built from *extra_text*. When
        a checkpoint already exists and the fresh tokenizer's ``stoi`` mapping
        equals the checkpointed one, the checkpointed weights are copied in so
        training continues from them; otherwise training starts from the fresh
        weights.

        Args:
            extra_text: Additional training text; ``None``/empty is treated
                as an empty string.
            steps: Requested number of training steps, clamped to >= 1.

        Returns:
            A multi-line summary with the step count, the first and last
            loss values, and the checkpoint path.
        """
        steps = max(1, steps)
        checkpoint_exists = self.config.checkpoint_path.exists()
        training_text = extra_text or ""

        if checkpoint_exists:
            # Refresh the in-memory model/tokenizer from disk so the
            # warm-start below uses the persisted weights.
            self._load_or_initialize(extra_text="")

        model, tokenizer, encoded = create_model_and_tokenizer(self.config, training_text)

        if checkpoint_exists and self.model is not None and self.tokenizer is not None:
            # Weights are only reusable when the vocabulary is unchanged.
            if tokenizer.stoi == self.tokenizer.stoi:
                model.load_state_dict(self.model.state_dict())

        losses = train_model(model, encoded, self.config, steps)
        self.model = model
        self.tokenizer = tokenizer
        self._save_checkpoint(extra_text=training_text)

        return (
            "aber training finished.\n"
            f"Steps: {steps}\n"
            f"Start Loss: {losses[0]:.4f}\n"
            f"End Loss: {losses[-1]:.4f}\n"
            f"Checkpoint: {self.config.checkpoint_path}"
        )

    def reset(self) -> str:
        """Delete the persisted checkpoint and drop the in-memory model.

        The checkpoint's parent directory is removed recursively when it is a
        dedicated directory. If that directory is the current working
        directory or one of its ancestors (e.g. the checkpoint path is a bare
        file name), only the checkpoint file itself is deleted so unrelated
        files are never wiped.

        Returns:
            A confirmation message.
        """
        checkpoint = self.config.checkpoint_path
        checkpoint_dir = checkpoint.parent

        if self._is_dedicated_directory(checkpoint_dir):
            if checkpoint_dir.exists():
                shutil.rmtree(checkpoint_dir)
        elif checkpoint.exists():
            checkpoint.unlink()

        self.model = None
        self.tokenizer = None
        return "aber reset complete. Next train or generate call will rebuild the model from scratch."

    @staticmethod
    def _is_dedicated_directory(directory) -> bool:
        """Return True unless *directory* is the cwd or an ancestor of it."""
        resolved = directory.resolve()
        cwd = Path.cwd().resolve()
        # ``cwd.parents`` includes the filesystem root, so that is refused too.
        return resolved != cwd and resolved not in cwd.parents

    def _ensure_ready(self) -> None:
        """Load or bootstrap the model/tokenizer if either is missing."""
        if self.model is not None and self.tokenizer is not None:
            return
        self._load_or_initialize(extra_text="")

    def _load_or_initialize(self, extra_text: str) -> None:
        """Populate ``self.model`` and ``self.tokenizer``.

        If the checkpoint file exists, both are rebuilt from its stored state
        and the model is switched to eval mode. Otherwise the RNG is seeded, a
        new model/tokenizer pair is created from *extra_text*, trained for
        ``config.bootstrap_steps`` steps, and saved.
        """
        checkpoint = self.config.checkpoint_path
        if checkpoint.exists():
            # NOTE(review): torch.load unpickles the file; only load
            # checkpoints written by this service. Passing weights_only=True
            # may be possible if the tokenizer state is plain containers --
            # confirm before changing.
            state = torch.load(checkpoint, map_location="cpu")
            saved_config = state["config"]
            self.tokenizer = WordTokenizer.from_state_dict(state["tokenizer"])
            # Rebuild with the hyperparameters stored in the checkpoint, not
            # the current config, so the saved weights match the layer shapes.
            self.model = AberLanguageModel(
                vocab_size=saved_config["vocab_size"],
                embed_dim=saved_config["embed_dim"],
                hidden_dim=saved_config["hidden_dim"],
                num_layers=saved_config["num_layers"],
                dropout=saved_config["dropout"],
            )
            self.model.load_state_dict(state["model"])
            self.model.eval()
            return

        set_seed(self.config.seed)
        self.model, self.tokenizer, encoded = create_model_and_tokenizer(self.config, extra_text)
        train_model(self.model, encoded, self.config, self.config.bootstrap_steps)
        self._save_checkpoint(extra_text=extra_text)

    def _save_checkpoint(self, extra_text: str) -> None:
        """Write model weights, tokenizer state, and hyperparameters to disk.

        The checkpoint's parent directory is created if needed. *extra_text*
        is stored alongside the hyperparameters under the ``"config"`` key.
        """
        checkpoint = self.config.checkpoint_path
        checkpoint.parent.mkdir(parents=True, exist_ok=True)
        torch.save(
            {
                "model": self.model.state_dict(),
                "tokenizer": self.tokenizer.state_dict(),
                "config": {
                    "vocab_size": self.tokenizer.vocab_size,
                    "embed_dim": self.config.embed_dim,
                    "hidden_dim": self.config.hidden_dim,
                    "num_layers": self.config.num_layers,
                    "dropout": self.config.dropout,
                    "seq_len": self.config.seq_len,
                    "extra_text": extra_text,
                },
            },
            checkpoint,
        )