Spaces:
Sleeping
Sleeping
Add improved aber small model Space
Browse files- README.md +21 -6
- aber_llm/__init__.py +4 -0
- aber_llm/config.py +24 -0
- aber_llm/data.py +87 -0
- aber_llm/model.py +48 -0
- aber_llm/service.py +120 -0
- aber_llm/tokenizer.py +73 -0
- aber_llm/trainer.py +51 -0
- app.py +89 -0
- requirements.txt +2 -0
README.md
CHANGED
|
@@ -1,12 +1,27 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
colorTo: pink
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version: 6.10.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
|
|
|
| 10 |
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: aber Small Model
|
| 3 |
+
colorFrom: green
|
| 4 |
+
colorTo: blue
|
|
|
|
| 5 |
sdk: gradio
|
|
|
|
| 6 |
app_file: app.py
|
| 7 |
pinned: false
|
| 8 |
+
license: mit
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# aber Small Model
|
| 12 |
+
|
| 13 |
+
This is an improved local small language model project written in Python from scratch.
|
| 14 |
+
|
| 15 |
+
## What is different
|
| 16 |
+
|
| 17 |
+
- Model name is `aber`
|
| 18 |
+
- Uses a different architecture than the previous tiny character model
|
| 19 |
+
- Uses a word-level tokenizer
|
| 20 |
+
- Uses a GRU language model for more readable short outputs
|
| 21 |
+
- Trains and runs locally on CPU
|
| 22 |
+
|
| 23 |
+
## Important
|
| 24 |
+
|
| 25 |
+
- No external pretrained LLM is used
|
| 26 |
+
- This is still a small educational model
|
| 27 |
+
- It is designed for lightweight Hugging Face CPU Spaces and local Python use
|
aber_llm/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .config import AberConfig
|
| 2 |
+
from .service import AberLLMService
|
| 3 |
+
|
| 4 |
+
__all__ = ["AberConfig", "AberLLMService"]
|
aber_llm/config.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
@dataclass
class AberConfig:
    """Hyperparameters and on-disk locations used by the aber model."""

    # --- batching ---
    seq_len: int = 40
    batch_size: int = 24

    # --- network shape ---
    embed_dim: int = 96
    hidden_dim: int = 160
    num_layers: int = 2
    dropout: float = 0.15

    # --- optimisation ---
    learning_rate: float = 2.5e-3
    bootstrap_steps: int = 90

    # --- runtime ---
    cpu_threads: int = 4
    seed: int = 42

    @property
    def root_dir(self) -> Path:
        """Return the directory one level above the one holding this module."""
        module_file = Path(__file__).resolve()
        return module_file.parent.parent

    @property
    def checkpoint_path(self) -> Path:
        """Return ``<root_dir>/artifacts/aber_checkpoint.pt``."""
        return self.root_dir.joinpath("artifacts", "aber_checkpoint.pt")
|
aber_llm/data.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
BASE_CORPUS = """
|
| 2 |
+
System: You are aber, a small local language model.
|
| 3 |
+
User: hello
|
| 4 |
+
aber: Hello. I am aber, a tiny local model written in Python.
|
| 5 |
+
|
| 6 |
+
User: what are you
|
| 7 |
+
aber: I am a small language model trained from scratch on local text.
|
| 8 |
+
|
| 9 |
+
User: explain machine learning
|
| 10 |
+
aber: Machine learning builds patterns from data so a model can make predictions or decisions.
|
| 11 |
+
|
| 12 |
+
User: explain deep learning
|
| 13 |
+
aber: Deep learning uses neural networks with many layers to learn useful representations.
|
| 14 |
+
|
| 15 |
+
User: explain natural language processing
|
| 16 |
+
aber: Natural language processing helps computers read, generate, and understand human language.
|
| 17 |
+
|
| 18 |
+
User: what is regression
|
| 19 |
+
aber: Regression predicts a continuous value such as price, score, or demand.
|
| 20 |
+
|
| 21 |
+
User: what is classification
|
| 22 |
+
aber: Classification predicts a category or label for an example.
|
| 23 |
+
|
| 24 |
+
User: what is clustering
|
| 25 |
+
aber: Clustering groups similar data points without labeled targets.
|
| 26 |
+
|
| 27 |
+
User: what is preprocessing
|
| 28 |
+
aber: Preprocessing cleans, encodes, scales, and prepares data before modeling.
|
| 29 |
+
|
| 30 |
+
User: what is evaluation
|
| 31 |
+
aber: Evaluation measures how well a model performs on data it did not train on.
|
| 32 |
+
|
| 33 |
+
User: how do I learn coding
|
| 34 |
+
aber: Practice small projects, read errors carefully, and improve one step at a time.
|
| 35 |
+
|
| 36 |
+
User: give a short greeting
|
| 37 |
+
aber: Hello. I am ready to help with a short answer.
|
| 38 |
+
|
| 39 |
+
User: write a short poem
|
| 40 |
+
aber: Quiet light on morning glass, slow thoughts gather as clouds pass.
|
| 41 |
+
|
| 42 |
+
User: what is Python
|
| 43 |
+
aber: Python is a programming language known for clarity, flexibility, and strong libraries.
|
| 44 |
+
|
| 45 |
+
User: what is a feature
|
| 46 |
+
aber: A feature is an input variable used by a model.
|
| 47 |
+
|
| 48 |
+
User: what is a target
|
| 49 |
+
aber: A target is the value or label a model tries to predict.
|
| 50 |
+
|
| 51 |
+
User: what is overfitting
|
| 52 |
+
aber: Overfitting happens when a model memorizes training data and generalizes poorly.
|
| 53 |
+
|
| 54 |
+
User: what is underfitting
|
| 55 |
+
aber: Underfitting happens when a model is too simple to learn important patterns.
|
| 56 |
+
|
| 57 |
+
User: what is a transformer
|
| 58 |
+
aber: A transformer is a model architecture that uses attention between tokens.
|
| 59 |
+
|
| 60 |
+
User: what is a tokenizer
|
| 61 |
+
aber: A tokenizer converts text into smaller pieces that a model can process.
|
| 62 |
+
|
| 63 |
+
User: what is local ai
|
| 64 |
+
aber: Local AI runs on your own machine so you can control training, files, and execution.
|
| 65 |
+
|
| 66 |
+
User: how should I debug code
|
| 67 |
+
aber: Reproduce the issue, isolate the failing step, inspect values, and test one fix at a time.
|
| 68 |
+
|
| 69 |
+
User: summarize good study habits
|
| 70 |
+
aber: Good study habits use planning, active recall, spaced review, and regular breaks.
|
| 71 |
+
|
| 72 |
+
User: what is data science
|
| 73 |
+
aber: Data science combines programming, statistics, and domain knowledge to learn from data.
|
| 74 |
+
|
| 75 |
+
User: what is a neural network
|
| 76 |
+
aber: A neural network is a layered function that transforms input signals into predictions.
|
| 77 |
+
|
| 78 |
+
User: give motivation
|
| 79 |
+
aber: Small repeated effort beats waiting for perfect motivation.
|
| 80 |
+
""".strip()
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def build_training_text(extra_text: str = "") -> str:
    """Return the base corpus, optionally followed by user-supplied text.

    ``extra_text`` is split on any whitespace and re-joined with single
    spaces, so line breaks and repeated blanks are collapsed. When nothing
    remains after that (``None``, empty or whitespace-only input) the base
    corpus is returned unchanged; otherwise the normalised text is appended
    after a blank line.
    """
    words = (extra_text or "").split()
    if words:
        return BASE_CORPUS + "\n\n" + " ".join(words)
    return BASE_CORPUS
|
aber_llm/model.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from torch import nn
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class AberLanguageModel(nn.Module):
    """Recurrent language model: embedding -> stacked GRU -> dropout -> linear head."""

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, dropout):
        """Build the layers.

        Args:
            vocab_size: Number of token ids; size of the embedding table and
                of the output logits.
            embed_dim: Width of the token embeddings fed to the GRU.
            hidden_dim: Width of the GRU hidden state.
            num_layers: Number of stacked GRU layers.
            dropout: Dropout probability applied to the GRU output and, when
                there is more than one layer, inside the GRU.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.gru = nn.GRU(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            # The GRU's own dropout is only requested for stacked layers.
            dropout=dropout if num_layers > 1 else 0.0,
            batch_first=True,
        )
        self.dropout = nn.Dropout(dropout)
        self.head = nn.Linear(hidden_dim, vocab_size)

    def forward(self, idx, hidden=None, targets=None):
        """Run the model over a batch of token ids.

        Args:
            idx: Long tensor of token ids, batch dimension first.
            hidden: Optional GRU hidden state carried over from a previous
                call; ``None`` starts from the GRU's default state.
            targets: Optional tensor of next-token ids with the same layout
                as ``idx``. When given, the cross-entropy loss is computed.

        Returns:
            ``(logits, hidden, loss)`` where ``loss`` is ``None`` when no
            targets were supplied.
        """
        emb = self.embedding(idx)
        out, hidden = self.gru(emb, hidden)
        out = self.dropout(out)
        logits = self.head(out)

        loss = None
        if targets is not None:
            # Flatten every position into one row for the loss.
            loss = nn.functional.cross_entropy(
                logits.reshape(-1, logits.size(-1)),
                targets.reshape(-1),
            )
        return logits, hidden, loss

    def generate(self, idx, max_new_tokens, eos_id, temperature=1.0, top_k=8):
        """Sample up to ``max_new_tokens`` tokens that continue ``idx``.

        The whole prompt is run through the GRU on the first step so that the
        hidden state reflects every prompt token; afterwards only the newly
        sampled token is fed, reusing the carried hidden state.

        Args:
            idx: Long tensor of prompt token ids, batch dimension first.
            max_new_tokens: Upper bound on the number of sampled tokens.
            eos_id: Token id that ends generation. Generation stops once
                every row of the batch samples it in the same step.
            temperature: Divides the logits before sampling; values below
                ``1e-4`` are clamped to ``1e-4``.
            top_k: When a positive integer, sampling is restricted to the
                ``top_k`` highest logits; ``None`` or ``0`` disables it.

        Returns:
            ``idx`` with the sampled tokens appended along dimension 1.
        """
        hidden = None
        step_input = idx  # first step consumes the full prompt
        scale = max(temperature, 1e-4)

        for _ in range(max_new_tokens):
            logits, hidden, _ = self(step_input, hidden)
            next_logits = logits[:, -1, :] / scale

            if top_k is not None and top_k > 0:
                values, _ = torch.topk(next_logits, min(top_k, next_logits.size(-1)))
                # Everything below the k-th largest logit gets zero probability.
                next_logits[next_logits < values[:, [-1]]] = float("-inf")

            probs = torch.softmax(next_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, next_token], dim=1)
            if bool((next_token == eos_id).all()):
                break
            # The hidden state already summarises the past; feed only the new token.
            step_input = next_token
        return idx
|
aber_llm/service.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
import shutil
|
| 3 |
+
|
| 4 |
+
import torch
|
| 5 |
+
|
| 6 |
+
from .config import AberConfig
|
| 7 |
+
from .model import AberLanguageModel
|
| 8 |
+
from .tokenizer import WordTokenizer
|
| 9 |
+
from .trainer import create_model_and_tokenizer, set_seed, train_model
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class AberLLMService:
    """Owns the aber model and tokenizer and exposes generate/train/reset.

    The model and tokenizer are created lazily: the first call that needs
    them either loads the checkpoint at ``config.checkpoint_path`` or, when
    none exists, builds a fresh model, trains it for
    ``config.bootstrap_steps`` steps and saves it.
    """

    def __init__(self, config: AberConfig):
        """Store ``config`` and set the torch thread count (at least 1)."""
        self.config = config
        torch.set_num_threads(max(1, self.config.cpu_threads))
        self.model = None
        self.tokenizer = None

    def generate(self, prompt: str, max_new_tokens: int, temperature: float, top_k: int):
        """Generate a continuation of ``prompt``.

        An empty or ``None`` prompt falls back to a default greeting prompt.

        Returns:
            ``(text, status)`` where ``text`` is the decoded prompt plus
            continuation and ``status`` is a short description of the model.
        """
        clean_prompt = prompt or "User: hello\naber:"
        self._ensure_ready()
        encoded = self.tokenizer.encode(clean_prompt, add_bos=True)
        # Add a batch dimension of one.
        idx = torch.tensor(encoded, dtype=torch.long).unsqueeze(0)
        self.model.eval()

        with torch.inference_mode():
            output = self.model.generate(
                idx=idx,
                max_new_tokens=max_new_tokens,
                eos_id=self.tokenizer.eos_id,
                temperature=temperature,
                top_k=top_k,
            )

        text = self.tokenizer.decode(output[0].tolist())
        status = (
            f"Generated with aber. "
            f"Architecture=word-level GRU, Vocab={self.tokenizer.vocab_size}, Hidden={self.config.hidden_dim}."
        )
        return text, status

    def train(self, extra_text: str, steps: int):
        """Train for ``steps`` steps (at least 1) and save a checkpoint.

        A new model and tokenizer are always built from the base corpus plus
        ``extra_text``. If a checkpoint exists and its vocabulary is identical
        to the new one, its weights are copied in so training continues;
        otherwise training starts from the freshly initialised weights.

        Returns:
            A multi-line status string with the step count, first and last
            loss and the checkpoint path.
        """
        steps = max(1, steps)
        checkpoint_exists = self.config.checkpoint_path.exists()
        training_text = extra_text or ""

        if checkpoint_exists:
            self._load_or_initialize(extra_text="")

        model, tokenizer, encoded = create_model_and_tokenizer(self.config, training_text)
        if checkpoint_exists and self.model is not None and self.tokenizer is not None:
            # Weights are only reusable when every token keeps the same id.
            if tokenizer.stoi == self.tokenizer.stoi:
                model.load_state_dict(self.model.state_dict())

        losses = train_model(model, encoded, self.config, steps)
        self.model = model
        self.tokenizer = tokenizer
        self._save_checkpoint(extra_text=training_text)

        return (
            f"aber training finished.\n"
            f"Steps: {steps}\n"
            f"Start Loss: {losses[0]:.4f}\n"
            f"End Loss: {losses[-1]:.4f}\n"
            f"Checkpoint: {self.config.checkpoint_path}"
        )

    def reset(self):
        """Delete the checkpoint directory and drop the in-memory model.

        Note that the whole directory containing the checkpoint is removed,
        not just the checkpoint file.
        """
        checkpoint_dir = self.config.checkpoint_path.parent
        if checkpoint_dir.exists():
            shutil.rmtree(checkpoint_dir)
        self.model = None
        self.tokenizer = None
        return "aber reset complete. Next train or generate call will rebuild the model from scratch."

    def _ensure_ready(self):
        """Load or bootstrap the model unless one is already in memory."""
        if self.model is not None and self.tokenizer is not None:
            return
        self._load_or_initialize(extra_text="")

    def _load_or_initialize(self, extra_text: str):
        """Populate ``self.model`` and ``self.tokenizer``.

        When a checkpoint exists it is loaded and ``extra_text`` is ignored.
        Otherwise a model is created from the base corpus plus ``extra_text``,
        trained for ``config.bootstrap_steps`` steps and saved.
        """
        checkpoint = self.config.checkpoint_path
        if checkpoint.exists():
            # The checkpoint holds only tensors and plain containers, so the
            # restricted unpickler is sufficient and avoids executing
            # arbitrary pickled code from a tampered file.
            state = torch.load(checkpoint, map_location="cpu", weights_only=True)
            self.tokenizer = WordTokenizer.from_state_dict(state["tokenizer"])
            # Rebuild with the saved shape, not the current config, so the
            # state dict always fits.
            self.model = AberLanguageModel(
                vocab_size=state["config"]["vocab_size"],
                embed_dim=state["config"]["embed_dim"],
                hidden_dim=state["config"]["hidden_dim"],
                num_layers=state["config"]["num_layers"],
                dropout=state["config"]["dropout"],
            )
            self.model.load_state_dict(state["model"])
            self.model.eval()
            return

        set_seed(self.config.seed)
        self.model, self.tokenizer, encoded = create_model_and_tokenizer(self.config, extra_text)
        train_model(self.model, encoded, self.config, self.config.bootstrap_steps)
        self._save_checkpoint(extra_text=extra_text)

    def _save_checkpoint(self, extra_text: str):
        """Write model weights, tokenizer state and model shape to disk."""
        checkpoint = self.config.checkpoint_path
        checkpoint.parent.mkdir(parents=True, exist_ok=True)
        torch.save(
            {
                "model": self.model.state_dict(),
                "tokenizer": self.tokenizer.state_dict(),
                "config": {
                    "vocab_size": self.tokenizer.vocab_size,
                    "embed_dim": self.config.embed_dim,
                    "hidden_dim": self.config.hidden_dim,
                    "num_layers": self.config.num_layers,
                    "dropout": self.config.dropout,
                    "seq_len": self.config.seq_len,
                    "extra_text": extra_text,
                },
            },
            checkpoint,
        )
|
aber_llm/tokenizer.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
# A token is a newline, a run of word characters/apostrophes, or a single
# punctuation character. ``\w`` is used instead of an ASCII-only class so
# that non-ASCII letters (e.g. accented characters) are kept inside words
# rather than silently dropped; for ASCII text the result is unchanged.
TOKEN_PATTERN = re.compile(r"\n|[\w']+|[^\w\s]")


class WordTokenizer:
    """Word-level tokenizer with four reserved special tokens.

    ``fit`` assigns ids 0-3 to ``<pad>``, ``<unk>``, ``<bos>`` and ``<eos>``
    and the following ids to the sorted set of tokens found in the text.
    """

    def __init__(self):
        self.special_tokens = ["<pad>", "<unk>", "<bos>", "<eos>"]
        self.stoi = {}  # token -> id
        self.itos = {}  # id -> token

    @property
    def pad_id(self):
        """Id of ``<pad>``; raises ``KeyError`` before the vocabulary is set."""
        return self.stoi["<pad>"]

    @property
    def bos_id(self):
        """Id of ``<bos>``; raises ``KeyError`` before the vocabulary is set."""
        return self.stoi["<bos>"]

    @property
    def eos_id(self):
        """Id of ``<eos>``; raises ``KeyError`` before the vocabulary is set."""
        return self.stoi["<eos>"]

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.stoi)

    def tokenize(self, text: str):
        """Split ``text`` into a list of token strings; other whitespace is discarded."""
        return TOKEN_PATTERN.findall(text)

    def fit(self, text: str):
        """Build the vocabulary from ``text`` and return ``self``."""
        vocab = self.special_tokens + sorted(set(self.tokenize(text)))
        self.stoi = {token: idx for idx, token in enumerate(vocab)}
        self.itos = {idx: token for token, idx in self.stoi.items()}
        return self

    def encode(self, text: str, add_bos: bool = False, add_eos: bool = False):
        """Convert ``text`` to a list of ids.

        Tokens missing from the vocabulary map to the ``<unk>`` id.
        ``add_bos`` / ``add_eos`` prepend / append the matching special id.
        """
        unk_id = self.stoi["<unk>"]
        ids = [self.stoi.get(token, unk_id) for token in self.tokenize(text)]
        if add_bos:
            ids = [self.bos_id] + ids
        if add_eos:
            ids = ids + [self.eos_id]
        return ids

    def decode(self, ids):
        """Convert ids back to text.

        Special tokens and unknown ids are skipped. Words are separated by a
        single space, the punctuation marks ``. , ! ? : ;`` attach to the
        preceding word, and newline tokens remove the space before them.
        """
        tokens = []
        for idx in ids:
            token = self.itos.get(int(idx), "<unk>")
            if token in self.special_tokens:
                continue
            tokens.append(token)

        text = ""
        for token in tokens:
            if token == "\n":
                text = text.rstrip() + "\n"
            elif token in {".", ",", "!", "?", ":", ";"}:
                text = text.rstrip() + token + " "
            else:
                text += token + " "
        return text.strip()

    def state_dict(self):
        """Return the serialisable state: the token-to-id mapping."""
        return {"stoi": self.stoi}

    @classmethod
    def from_state_dict(cls, state):
        """Rebuild a tokenizer from the mapping produced by ``state_dict``."""
        tok = cls()
        tok.stoi = dict(state["stoi"])
        tok.itos = {idx: token for token, idx in tok.stoi.items()}
        return tok
|
aber_llm/trainer.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
|
| 5 |
+
from .data import build_training_text
|
| 6 |
+
from .model import AberLanguageModel
|
| 7 |
+
from .tokenizer import WordTokenizer
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def set_seed(seed: int):
    """Seed Python's ``random`` module and then PyTorch with ``seed``."""
    for seeder in (random.seed, torch.manual_seed):
        seeder(seed)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def create_model_and_tokenizer(config, extra_text=""):
    """Build a tokenizer, the encoded corpus and an untrained model.

    The training text is the base corpus plus ``extra_text``. The tokenizer
    is fitted on that text, which is then encoded with ``<bos>``/``<eos>``
    markers. The model is sized from the resulting vocabulary and from
    ``config``.

    Returns:
        ``(model, tokenizer, encoded)`` with ``encoded`` as a 1-D long tensor.
    """
    corpus = build_training_text(extra_text)

    tokenizer = WordTokenizer()
    tokenizer.fit(corpus)
    token_ids = tokenizer.encode(corpus, add_bos=True, add_eos=True)
    encoded = torch.tensor(token_ids, dtype=torch.long)

    shape = {
        "vocab_size": tokenizer.vocab_size,
        "embed_dim": config.embed_dim,
        "hidden_dim": config.hidden_dim,
        "num_layers": config.num_layers,
        "dropout": config.dropout,
    }
    model = AberLanguageModel(**shape)
    return model, tokenizer, encoded
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def build_batch(encoded, seq_len, batch_size):
    """Sample a random batch of input/target windows from ``encoded``.

    Each target row is the matching input row shifted one position to the
    right. Every window that fits is eligible, including the one whose last
    target is the final token of ``encoded``. When ``encoded`` is shorter
    than ``seq_len + 1`` the window is shrunk to ``len(encoded) - 1`` so
    inputs and targets keep the same length.

    Args:
        encoded: 1-D tensor of token ids.
        seq_len: Requested window length (must be at least 1).
        batch_size: Number of windows to sample (with replacement).

    Returns:
        ``(x, y)``, two tensors with ``batch_size`` rows each.

    Raises:
        ValueError: If ``seq_len`` is below 1 or ``encoded`` holds fewer
            than two tokens.
    """
    total = len(encoded)
    if seq_len < 1:
        raise ValueError("seq_len must be at least 1")
    if total < 2:
        raise ValueError("encoded must contain at least two tokens")

    window = min(seq_len, total - 1)
    # A start is valid when start + window + 1 <= total; randint's upper
    # bound is exclusive, so total - window admits the last valid start.
    num_starts = total - window
    starts = torch.randint(0, num_starts, (batch_size,)).tolist()

    x = torch.stack([encoded[start : start + window] for start in starts])
    y = torch.stack([encoded[start + 1 : start + window + 1] for start in starts])
    return x, y
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def train_model(model, encoded, config, steps):
    """Run ``steps`` AdamW updates on random batches and return the losses.

    The model is switched to training mode and left in it. Each step draws a
    fresh batch using ``config.seq_len`` and ``config.batch_size``.

    Returns:
        A list with one float loss per step, in order.
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)
    model.train()

    history = []
    for _step in range(steps):
        inputs, labels = build_batch(encoded, config.seq_len, config.batch_size)
        outputs = model(inputs, targets=labels)
        loss = outputs[2]

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        history.append(float(loss.item()))
    return history
|
app.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
|
| 3 |
+
from aber_llm.config import AberConfig
|
| 4 |
+
from aber_llm.service import AberLLMService
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
config = AberConfig()
|
| 8 |
+
service = AberLLMService(config=config)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def generate_text(prompt, max_new_tokens, temperature, top_k):
    """UI callback: coerce the slider values and delegate to the service."""
    token_budget = int(max_new_tokens)
    sampling_temperature = float(temperature)
    k = int(top_k)
    return service.generate(
        prompt=prompt,
        max_new_tokens=token_budget,
        temperature=sampling_temperature,
        top_k=k,
    )
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def train_model(extra_text, steps):
    """UI callback: train the service for an integer number of steps."""
    step_count = int(steps)
    return service.train(extra_text=extra_text, steps=step_count)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def reset_model():
    """UI callback: reset the service and return its status message."""
    message = service.reset()
    return message
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# Intro text shown at the top of the page.
INTRO_MARKDOWN = """
# aber
An improved small language model written in Python from scratch.

- Model name: `aber`
- No external pretrained LLM
- Word-level tokenizer
- GRU language model
- Local CPU training and generation
"""

app_theme = gr.themes.Soft(primary_hue="green", secondary_hue="blue")

with gr.Blocks(title="aber Small Model", theme=app_theme) as demo:
    gr.Markdown(INTRO_MARKDOWN)

    # --- Generate tab: prompt, sampling controls, output ---
    with gr.Tab("Generate"):
        prompt_input = gr.Textbox(
            label="Prompt",
            value="User: hello\naber:",
            lines=6,
        )
        with gr.Row():
            max_tokens_input = gr.Slider(10, 160, value=72, step=2, label="Max New Tokens")
            temperature_input = gr.Slider(0.2, 1.3, value=0.75, step=0.05, label="Temperature")
            top_k_input = gr.Slider(1, 20, value=8, step=1, label="Top-K")
        generate_button = gr.Button("Generate", variant="primary")
        output_text = gr.Textbox(label="Output", lines=10)
        output_status = gr.Textbox(label="Status", lines=4)

    # --- Train tab: extra corpus, step count, train/reset buttons ---
    with gr.Tab("Train"):
        extra_text_input = gr.Textbox(
            label="Extra Training Text",
            placeholder="Add more local text to train aber on your own data.",
            lines=10,
        )
        steps_input = gr.Slider(10, 400, value=120, step=10, label="Training Steps")
        train_button = gr.Button("Train / Continue Training", variant="primary")
        reset_button = gr.Button("Reset aber")
        train_status = gr.Textbox(label="Training Status", lines=6)

    # --- Event wiring ---
    generate_button.click(
        fn=generate_text,
        inputs=[prompt_input, max_tokens_input, temperature_input, top_k_input],
        outputs=[output_text, output_status],
    )

    train_button.click(
        fn=train_model,
        inputs=[extra_text_input, steps_input],
        outputs=[train_status],
    )

    # Reset takes no inputs and reports into the training status box.
    reset_button.click(
        fn=reset_model,
        outputs=[train_status],
    )


if __name__ == "__main__":
    demo.launch()
|
requirements.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio>=5.23.0
|
| 2 |
+
torch>=2.3.0
|