abersbail committed on
Commit
f907cd1
·
verified ·
1 Parent(s): 69c87c1

Add improved aber small model Space

Browse files
README.md CHANGED
@@ -1,12 +1,27 @@
1
  ---
2
- title: Aber Small Model Python
3
- emoji: 🐢
4
- colorFrom: pink
5
- colorTo: pink
6
  sdk: gradio
7
- sdk_version: 6.10.0
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: aber Small Model
3
+ colorFrom: green
4
+ colorTo: blue
 
5
  sdk: gradio
 
6
  app_file: app.py
7
  pinned: false
8
+ license: mit
9
  ---
10
 
11
+ # aber Small Model
12
+
13
+ This is an improved local small language model project written in Python from scratch.
14
+
15
+ ## What is different
16
+
17
+ - Model name is `aber`
18
+ - Uses a different architecture than the previous tiny character model
19
+ - Uses a word-level tokenizer
20
+ - Uses a GRU language model for more readable short outputs
21
+ - Trains and runs locally on CPU
22
+
23
+ ## Important
24
+
25
+ - No external pretrained LLM is used
26
+ - This is still a small educational model
27
+ - It is designed for lightweight Hugging Face CPU Spaces and local Python use
aber_llm/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .config import AberConfig
2
+ from .service import AberLLMService
3
+
4
+ __all__ = ["AberConfig", "AberLLMService"]
aber_llm/config.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from pathlib import Path
3
+
4
+
5
@dataclass
class AberConfig:
    """Hyperparameters and filesystem locations for the aber model.

    Every field has a default, so ``AberConfig()`` is a complete configuration.
    """

    # Training window and batch shape.
    seq_len: int = 40
    batch_size: int = 24
    # Network dimensions.
    embed_dim: int = 96
    hidden_dim: int = 160
    num_layers: int = 2
    dropout: float = 0.15
    # Optimisation settings.
    learning_rate: float = 2.5e-3
    bootstrap_steps: int = 90
    # Runtime settings.
    cpu_threads: int = 4
    seed: int = 42

    @property
    def root_dir(self) -> Path:
        """Return the directory one level above the one containing this module."""
        module_file = Path(__file__).resolve()
        return module_file.parents[1]

    @property
    def checkpoint_path(self) -> Path:
        """Return the checkpoint file location under ``<root_dir>/artifacts``."""
        return self.root_dir.joinpath("artifacts", "aber_checkpoint.pt")
aber_llm/data.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ BASE_CORPUS = """
2
+ System: You are aber, a small local language model.
3
+ User: hello
4
+ aber: Hello. I am aber, a tiny local model written in Python.
5
+
6
+ User: what are you
7
+ aber: I am a small language model trained from scratch on local text.
8
+
9
+ User: explain machine learning
10
+ aber: Machine learning builds patterns from data so a model can make predictions or decisions.
11
+
12
+ User: explain deep learning
13
+ aber: Deep learning uses neural networks with many layers to learn useful representations.
14
+
15
+ User: explain natural language processing
16
+ aber: Natural language processing helps computers read, generate, and understand human language.
17
+
18
+ User: what is regression
19
+ aber: Regression predicts a continuous value such as price, score, or demand.
20
+
21
+ User: what is classification
22
+ aber: Classification predicts a category or label for an example.
23
+
24
+ User: what is clustering
25
+ aber: Clustering groups similar data points without labeled targets.
26
+
27
+ User: what is preprocessing
28
+ aber: Preprocessing cleans, encodes, scales, and prepares data before modeling.
29
+
30
+ User: what is evaluation
31
+ aber: Evaluation measures how well a model performs on data it did not train on.
32
+
33
+ User: how do I learn coding
34
+ aber: Practice small projects, read errors carefully, and improve one step at a time.
35
+
36
+ User: give a short greeting
37
+ aber: Hello. I am ready to help with a short answer.
38
+
39
+ User: write a short poem
40
+ aber: Quiet light on morning glass, slow thoughts gather as clouds pass.
41
+
42
+ User: what is Python
43
+ aber: Python is a programming language known for clarity, flexibility, and strong libraries.
44
+
45
+ User: what is a feature
46
+ aber: A feature is an input variable used by a model.
47
+
48
+ User: what is a target
49
+ aber: A target is the value or label a model tries to predict.
50
+
51
+ User: what is overfitting
52
+ aber: Overfitting happens when a model memorizes training data and generalizes poorly.
53
+
54
+ User: what is underfitting
55
+ aber: Underfitting happens when a model is too simple to learn important patterns.
56
+
57
+ User: what is a transformer
58
+ aber: A transformer is a model architecture that uses attention between tokens.
59
+
60
+ User: what is a tokenizer
61
+ aber: A tokenizer converts text into smaller pieces that a model can process.
62
+
63
+ User: what is local ai
64
+ aber: Local AI runs on your own machine so you can control training, files, and execution.
65
+
66
+ User: how should I debug code
67
+ aber: Reproduce the issue, isolate the failing step, inspect values, and test one fix at a time.
68
+
69
+ User: summarize good study habits
70
+ aber: Good study habits use planning, active recall, spaced review, and regular breaks.
71
+
72
+ User: what is data science
73
+ aber: Data science combines programming, statistics, and domain knowledge to learn from data.
74
+
75
+ User: what is a neural network
76
+ aber: A neural network is a layered function that transforms input signals into predictions.
77
+
78
+ User: give motivation
79
+ aber: Small repeated effort beats waiting for perfect motivation.
80
+ """.strip()
81
+
82
+
83
def build_training_text(extra_text: str = "") -> str:
    """Return the base corpus, optionally followed by user-supplied text.

    ``extra_text`` is whitespace-normalised: it is split on any whitespace and
    re-joined with single spaces. When nothing remains (``None``, empty or
    whitespace-only input) the base corpus is returned unchanged; otherwise the
    normalised text is appended after a blank line.
    """
    words = (extra_text or "").split()
    if words:
        return BASE_CORPUS + "\n\n" + " ".join(words)
    return BASE_CORPUS
aber_llm/model.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+
4
+
5
class AberLanguageModel(nn.Module):
    """GRU language model: token embedding -> stacked GRU -> dropout -> linear head."""

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, dropout):
        """Build the layers.

        Args:
            vocab_size: Number of distinct token ids (embedding rows / output logits).
            embed_dim: Size of each token embedding.
            hidden_dim: GRU hidden state size.
            num_layers: Number of stacked GRU layers.
            dropout: Dropout probability used inside the GRU stack and before the head.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.gru = nn.GRU(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            # Inter-layer dropout is only requested when there is more than one layer.
            dropout=dropout if num_layers > 1 else 0.0,
            batch_first=True,
        )
        self.dropout = nn.Dropout(dropout)
        self.head = nn.Linear(hidden_dim, vocab_size)

    def forward(self, idx, hidden=None, targets=None):
        """Run the model over a batch of token ids.

        Args:
            idx: Integer tensor of token ids, batch first.
            hidden: Optional GRU hidden state to continue from.
            targets: Optional tensor of target ids with the same shape as ``idx``.

        Returns:
            ``(logits, hidden, loss)`` where ``loss`` is the cross-entropy over all
            positions, or ``None`` when ``targets`` is not given.
        """
        emb = self.embedding(idx)
        out, hidden = self.gru(emb, hidden)
        out = self.dropout(out)
        logits = self.head(out)

        loss = None
        if targets is not None:
            # Flatten batch and time so every position is one classification example.
            loss = nn.functional.cross_entropy(
                logits.reshape(-1, logits.size(-1)),
                targets.reshape(-1),
            )
        return logits, hidden, loss

    def generate(self, idx, max_new_tokens, eos_id, temperature=1.0, top_k=8):
        """Sample up to ``max_new_tokens`` continuation tokens for ``idx``.

        The whole prompt is fed through the GRU on the first step so that the
        hidden state reflects every prompt token; afterwards only the newly
        sampled token is fed, reusing the carried hidden state.

        Args:
            idx: Integer tensor of prompt token ids, shape ``(batch, time)``.
            max_new_tokens: Maximum number of tokens to append.
            eos_id: Token id that ends generation once every row samples it
                in the same step.
            temperature: Softmax temperature; clamped to at least ``1e-4``.
            top_k: If positive, sample only from the ``top_k`` highest logits.

        Returns:
            ``idx`` with the sampled tokens appended along dimension 1.
        """
        hidden = None
        step_input = idx  # first step consumes the full prompt
        for _ in range(max_new_tokens):
            logits, hidden, _ = self(step_input, hidden)
            next_logits = logits[:, -1, :] / max(temperature, 1e-4)

            if top_k is not None and top_k > 0:
                values, _ = torch.topk(next_logits, min(top_k, next_logits.size(-1)))
                # Mask everything below the k-th largest logit of each row.
                next_logits[next_logits < values[:, [-1]]] = float("-inf")

            probs = torch.softmax(next_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, next_token], dim=1)
            # ``.all()`` keeps batch size 1 behaviour and avoids ``.item()`` failing
            # on larger batches.
            if bool((next_token == eos_id).all()):
                break
            step_input = next_token
        return idx
aber_llm/service.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import shutil
3
+
4
+ import torch
5
+
6
+ from .config import AberConfig
7
+ from .model import AberLanguageModel
8
+ from .tokenizer import WordTokenizer
9
+ from .trainer import create_model_and_tokenizer, set_seed, train_model
10
+
11
+
12
class AberLLMService:
    """Facade that owns the aber model, its tokenizer and the on-disk checkpoint."""

    def __init__(self, config: AberConfig):
        """Store ``config`` and cap the torch CPU thread count (at least 1).

        The model and tokenizer are created lazily on first use.
        """
        self.config = config
        torch.set_num_threads(max(1, self.config.cpu_threads))
        self.model = None
        self.tokenizer = None

    def generate(self, prompt: str, max_new_tokens: int, temperature: float, top_k: int):
        """Generate a continuation for ``prompt``.

        An empty or ``None`` prompt falls back to a default greeting prompt.

        Returns:
            ``(text, status)``: the decoded prompt plus continuation, and a
            one-line description of the model.
        """
        clean_prompt = prompt or "User: hello\naber:"
        self._ensure_ready()
        encoded = self.tokenizer.encode(clean_prompt, add_bos=True)
        # Add a leading batch dimension of size 1.
        idx = torch.tensor(encoded, dtype=torch.long).unsqueeze(0)
        self.model.eval()

        with torch.inference_mode():
            output = self.model.generate(
                idx=idx,
                max_new_tokens=max_new_tokens,
                eos_id=self.tokenizer.eos_id,
                temperature=temperature,
                top_k=top_k,
            )

        text = self.tokenizer.decode(output[0].tolist())
        status = (
            f"Generated with aber. "
            f"Architecture=word-level GRU, Vocab={self.tokenizer.vocab_size}, Hidden={self.config.hidden_dim}."
        )
        return text, status

    def train(self, extra_text: str, steps: int):
        """Train for ``steps`` steps (at least 1) on the base corpus plus ``extra_text``.

        If a checkpoint exists and the vocabulary built from the new text is
        identical to the checkpoint's, training continues from the saved
        weights; otherwise it starts from a freshly initialised model.

        Returns:
            A multi-line, human-readable training summary.
        """
        steps = max(1, steps)
        checkpoint_exists = self.config.checkpoint_path.exists()
        training_text = extra_text or ""

        if checkpoint_exists:
            self._load_or_initialize(extra_text="")

        model, tokenizer, encoded = create_model_and_tokenizer(self.config, training_text)
        if checkpoint_exists and self.model is not None and self.tokenizer is not None:
            # Weights are only reusable when token ids mean the same thing.
            if tokenizer.stoi == self.tokenizer.stoi:
                model.load_state_dict(self.model.state_dict())

        losses = train_model(model, encoded, self.config, steps)
        self.model = model
        self.tokenizer = tokenizer
        self._save_checkpoint(extra_text=training_text)

        return (
            f"aber training finished.\n"
            f"Steps: {steps}\n"
            f"Start Loss: {losses[0]:.4f}\n"
            f"End Loss: {losses[-1]:.4f}\n"
            f"Checkpoint: {self.config.checkpoint_path}"
        )

    def reset(self):
        """Delete the checkpoint directory and drop the in-memory model and tokenizer."""
        checkpoint_dir = self.config.checkpoint_path.parent
        if checkpoint_dir.exists():
            shutil.rmtree(checkpoint_dir)
        self.model = None
        self.tokenizer = None
        return "aber reset complete. Next train or generate call will rebuild the model from scratch."

    def _ensure_ready(self):
        """Load or bootstrap the model unless one is already in memory."""
        if self.model is not None and self.tokenizer is not None:
            return
        self._load_or_initialize(extra_text="")

    def _load_or_initialize(self, extra_text: str):
        """Restore the model from the checkpoint, or bootstrap-train a new one.

        When a checkpoint file exists, the model is rebuilt from the dimensions
        stored in the checkpoint and put in eval mode. Otherwise a fresh model
        is seeded, trained for ``config.bootstrap_steps`` steps and saved.
        """
        checkpoint = self.config.checkpoint_path
        if checkpoint.exists():
            # The checkpoint holds only tensors and plain Python containers and
            # primitives, so the restricted unpickler is sufficient and avoids
            # executing arbitrary pickled code from the file.
            state = torch.load(checkpoint, map_location="cpu", weights_only=True)
            self.tokenizer = WordTokenizer.from_state_dict(state["tokenizer"])
            self.model = AberLanguageModel(
                vocab_size=state["config"]["vocab_size"],
                embed_dim=state["config"]["embed_dim"],
                hidden_dim=state["config"]["hidden_dim"],
                num_layers=state["config"]["num_layers"],
                dropout=state["config"]["dropout"],
            )
            self.model.load_state_dict(state["model"])
            self.model.eval()
            return

        set_seed(self.config.seed)
        self.model, self.tokenizer, encoded = create_model_and_tokenizer(self.config, extra_text)
        train_model(self.model, encoded, self.config, self.config.bootstrap_steps)
        self._save_checkpoint(extra_text=extra_text)

    def _save_checkpoint(self, extra_text: str):
        """Write model weights, tokenizer state and model dimensions to the checkpoint file."""
        checkpoint = self.config.checkpoint_path
        checkpoint.parent.mkdir(parents=True, exist_ok=True)
        torch.save(
            {
                "model": self.model.state_dict(),
                "tokenizer": self.tokenizer.state_dict(),
                "config": {
                    "vocab_size": self.tokenizer.vocab_size,
                    "embed_dim": self.config.embed_dim,
                    "hidden_dim": self.config.hidden_dim,
                    "num_layers": self.config.num_layers,
                    "dropout": self.config.dropout,
                    "seq_len": self.config.seq_len,
                    "extra_text": extra_text,
                },
            },
            checkpoint,
        )
aber_llm/tokenizer.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+
4
# A token is a newline, a run of word characters/apostrophes, or a single
# punctuation/symbol character. ``\w`` is Unicode-aware for ``str`` patterns,
# so accented and non-Latin letters stay inside words instead of being dropped.
TOKEN_PATTERN = re.compile(r"\n|[\w']+|[^\w\s]")


class WordTokenizer:
    """Word-level tokenizer with ``<pad>``, ``<unk>``, ``<bos>`` and ``<eos>`` specials."""

    def __init__(self):
        """Create an unfitted tokenizer with empty vocabulary maps."""
        self.special_tokens = ["<pad>", "<unk>", "<bos>", "<eos>"]
        self.stoi = {}
        self.itos = {}

    @property
    def pad_id(self):
        """Id of ``<pad>``; raises ``KeyError`` if the vocabulary lacks it."""
        return self.stoi["<pad>"]

    @property
    def bos_id(self):
        """Id of ``<bos>``; raises ``KeyError`` if the vocabulary lacks it."""
        return self.stoi["<bos>"]

    @property
    def eos_id(self):
        """Id of ``<eos>``; raises ``KeyError`` if the vocabulary lacks it."""
        return self.stoi["<eos>"]

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.stoi)

    def tokenize(self, text: str):
        """Split ``text`` into newline, word and punctuation tokens."""
        return TOKEN_PATTERN.findall(text)

    def fit(self, text: str):
        """Build the vocabulary from ``text`` and return ``self``.

        Special tokens take the first ids; remaining tokens follow in sorted order.
        """
        vocab = self.special_tokens + sorted(set(self.tokenize(text)))
        self.stoi = {token: idx for idx, token in enumerate(vocab)}
        self.itos = {idx: token for token, idx in self.stoi.items()}
        return self

    def encode(self, text: str, add_bos: bool = False, add_eos: bool = False):
        """Convert ``text`` to a list of ids, mapping unknown tokens to ``<unk>``.

        ``add_bos`` / ``add_eos`` wrap the result with the respective special ids.
        """
        unk_id = self.stoi["<unk>"]
        ids = [self.stoi.get(token, unk_id) for token in self.tokenize(text)]
        if add_bos:
            ids = [self.bos_id] + ids
        if add_eos:
            ids = ids + [self.eos_id]
        return ids

    def decode(self, ids):
        """Convert ids back to text, skipping special tokens.

        Words are separated by single spaces, the punctuation marks
        ``. , ! ? : ;`` attach to the preceding token, and newline tokens are
        emitted verbatim so consecutive newlines (blank lines) survive.
        """
        tokens = []
        for idx in ids:
            token = self.itos.get(int(idx), "<unk>")
            if token in self.special_tokens:
                continue
            tokens.append(token)

        text = ""
        for token in tokens:
            # Strip only the separator space this method appended; a plain
            # ``rstrip()`` would also swallow earlier newline tokens.
            if token == "\n":
                text = text.rstrip(" ") + "\n"
            elif token in {".", ",", "!", "?", ":", ";"}:
                text = text.rstrip(" ") + token + " "
            else:
                text += token + " "
        return text.strip()

    def state_dict(self):
        """Return the serialisable state: the token-to-id mapping."""
        return {"stoi": self.stoi}

    @classmethod
    def from_state_dict(cls, state):
        """Rebuild a tokenizer from the mapping produced by ``state_dict``."""
        tok = cls()
        tok.stoi = dict(state["stoi"])
        tok.itos = {idx: token for token, idx in tok.stoi.items()}
        return tok
aber_llm/trainer.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+
3
+ import torch
4
+
5
+ from .data import build_training_text
6
+ from .model import AberLanguageModel
7
+ from .tokenizer import WordTokenizer
8
+
9
+
10
def set_seed(seed: int):
    """Seed Python's ``random`` module and torch's global RNG with ``seed``."""
    for seeder in (random.seed, torch.manual_seed):
        seeder(seed)
13
+
14
+
15
def create_model_and_tokenizer(config, extra_text=""):
    """Build a tokenizer, the encoded corpus and an untrained model.

    The corpus is the training text produced from ``extra_text``. The tokenizer
    is fitted on that corpus, which is then encoded with BOS/EOS markers into
    a 1-D ``torch.long`` tensor. The model is sized from the fitted vocabulary
    and the dimensions in ``config``.

    Returns:
        ``(model, tokenizer, encoded)``.
    """
    corpus = build_training_text(extra_text)

    tokenizer = WordTokenizer()
    tokenizer.fit(corpus)

    token_ids = tokenizer.encode(corpus, add_bos=True, add_eos=True)
    encoded = torch.tensor(token_ids, dtype=torch.long)

    model_kwargs = {
        "vocab_size": tokenizer.vocab_size,
        "embed_dim": config.embed_dim,
        "hidden_dim": config.hidden_dim,
        "num_layers": config.num_layers,
        "dropout": config.dropout,
    }
    model = AberLanguageModel(**model_kwargs)
    return model, tokenizer, encoded
28
+
29
+
30
def build_batch(encoded, seq_len, batch_size):
    """Sample a random batch of next-token prediction windows.

    Args:
        encoded: 1-D tensor of token ids.
        seq_len: Desired window length; shortened to ``len(encoded) - 1`` when
            the corpus is too short to hold a full window plus its target.
        batch_size: Number of windows to sample (with replacement).

    Returns:
        ``(x, y)`` tensors of identical shape ``(batch_size, window)``, where
        ``y`` is ``x`` shifted one position to the right in ``encoded``.

    Raises:
        ValueError: If ``encoded`` has fewer than two tokens.
    """
    total = len(encoded)
    if total < 2:
        raise ValueError("encoded must contain at least 2 tokens to build an input/target pair")

    # A window needs one extra token after it to serve as the last target.
    window = min(seq_len, total - 1)
    # ``randint``'s upper bound is exclusive, so the largest start is
    # ``total - window - 1`` and the final token can appear as a target.
    num_starts = total - window
    starts = torch.randint(0, num_starts, (batch_size,)).tolist()

    x = torch.stack([encoded[start : start + window] for start in starts])
    y = torch.stack([encoded[start + 1 : start + window + 1] for start in starts])
    return x, y
36
+
37
+
38
def train_model(model, encoded, config, steps):
    """Run ``steps`` AdamW updates on random batches and return the loss history.

    The model is switched to training mode and left in that mode.

    Args:
        model: Callable as ``model(x, targets=y)`` returning a 3-tuple whose
            last item is the loss.
        encoded: Encoded corpus passed to ``build_batch``.
        config: Provides ``learning_rate``, ``seq_len`` and ``batch_size``.
        steps: Number of optimisation steps.

    Returns:
        A list with one Python float loss per step.
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)
    model.train()

    history = []
    for _step in range(steps):
        inputs, targets = build_batch(encoded, config.seq_len, config.batch_size)
        optimizer.zero_grad()
        loss = model(inputs, targets=targets)[2]
        loss.backward()
        optimizer.step()
        history.append(loss.item())
    return history
app.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ from aber_llm.config import AberConfig
4
+ from aber_llm.service import AberLLMService
5
+
6
+
7
+ config = AberConfig()
8
+ service = AberLLMService(config=config)
9
+
10
+
11
def generate_text(prompt, max_new_tokens, temperature, top_k):
    """UI callback: forward the generation request to the shared service.

    Numeric arguments are coerced to the types declared by
    ``AberLLMService.generate`` before the call.
    """
    token_budget = int(max_new_tokens)
    temp = float(temperature)
    k = int(top_k)
    return service.generate(
        prompt=prompt,
        max_new_tokens=token_budget,
        temperature=temp,
        top_k=k,
    )
18
+
19
+
20
def train_model(extra_text, steps):
    """UI callback: train the shared service and return its status text."""
    step_count = int(steps)
    return service.train(extra_text=extra_text, steps=step_count)
22
+
23
+
24
def reset_model():
    """UI callback: reset the shared service and return its confirmation message."""
    message = service.reset()
    return message
26
+
27
+
28
+ with gr.Blocks(
29
+ title="aber Small Model",
30
+ theme=gr.themes.Soft(primary_hue="green", secondary_hue="blue"),
31
+ ) as demo:
32
+ gr.Markdown(
33
+ """
34
+ # aber
35
+ An improved small language model written in Python from scratch.
36
+
37
+ - Model name: `aber`
38
+ - No external pretrained LLM
39
+ - Word-level tokenizer
40
+ - GRU language model
41
+ - Local CPU training and generation
42
+ """
43
+ )
44
+
45
+ with gr.Tab("Generate"):
46
+ prompt_input = gr.Textbox(
47
+ label="Prompt",
48
+ value="User: hello\naber:",
49
+ lines=6,
50
+ )
51
+ with gr.Row():
52
+ max_tokens_input = gr.Slider(10, 160, value=72, step=2, label="Max New Tokens")
53
+ temperature_input = gr.Slider(0.2, 1.3, value=0.75, step=0.05, label="Temperature")
54
+ top_k_input = gr.Slider(1, 20, value=8, step=1, label="Top-K")
55
+ generate_button = gr.Button("Generate", variant="primary")
56
+ output_text = gr.Textbox(label="Output", lines=10)
57
+ output_status = gr.Textbox(label="Status", lines=4)
58
+
59
+ with gr.Tab("Train"):
60
+ extra_text_input = gr.Textbox(
61
+ label="Extra Training Text",
62
+ placeholder="Add more local text to train aber on your own data.",
63
+ lines=10,
64
+ )
65
+ steps_input = gr.Slider(10, 400, value=120, step=10, label="Training Steps")
66
+ train_button = gr.Button("Train / Continue Training", variant="primary")
67
+ reset_button = gr.Button("Reset aber")
68
+ train_status = gr.Textbox(label="Training Status", lines=6)
69
+
70
+ generate_button.click(
71
+ fn=generate_text,
72
+ inputs=[prompt_input, max_tokens_input, temperature_input, top_k_input],
73
+ outputs=[output_text, output_status],
74
+ )
75
+
76
+ train_button.click(
77
+ fn=train_model,
78
+ inputs=[extra_text_input, steps_input],
79
+ outputs=[train_status],
80
+ )
81
+
82
+ reset_button.click(
83
+ fn=reset_model,
84
+ outputs=[train_status],
85
+ )
86
+
87
+
88
+ if __name__ == "__main__":
89
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ gradio>=5.23.0
2
+ torch>=2.3.0