Fu01978 committed on
Commit
ff5d275
·
verified ·
1 Parent(s): fb06f7a

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ license: mit
4
+ tags:
5
+ - tiny
6
+ - language-model
7
+ - causal-lm
8
+ - from-scratch
9
+ - pytorch
10
+ ---
11
+
12
+ # TinyLM
13
+
14
+ A ~1M parameter causal language model trained from scratch, for fun and experimentation.
15
+
16
+ ## Architecture
17
+
18
+ | Hyperparameter | Value |
19
+ |---|---|
20
+ | Parameters | ~1M |
21
+ | Layers | 4 |
22
+ | Hidden size | 64 |
23
+ | Attention heads | 4 |
24
+ | FFN dim | 192 |
25
+ | Embedding rank | 32 |
26
+ | Context length | 256 |
27
+ | Tokenizer | GPT-2 (50,257 vocab) |
28
+
29
+ Uses a **factored (low-rank) embedding** to keep the vocab projection from eating the entire parameter budget, with weight tying on the output head.
30
+
31
+ ## Training
32
+
33
+ | | |
34
+ |---|---|
35
+ | Datasets | Skylion007/openwebtext (10k samples), roneneldan/TinyStories (10k samples) |
36
+ | Optimizer | AdamW (lr=3e-3, weight_decay=0.01) |
37
+ | Scheduler | Cosine annealing with warm restarts |
38
+ | Mixed precision | fp16 (torch.cuda.amp) |
39
+ | Hardware | Nvidia P100 (Kaggle) |
40
+
41
+ ## Usage
42
+ ```python
43
+ from huggingface_hub import snapshot_download
44
+ import importlib.util
45
+ import torch
46
+
47
+ # Download all files
48
+ snapshot_download(repo_id="Fu01978/TinyLM", local_dir="./tinylm")
49
+
50
+ # Load via included script
51
+ spec = importlib.util.spec_from_file_location("modeling_tinylm", "./tinylm/modeling_tinylm.py")
52
+ module = importlib.util.module_from_spec(spec)
53
+ spec.loader.exec_module(module)
54
+
55
+ model, tokenizer, config = module.load_tinylm("./tinylm")
56
+ model.eval()
57
+
58
+ # Generate
59
+ output = module.generate(model, tokenizer, "Once upon a time")
60
+ print(output)
61
+ ```
62
+
63
+ ## Example Outputs
64
+
65
+ **Prompt:** Once upon a time
66
+ **Output:** Once upon a time there was a little girl named Mrs. She decided to go and be a little girl in the park. One day she had to go on a bed. From then on a lot of bread. She said, "What are you doing?" ...
__pycache__/modeling_tinylm.cpython-312.pyc ADDED
Binary file (8.23 kB). View file
 
config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "TinyLM",
3
+ "vocab_size": 50257,
4
+ "embed_rank": 32,
5
+ "d_model": 64,
6
+ "n_heads": 4,
7
+ "ffn_dim": 192,
8
+ "n_layers": 4,
9
+ "max_seq_len": 256,
10
+ "dropout": 0.0
11
+ }
modeling_tinylm.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import torch
3
+ import torch.nn as nn
4
+ from transformers import GPT2Tokenizer
5
+
6
+
7
def load_tinylm(model_dir, device="cpu"):
    """Build a TinyLM model from ``model_dir`` and load its weights and tokenizer.

    Reads ``config.json`` for the hyperparameters, constructs the network,
    loads ``pytorch_model.bin`` into it and loads a GPT-2 tokenizer from the
    same directory.

    Args:
        model_dir: Directory containing ``config.json``, ``pytorch_model.bin``
            and the tokenizer files.
        device: Device the model is moved to and the checkpoint is mapped onto.

    Returns:
        A ``(model, tokenizer, config)`` tuple: the model in eval mode, the
        tokenizer with its pad token set to its EOS token, and the parsed
        config dict.
    """
    from pathlib import Path

    root = Path(model_dir)

    # Load config
    with open(root / "config.json", encoding="utf-8") as f:
        config = json.load(f)

    VOCAB_SIZE = config["vocab_size"]
    EMBED_RANK = config["embed_rank"]
    D_MODEL = config["d_model"]
    N_HEADS = config["n_heads"]
    FFN_DIM = config["ffn_dim"]
    N_LAYERS = config["n_layers"]
    MAX_SEQ_LEN = config["max_seq_len"]
    DROPOUT = config["dropout"]

    class FactoredEmbedding(nn.Module):
        """Token embedding factored as a rank-sized lookup plus a linear map."""

        def __init__(self, vocab_size, rank, d_model):
            super().__init__()
            self.in_proj = nn.Embedding(vocab_size, rank)
            self.out_proj = nn.Linear(rank, d_model, bias=False)

        def forward(self, x):
            """Look up rank-sized vectors for ``x`` and project them to d_model."""
            return self.out_proj(self.in_proj(x))

    class TransformerBlock(nn.Module):
        """Self-attention and feed-forward sub-layers, each with a residual add.

        LayerNorm is applied to the input of each sub-layer, not its output.
        """

        def __init__(self):
            super().__init__()
            self.ln1 = nn.LayerNorm(D_MODEL)
            self.attn = nn.MultiheadAttention(D_MODEL, N_HEADS, dropout=DROPOUT, batch_first=True)
            self.ln2 = nn.LayerNorm(D_MODEL)
            self.ffn = nn.Sequential(
                nn.Linear(D_MODEL, FFN_DIM),
                nn.GELU(),
                nn.Linear(FFN_DIM, D_MODEL),
                nn.Dropout(DROPOUT),
            )

        def forward(self, x, attn_mask=None, key_padding_mask=None):
            """Apply attention then the feed-forward network to ``x``."""
            x_norm = self.ln1(x)
            attn_out, _ = self.attn(x_norm, x_norm, x_norm,
                                    attn_mask=attn_mask,
                                    key_padding_mask=key_padding_mask,
                                    is_causal=True)
            x = x + attn_out
            x = x + self.ffn(self.ln2(x))
            return x

    class TinyLM(nn.Module):
        """Decoder-only language model with a factored embedding and output head."""

        def __init__(self):
            super().__init__()
            self.tok_emb = FactoredEmbedding(VOCAB_SIZE, EMBED_RANK, D_MODEL)
            self.pos_emb = nn.Embedding(MAX_SEQ_LEN, D_MODEL)
            self.drop = nn.Dropout(DROPOUT)
            self.blocks = nn.ModuleList([TransformerBlock() for _ in range(N_LAYERS)])
            self.ln_final = nn.LayerNorm(D_MODEL)
            self.head_down = nn.Linear(D_MODEL, EMBED_RANK, bias=False)
            self.head_vocab = nn.Linear(EMBED_RANK, VOCAB_SIZE, bias=False)
            # NOTE(review): this wraps the embedding weight in a *new*
            # Parameter object instead of reusing the existing one, so the
            # two weights may stop sharing storage once the model is moved
            # to another device. Left unchanged so the published checkpoint
            # loads exactly as before - confirm against the training script.
            self.head_vocab.weight = nn.Parameter(self.tok_emb.in_proj.weight)

        def forward(self, idx):
            """Return next-token logits for a 2-D batch of token ids.

            Inputs longer than ``MAX_SEQ_LEN`` are truncated to their first
            ``MAX_SEQ_LEN`` positions before being embedded.
            """
            _, T = idx.shape
            if T > MAX_SEQ_LEN:
                idx = idx[:, :MAX_SEQ_LEN]
                T = idx.shape[1]
            positions = torch.arange(T, device=idx.device).unsqueeze(0)
            x = self.drop(self.tok_emb(idx) + self.pos_emb(positions))
            mask = nn.Transformer.generate_square_subsequent_mask(T, device=idx.device)
            for block in self.blocks:
                x = block(x, attn_mask=mask)
            x = self.ln_final(x)
            x = self.head_down(x)
            return self.head_vocab(x)

    # Build and load weights
    model = TinyLM().to(device)
    # weights_only=True restricts unpickling to tensors and plain containers,
    # so a downloaded checkpoint cannot run arbitrary code while loading.
    state_dict = torch.load(root / "pytorch_model.bin", map_location=device, weights_only=True)
    model.load_state_dict(state_dict)
    model.eval()

    # Load tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer, config
90
+
91
+
92
def generate(model, tokenizer, prompt, max_new_tokens=100, temperature=0.8, top_k=40, device="cpu"):
    """Extend ``prompt`` with up to ``max_new_tokens`` tokens from ``model``.

    Args:
        model: Callable module mapping a batch of token ids to logits; its
            ``pos_emb.num_embeddings`` gives the context window size.
        tokenizer: Callable returning an object with ``input_ids`` for the
            prompt; must also provide ``eos_token_id`` and ``decode``.
        prompt: Text to continue. Must tokenize to at least one token.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Divisor applied to the logits before sampling. ``0``
            selects the highest-scoring token at every step (greedy).
        top_k: Keep only the ``top_k`` highest logits before sampling.
            ``None`` or a non-positive value disables the filter; values
            larger than the vocabulary are clamped to the vocabulary size.
        device: Device the prompt ids are moved to before generation.

    Returns:
        The decoded prompt plus generated tokens, with special tokens skipped.

    Raises:
        ValueError: If ``temperature`` is negative or the prompt yields no
            tokens.
    """
    if temperature < 0:
        raise ValueError("temperature must be non-negative")

    max_seq_len = model.pos_emb.num_embeddings
    model.eval()
    ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    if ids.shape[1] == 0:
        # The model needs at least one position to predict from.
        raise ValueError("prompt must contain at least one token")

    with torch.no_grad():
        for _ in range(max_new_tokens):
            # Feed only the most recent tokens that fit the context window.
            logits = model(ids[:, -max_seq_len:])[:, -1, :]
            if temperature == 0:
                # Dividing by zero would produce inf/nan logits; pick the
                # arg-max directly instead.
                next_id = torch.argmax(logits, dim=-1, keepdim=True)
            else:
                logits = logits / temperature
                if top_k is not None and top_k > 0:
                    # torch.topk raises if k exceeds the last dimension.
                    k = min(top_k, logits.size(-1))
                    kth_values, _indices = torch.topk(logits, k)
                    logits = logits.masked_fill(logits < kth_values[:, -1:], -float("inf"))
                probs = torch.softmax(logits, dim=-1)
                next_id = torch.multinomial(probs, num_samples=1)
            # Stop before appending the end-of-sequence token.
            if next_id.item() == tokenizer.eos_token_id:
                break
            ids = torch.cat([ids, next_id], dim=1)

    return tokenizer.decode(ids[0], skip_special_tokens=True)
112
+
113
+
114
if __name__ == "__main__":
    # Smoke test: load the checkpoint from ./tinylm and print one sample.
    model, tokenizer, config = load_tinylm("./tinylm")
    print("Model loaded!")
    sample = generate(model, tokenizer, "Once upon a time")
    print(sample)
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c11755f9b669393914fd960a75a855cd4bb5fa80c39853878b6de735fc975794
3
+ size 13635564
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<|endoftext|>",
5
+ "eos_token": "<|endoftext|>",
6
+ "errors": "replace",
7
+ "is_local": false,
8
+ "model_max_length": 1024,
9
+ "pad_token": "<|endoftext|>",
10
+ "tokenizer_class": "GPT2Tokenizer",
11
+ "unk_token": "<|endoftext|>"
12
+ }