---
language: en
license: mit
tags:
- tiny-epstein
- epstein-files
- transformers
---
# tiny-epstein-100m
> **Status:** actively being trained — work in progress.
>
> Thank you for checking out this model! Please leave a like and share — I'm trying to get a job as an AI engineer. If you are hiring AI engineers, please reach out; I don't have a formal education, but I'm very knowledgeable in this area.
A small transformer model (~100M parameters) trained on the [teyler/epstein-files-20k](https://huggingface.co/datasets/teyler/epstein-files-20k) dataset.
The architecture is inspired by **Tiny Aya** modifications and is designed for efficient on-device inference.
## Model Details
- **Architecture**: Decoder-only transformer with parallel blocks, Grouped Query Attention (GQA), SwiGLU activation, and bias‑free LayerNorm.
- **Sliding Window Attention**: 3:1 local:global ratio (first 75% of layers use sliding window with RoPE; remaining layers use full attention with NoPE).
- **Parameters**: ~100 million
- **Context Length**: 1024 tokens (configurable)
- **Tokenizer**: GPT‑2 (same as used during training)
- **Training Data**: [teyler/epstein-files-20k](https://huggingface.co/datasets/teyler/epstein-files-20k) – 20,000 documents related to the Epstein files.
## Intended Use
This model is primarily for research and experimentation. It can generate continuations of text given a prompt, especially on topics related to the Epstein files.
## How to Use
### Installation
Make sure you have `torch` and `transformers` installed.
If you want to run inference, install the required packages:
```bash
pip install torch transformers
```
### Loading the Model and Tokenizer
```python
# Standard library
import math
import os

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint  # explicit import for torch.utils.checkpoint.checkpoint
from torch.utils.data import DataLoader

# Third-party
from datasets import load_dataset, concatenate_datasets, Dataset
from huggingface_hub import hf_hub_download, snapshot_download
from tqdm import tqdm
from transformers import AutoTokenizer, GPT2TokenizerFast
# Download the checkpoint files from the Hugging Face Hub (cached locally);
# returns the local directory containing pytorch_model.bin.
model_path = snapshot_download(repo_id="liminerity/tiny-epstein-100m")
# The model was trained with the stock GPT-2 tokenizer. GPT-2 defines no pad
# token, so the EOS token is reused for padding.
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
# ------------------------------------------------------------------------------
# Configuration
# NOTE(review): the original training banner said "scaled to ~150M for L4 GPU",
# but this card describes a ~100M checkpoint — confirm which count is accurate.
# ------------------------------------------------------------------------------
class ModelConfig:
    """Hyperparameters used to rebuild the checkpoint's architecture.

    These values must match the ones used at training time, otherwise
    load_state_dict() below will fail with shape mismatches.
    """
    vocab_size = 50257            # GPT-2 vocabulary size ("will be updated from tokenizer" at train time)
    emb_dim = 768                 # embedding / model dimension
    hidden_dim = 2048             # FFN intermediate size - reduced
    num_layers = 12               # number of transformer layers - reduced
    num_heads = 12                # number of query heads - reduced
    num_kv_heads = 4              # number of key/value heads (GQA)
    max_seq_len = 1024            # maximum sequence length (kept short to save memory)
    window_size = 1024            # sliding window size; equals max_seq_len, so the window is effectively full here
    sliding_window_ratio = 0.75   # fraction of layers that use sliding-window attention (+RoPE)
    rope_theta = 10000.0          # base frequency for RoPE
    dtype = torch.float16         # mixed-precision dtype used during training
    bias = False                  # no bias in linear layers
    dropout = 0.0                 # no dropout used
    gradient_checkpointing = True # trade compute for memory during training
# ------------------------------------------------------------------------------
# Helper modules (unchanged)
# ------------------------------------------------------------------------------
class CohereLayerNorm(nn.Module):
    """Scale-only LayerNorm (no bias term), computed in float32 for stability.

    The input is normalized to zero mean / unit variance over the last
    dimension, scaled by a learned weight, and cast back to the input dtype.
    """

    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(emb_dim))

    def forward(self, x):
        orig_dtype = x.dtype
        # Do the statistics in float32 even if x is half precision.
        h = x.to(torch.float32)
        centered = h - h.mean(dim=-1, keepdim=True)
        inv_std = torch.rsqrt(centered.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        normed = centered * inv_std
        return (self.weight.to(torch.float32) * normed).to(orig_dtype)
class FeedForward(nn.Module):
    """SwiGLU feed-forward network: fc3(silu(fc1(x)) * fc2(x)).

    Attribute names fc1/fc2/fc3 are kept as-is — they are part of the
    checkpoint's state-dict keys.
    """

    def __init__(self, config):
        super().__init__()
        self.fc1 = nn.Linear(config.emb_dim, config.hidden_dim, bias=config.bias)
        self.fc2 = nn.Linear(config.emb_dim, config.hidden_dim, bias=config.bias)
        self.fc3 = nn.Linear(config.hidden_dim, config.emb_dim, bias=config.bias)

    def forward(self, x):
        gate = F.silu(self.fc1(x))
        return self.fc3(gate * self.fc2(x))
def precompute_rope_freqs(dim, max_seq_len, theta=10000.0, dtype=torch.float32):
    """Build rotary-embedding tables.

    Returns (sin, cos), each of shape (max_seq_len, dim); the per-pair
    frequencies are duplicated so the tables cover the full head dimension.
    """
    assert dim % 2 == 0, "dim must be even"
    exponents = torch.arange(0, dim, 2, dtype=dtype)[: dim // 2] / dim
    inv_freq = 1.0 / (theta ** exponents)                 # (dim//2,)
    positions = torch.arange(max_seq_len, dtype=dtype)    # (max_seq_len,)
    angles = positions[:, None] * inv_freq[None, :]       # (max_seq_len, dim//2)
    table = torch.cat([angles, angles], dim=-1)           # (max_seq_len, dim)
    return table.sin(), table.cos()
def rotate_half(x):
    """Map the last dim (a, b) halves to (-b, a)."""
    half = x.shape[-1] // 2
    return torch.cat((-x[..., half:], x[..., :half]), dim=-1)


def apply_rotary_emb(x, cos, sin):
    """Apply rotary position embeddings.

    x:        (batch, seq_len, num_heads, head_dim)
    cos, sin: (seq_len, head_dim), broadcast over batch and heads.
    """
    cos = cos[None, :, None, :]  # (1, seq_len, 1, head_dim)
    sin = sin[None, :, None, :]  # (1, seq_len, 1, head_dim)
    return x * cos + rotate_half(x) * sin
class GroupedQueryAttention(nn.Module):
    """Multi-head attention with grouped KV heads (GQA) and an optional
    sliding-window causal mask.

    Layers with layer_id < num_layers * sliding_window_ratio use a sliding
    window plus RoPE; the remaining layers use full causal attention with no
    positional encoding (NoPE).
    """

    def __init__(self, config, layer_id):
        super().__init__()
        self.num_heads = config.num_heads
        self.num_kv_heads = config.num_kv_heads
        self.head_dim = config.emb_dim // config.num_heads
        assert self.num_heads % self.num_kv_heads == 0
        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
        self.wq = nn.Linear(config.emb_dim, config.num_heads * self.head_dim, bias=config.bias)
        self.wk = nn.Linear(config.emb_dim, config.num_kv_heads * self.head_dim, bias=config.bias)
        self.wv = nn.Linear(config.emb_dim, config.num_kv_heads * self.head_dim, bias=config.bias)
        self.wo = nn.Linear(config.num_heads * self.head_dim, config.emb_dim, bias=config.bias)
        total_layers = config.num_layers
        num_sliding = int(total_layers * config.sliding_window_ratio)
        self.use_sliding = (layer_id < num_sliding)
        self.window_size = config.window_size
        self.max_seq_len = config.max_seq_len
        self.rope_theta = config.rope_theta
        # RoPE tables are built lazily on first forward (sliding layers only).
        self.rope_sin, self.rope_cos = None, None

    def init_rope(self, max_seq_len, device):
        """(Re)build RoPE sin/cos tables if the cached ones are too short."""
        if self.rope_sin is not None and self.rope_sin.shape[0] >= max_seq_len:
            return
        sin, cos = precompute_rope_freqs(
            self.head_dim, max_seq_len, theta=self.rope_theta, dtype=torch.float32
        )
        self.rope_sin = sin.to(device)
        self.rope_cos = cos.to(device)

    def _build_mask(self, seq_len, device):
        """Additive attention mask: 0.0 where allowed, -inf where forbidden.

        Vectorized replacement for the original O(seq_len^2) Python loop
        that filled the sliding-window rows one at a time.
        """
        # Causal part: forbid attending to future positions (j > i).
        mask = torch.triu(
            torch.full((seq_len, seq_len), float('-inf'), device=device),
            diagonal=1,
        )
        if self.use_sliding:
            # Sliding-window part: forbid positions more than window_size - 1
            # steps back, i.e. j <= i - window_size (same rows the original
            # loop masked via mask[i, :max(0, i - window_size + 1)]).
            idx = torch.arange(seq_len, device=device)
            too_far = (idx[:, None] - idx[None, :]) >= self.window_size
            mask = mask.masked_fill(too_far, float('-inf'))
        return mask

    def forward(self, x, mask=None):
        """Attend over x (batch, seq_len, emb_dim); returns the same shape.

        If mask is given it is used as-is (added to the scores); otherwise a
        causal (and, for sliding layers, windowed) mask is built internally.
        """
        batch, seq_len, _ = x.shape
        device = x.device
        if self.use_sliding:
            self.init_rope(seq_len, device)
        xq = self.wq(x).view(batch, seq_len, self.num_heads, self.head_dim)
        xk = self.wk(x).view(batch, seq_len, self.num_kv_heads, self.head_dim)
        xv = self.wv(x).view(batch, seq_len, self.num_kv_heads, self.head_dim)
        if self.use_sliding:
            # RoPE only on sliding-window layers; full-attention layers are NoPE.
            xq = apply_rotary_emb(xq, self.rope_cos[:seq_len], self.rope_sin[:seq_len])
            xk = apply_rotary_emb(xk, self.rope_cos[:seq_len], self.rope_sin[:seq_len])
        # Expand KV heads so each group of query heads shares one KV head.
        xk = xk.repeat_interleave(self.num_queries_per_kv, dim=2)
        xv = xv.repeat_interleave(self.num_queries_per_kv, dim=2)
        xq = xq.transpose(1, 2)
        xk = xk.transpose(1, 2)
        xv = xv.transpose(1, 2)
        scores = torch.matmul(xq, xk.transpose(-2, -1)) / math.sqrt(self.head_dim)
        if mask is None:
            mask = self._build_mask(seq_len, device)
        scores = scores + mask
        # Softmax in float32 for numerical stability, then cast back.
        probs = F.softmax(scores, dim=-1, dtype=torch.float32).to(xq.dtype)
        out = torch.matmul(probs, xv)
        out = out.transpose(1, 2).contiguous().view(batch, seq_len, -1)
        return self.wo(out)
class ParallelTransformerBlock(nn.Module):
    """Decoder block where attention and MLP run in parallel.

    Both branches read the same normalized input, and their outputs are
    summed with the residual: x + attn(norm(x)) + mlp(norm(x)).
    Attribute names norm/attn/mlp match the checkpoint's state-dict keys.
    """

    def __init__(self, config, layer_id):
        super().__init__()
        self.norm = CohereLayerNorm(config.emb_dim)
        self.attn = GroupedQueryAttention(config, layer_id)
        self.mlp = FeedForward(config)

    def forward(self, x, mask=None):
        normed = self.norm(x)
        return x + self.attn(normed, mask=mask) + self.mlp(normed)
class TinyAya(nn.Module):
    """Decoder-only language model with parallel blocks and tied embeddings.

    (Docstring said "150M" in the training script; this card ships the ~100M
    checkpoint.) The lm_head weight is tied to the token embedding, so the
    checkpoint stores a single shared matrix for both.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.token_embedding = nn.Embedding(config.vocab_size, config.emb_dim)
        self.layers = nn.ModuleList([
            ParallelTransformerBlock(config, i) for i in range(config.num_layers)
        ])
        self.norm = CohereLayerNorm(config.emb_dim)
        self.lm_head = nn.Linear(config.emb_dim, config.vocab_size, bias=False)
        # Weight tying: output projection shares the embedding matrix.
        self.lm_head.weight = self.token_embedding.weight
        if config.gradient_checkpointing:
            self.gradient_checkpointing_enable()

    def gradient_checkpointing_enable(self):
        # Flag checked in forward(); only takes effect while self.training.
        self._gradient_checkpointing = True

    def forward(self, input_ids, mask=None):
        """Return logits of shape (batch, seq_len, vocab_size)."""
        x = self.token_embedding(input_ids)
        for layer in self.layers:
            if self.training and getattr(self, '_gradient_checkpointing', False):
                # Fix: pass use_reentrant explicitly — PyTorch deprecated the
                # implicit default; the non-reentrant variant is the
                # recommended implementation.
                x = torch.utils.checkpoint.checkpoint(
                    layer, x, mask, use_reentrant=False
                )
            else:
                x = layer(x, mask=mask)
        x = self.norm(x)
        logits = self.lm_head(x)
        return logits

    @torch.no_grad()
    def generate(self, input_ids, max_new_tokens=50, temperature=1.0):
        """Autoregressively sample max_new_tokens tokens (multinomial sampling).

        NOTE: calls self.eval(), so the module is left in eval mode afterwards.
        """
        self.eval()
        for _ in range(max_new_tokens):
            # Crop the context to the model's maximum sequence length.
            logits = self(input_ids[:, -self.config.max_seq_len:])
            next_token_logits = logits[:, -1, :] / temperature
            probs = F.softmax(next_token_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            input_ids = torch.cat([input_ids, next_token], dim=-1)
        return input_ids
# Rebuild the architecture and load the trained weights.
model = TinyAya(ModelConfig())
# weights_only=True: the checkpoint is a plain tensor state dict downloaded
# from the Hub; this avoids executing arbitrary pickled code.
state_dict = torch.load(
    os.path.join(model_path, "pytorch_model.bin"),
    map_location="cpu",
    weights_only=True,
)
model.load_state_dict(state_dict)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bug fix: the generation example below moves input_ids to `device`, so the
# model must be moved there too or inference fails on CUDA machines.
model.to(device)
```
### Text Generation Example
```python
prompt = """Was Jeffrey a good guy?"""
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
with torch.no_grad():
output = model.generate(input_ids, max_new_tokens=50, temperature=0.8)
print("Generated text:")
print(tokenizer.decode(output[0]))
```
## Training Details
The model was trained for one epoch on the full dataset using an L4 GPU in Google Colab.
Optimizer: AdamW (lr=1e-4) with gradient clipping (max norm = 1.0). Mixed precision (float16) was used.
## Limitations
- The model is small and was trained on a limited dataset; it may produce repetitive or nonsensical outputs.
- It has not undergone any safety fine-tuning; use with caution.
## License
MIT
|