---
language: en
license: mit
tags:
- tiny-epstein
- epstein-files
- transformers
---
| | |
# tiny-epstein-100m

> **Status:** actively being trained — work in progress.
>
> Thank you for checking out this model! Please leave a like and share — I'm trying to get a job as an AI engineer. If you are hiring AI engineers, please reach out; I don't have a formal education but I am self-taught.

A small transformer model (~100M parameters) trained on the [teyler/epstein-files-20k](https://huggingface.co/datasets/teyler/epstein-files-20k) dataset.
The architecture is inspired by **Tiny Aya** modifications and is designed for efficient on-device inference.
| |
|
## Model Details

- **Architecture**: Decoder-only transformer with parallel blocks, Grouped Query Attention (GQA), SwiGLU activation, and bias-free LayerNorm.
- **Sliding Window Attention**: 3:1 local:global ratio (first 75% of layers use sliding-window attention with RoPE; the remaining layers use full attention with NoPE).
- **Parameters**: ~100 million
- **Context Length**: 1024 tokens (configurable)
- **Tokenizer**: GPT-2 (same as used during training)
- **Training Data**: [teyler/epstein-files-20k](https://huggingface.co/datasets/teyler/epstein-files-20k) – 20,000 documents related to the Epstein files.
| |
|
## Intended Use

This model is primarily for research and experimentation. It can generate continuations of text given a prompt, especially on topics related to the Epstein files.
| |
|
## How to Use

### Installation

Make sure you have `torch` and `transformers` installed.
If you want to run inference, install the required packages:
| |
|
```bash
pip install torch transformers
```
| |
|
### Loading the Model and Tokenizer
| |
|
```python
# Standard library
import math
import os

# Third-party (installed via `pip install torch transformers`; huggingface_hub
# ships as a dependency of transformers)
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint  # used by TinyAya's gradient checkpointing
from huggingface_hub import snapshot_download
from transformers import GPT2TokenizerFast
| | |
# Download the model snapshot (weights + config) from the Hugging Face Hub.
model_path = snapshot_download(repo_id="liminerity/tiny-epstein-100m")

# The model was trained with the stock GPT-2 tokenizer; GPT-2 has no dedicated
# pad token, so reuse EOS for padding.
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
# ------------------------------------------------------------------------------
# Configuration (~100M parameters, sized to train on a single L4 GPU)
# ------------------------------------------------------------------------------
class ModelConfig:
    """Hyperparameters for the tiny-epstein model (~100M parameters)."""
    vocab_size = 50257            # GPT-2 vocab size; will be updated from tokenizer
    emb_dim = 768                 # embedding dimension
    hidden_dim = 2048             # intermediate size (FFN) - reduced
    num_layers = 12               # number of transformer layers - reduced
    num_heads = 12                # number of query heads - reduced
    num_kv_heads = 4              # number of key/value heads (GQA)
    max_seq_len = 1024            # shorter sequence length to save memory
    window_size = 1024            # sliding window size (match max_seq_len)
    sliding_window_ratio = 0.75   # fraction of layers with sliding window
    rope_theta = 10000.0          # base for RoPE
    dtype = torch.float16         # use mixed precision
    bias = False                  # no bias in linear layers
    dropout = 0.0                 # no dropout mentioned
    gradient_checkpointing = True # enable to save memory
| | |
# ------------------------------------------------------------------------------
# Helper modules
# ------------------------------------------------------------------------------
class CohereLayerNorm(nn.Module):
    """Bias-free LayerNorm: normalize in float32, apply a learned scale only."""

    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(emb_dim))

    def forward(self, x):
        orig_dtype = x.dtype
        # Compute statistics in float32 for numerical stability.
        h = x.to(torch.float32)
        centered = h - h.mean(dim=-1, keepdim=True)
        var = centered.pow(2).mean(dim=-1, keepdim=True)
        normed = centered * torch.rsqrt(var + self.eps)
        return (self.weight.to(torch.float32) * normed).to(orig_dtype)
| | |
| | |
class FeedForward(nn.Module):
    """SwiGLU feed-forward network: silu(W1 x) * (W2 x), projected back by W3."""

    def __init__(self, config):
        super().__init__()
        # Attribute names (fc1/fc2/fc3) are part of the state_dict layout.
        self.fc1 = nn.Linear(config.emb_dim, config.hidden_dim, bias=config.bias)
        self.fc2 = nn.Linear(config.emb_dim, config.hidden_dim, bias=config.bias)
        self.fc3 = nn.Linear(config.hidden_dim, config.emb_dim, bias=config.bias)

    def forward(self, x):
        gate = F.silu(self.fc1(x))
        value = self.fc2(x)
        return self.fc3(gate * value)
| | |
| | |
def precompute_rope_freqs(dim, max_seq_len, theta=10000.0, dtype=torch.float32):
    """Return (sin, cos) RoPE tables, each of shape (max_seq_len, dim)."""
    assert dim % 2 == 0, "dim must be even"
    half = dim // 2
    # Inverse frequencies for each pair of dimensions.
    inv_freq = theta ** (-torch.arange(0, dim, 2, dtype=dtype)[:half] / dim)
    positions = torch.arange(max_seq_len, dtype=dtype)
    angles = positions[:, None] * inv_freq[None, :]  # (max_seq_len, dim//2)
    table = torch.cat((angles, angles), dim=-1)      # (max_seq_len, dim)
    return table.sin(), table.cos()
| | |
| | |
def rotate_half(x):
    """Map the (x1, x2) halves of the last dimension to (-x2, x1)."""
    half = x.shape[-1] // 2
    first, second = x[..., :half], x[..., half:]
    return torch.cat((-second, first), dim=-1)
| | |
| | |
def apply_rotary_emb(x, cos, sin):
    """
    Rotate query/key tensors by their per-position RoPE angle.

    x: (batch, seq_len, num_heads, head_dim)
    cos, sin: (seq_len, head_dim)
    """
    # Insert singleton batch and head axes so the tables broadcast over x.
    cos = cos[None, :, None, :]
    sin = sin[None, :, None, :]
    return x * cos + rotate_half(x) * sin
| | |
| | |
class GroupedQueryAttention(nn.Module):
    """Multi-head attention with GQA and optional sliding-window mask.

    Layers with ``layer_id < num_layers * sliding_window_ratio`` use a
    causal sliding window with RoPE; the remaining layers use full causal
    attention with no positional encoding (NoPE).
    """

    def __init__(self, config, layer_id):
        super().__init__()
        self.num_heads = config.num_heads
        self.num_kv_heads = config.num_kv_heads
        self.head_dim = config.emb_dim // config.num_heads
        assert self.num_heads % self.num_kv_heads == 0
        self.num_queries_per_kv = self.num_heads // self.num_kv_heads

        self.wq = nn.Linear(config.emb_dim, config.num_heads * self.head_dim, bias=config.bias)
        self.wk = nn.Linear(config.emb_dim, config.num_kv_heads * self.head_dim, bias=config.bias)
        self.wv = nn.Linear(config.emb_dim, config.num_kv_heads * self.head_dim, bias=config.bias)
        self.wo = nn.Linear(config.num_heads * self.head_dim, config.emb_dim, bias=config.bias)

        # The first `num_sliding` layers use the local sliding-window mask.
        num_sliding = int(config.num_layers * config.sliding_window_ratio)
        self.use_sliding = (layer_id < num_sliding)

        self.window_size = config.window_size
        self.max_seq_len = config.max_seq_len
        self.rope_theta = config.rope_theta
        self.rope_sin, self.rope_cos = None, None

    def init_rope(self, max_seq_len, device):
        """Lazily (re)build the RoPE tables when a longer sequence arrives."""
        if self.rope_sin is not None and self.rope_sin.shape[0] >= max_seq_len:
            return
        sin, cos = precompute_rope_freqs(
            self.head_dim, max_seq_len, theta=self.rope_theta, dtype=torch.float32
        )
        self.rope_sin = sin.to(device)
        self.rope_cos = cos.to(device)

    def _build_mask(self, seq_len, device):
        """Additive attention mask: 0 where allowed, -inf where blocked.

        Vectorized replacement for the original per-row Python loop, which
        ran O(seq_len) interpreter iterations on every forward pass. Produces
        the identical mask: causal, plus (for sliding layers) positions
        j <= i - window_size blocked.
        """
        pos = torch.arange(seq_len, device=device)
        # j > i  -> future token (causal masking).
        blocked = pos[None, :] > pos[:, None]
        if self.use_sliding:
            # j <= i - window_size -> outside the local window.
            blocked = blocked | (pos[None, :] <= (pos[:, None] - self.window_size))
        mask = torch.zeros(seq_len, seq_len, device=device)
        return mask.masked_fill(blocked, float('-inf'))

    def forward(self, x, mask=None):
        """Attend over x (batch, seq_len, emb_dim); returns the same shape.

        If `mask` is given it is added to the scores as-is (the sliding
        window is NOT re-applied); otherwise the appropriate causal /
        sliding mask is built internally — matching the original behavior.
        """
        batch, seq_len, _ = x.shape
        device = x.device

        if self.use_sliding:
            self.init_rope(seq_len, device)

        xq = self.wq(x).view(batch, seq_len, self.num_heads, self.head_dim)
        xk = self.wk(x).view(batch, seq_len, self.num_kv_heads, self.head_dim)
        xv = self.wv(x).view(batch, seq_len, self.num_kv_heads, self.head_dim)

        # RoPE only on sliding-window layers; full-attention layers are NoPE.
        if self.use_sliding:
            xq = apply_rotary_emb(xq, self.rope_cos[:seq_len], self.rope_sin[:seq_len])
            xk = apply_rotary_emb(xk, self.rope_cos[:seq_len], self.rope_sin[:seq_len])

        # Expand KV heads so every query head has a matching KV head (GQA).
        xk = xk.repeat_interleave(self.num_queries_per_kv, dim=2)
        xv = xv.repeat_interleave(self.num_queries_per_kv, dim=2)

        xq = xq.transpose(1, 2)
        xk = xk.transpose(1, 2)
        xv = xv.transpose(1, 2)

        scores = torch.matmul(xq, xk.transpose(-2, -1)) / math.sqrt(self.head_dim)

        if mask is None:
            mask = self._build_mask(seq_len, device)
        scores = scores + mask

        # Softmax in float32 for stability, then cast back to the input dtype.
        probs = F.softmax(scores, dim=-1, dtype=torch.float32).to(xq.dtype)
        out = torch.matmul(probs, xv)
        out = out.transpose(1, 2).contiguous().view(batch, seq_len, -1)
        return self.wo(out)
| | |
| | |
class ParallelTransformerBlock(nn.Module):
    """Decoder block with parallel attention and MLP branches.

    Both branches read the same single-normed input and their outputs are
    summed with the residual in one step (GPT-J / PaLM style parallelism).
    """

    def __init__(self, config, layer_id):
        super().__init__()
        self.norm = CohereLayerNorm(config.emb_dim)
        self.attn = GroupedQueryAttention(config, layer_id)
        self.mlp = FeedForward(config)

    def forward(self, x, mask=None):
        normed = self.norm(x)
        # attn and mlp are independent of each other; only the shared
        # normalized input feeds both.
        return x + self.attn(normed, mask=mask) + self.mlp(normed)
| | |
| | |
class TinyAya(nn.Module):
    """Tiny Aya-style decoder-only language model (~100M parameters).

    Token embedding -> stack of parallel transformer blocks -> final
    LayerNorm -> LM head. The LM head weight is tied to the token
    embedding matrix to save parameters.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.token_embedding = nn.Embedding(config.vocab_size, config.emb_dim)
        self.layers = nn.ModuleList([
            ParallelTransformerBlock(config, i) for i in range(config.num_layers)
        ])
        self.norm = CohereLayerNorm(config.emb_dim)
        self.lm_head = nn.Linear(config.emb_dim, config.vocab_size, bias=False)
        # Weight tying: output projection shares the embedding matrix.
        self.lm_head.weight = self.token_embedding.weight

        if config.gradient_checkpointing:
            self.gradient_checkpointing_enable()

    def gradient_checkpointing_enable(self):
        """Recompute activations during backward to trade compute for memory."""
        self._gradient_checkpointing = True

    def forward(self, input_ids, mask=None):
        """Return logits of shape (batch, seq_len, vocab_size)."""
        x = self.token_embedding(input_ids)
        for layer in self.layers:
            if self.training and getattr(self, '_gradient_checkpointing', False):
                # use_reentrant=False is the recommended, non-deprecated
                # checkpoint variant (the reentrant default emits a warning
                # and is slated for removal).
                x = torch.utils.checkpoint.checkpoint(
                    layer, x, mask, use_reentrant=False
                )
            else:
                x = layer(x, mask=mask)
        x = self.norm(x)
        return self.lm_head(x)

    @torch.no_grad()
    def generate(self, input_ids, max_new_tokens=50, temperature=1.0):
        """Autoregressively sample up to `max_new_tokens` tokens.

        NOTE: switches the module to eval mode as a side effect. The
        context is truncated to the last `max_seq_len` tokens each step.
        """
        self.eval()
        for _ in range(max_new_tokens):
            logits = self(input_ids[:, -self.config.max_seq_len:])
            next_token_logits = logits[:, -1, :] / temperature
            probs = F.softmax(next_token_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            input_ids = torch.cat([input_ids, next_token], dim=-1)
        return input_ids
| | |
# Build the model and load the downloaded weights.
model = TinyAya(ModelConfig())
# weights_only=True prevents arbitrary code execution via pickle when
# loading checkpoints downloaded from the Hub.
state_dict = torch.load(
    os.path.join(model_path, "pytorch_model.bin"),
    map_location="cpu",
    weights_only=True,
)
model.load_state_dict(state_dict)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bug fix: the generation example below sends inputs to `device`, so the
# model must be moved there too (the original left it on the CPU).
model.to(device)
```
| |
|
### Text Generation Example

```python
# Encode the prompt, sample 50 new tokens, and decode the result.
prompt = """Was Jeffrey a good guy?"""
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
with torch.no_grad():  # generate() already disables grad, but be explicit
    output = model.generate(input_ids, max_new_tokens=50, temperature=0.8)
print("Generated text:")
print(tokenizer.decode(output[0]))
```
| |
|
## Training Details

The model was trained for one epoch on the full dataset using an L4 GPU in Google Colab.
Optimizer: AdamW (lr=1e-4) with gradient clipping (max norm = 1.0). Mixed precision (float16) was used.
| |
|
## Limitations

- The model is small and was trained on a limited dataset; it may produce repetitive or nonsensical outputs.
- It has not undergone any safety fine-tuning; use with caution.
| |
|
## License

MIT
| |
|