---
language: en
license: mit
tags:
- tiny-epstein
- epstein-files
- transformers
---

# tiny-epstein-100m

# Actively being trained — work in progress

# THANK YOU FOR CHECKING OUT THIS MODEL <3
# PLEASE LEAVE A LIKE AND SHARE — I'M TRYING TO GET A JOB AS AN AI ENGINEER
# IF YOU ARE HIRING AI ENGINEERS, PLEASE HMU — I DON'T HAVE A FORMAL EDUCATION, BUT I'M OBVIOUSLY VERY KNOWLEDGEABLE

A small transformer model (~100M parameters) trained on the [teyler/epstein-files-20k](https://huggingface.co/datasets/teyler/epstein-files-20k) dataset. The architecture is inspired by **Tiny Aya** modifications and is designed for efficient on-device inference.

## Model Details

- **Architecture**: Decoder-only transformer with parallel blocks, Grouped Query Attention (GQA), SwiGLU activation, and bias-free LayerNorm.
- **Sliding Window Attention**: 3:1 local:global ratio (first 75% of layers use sliding window with RoPE; remaining layers use full attention with NoPE).
- **Parameters**: ~100 million
- **Context Length**: 1024 tokens (configurable)
- **Tokenizer**: GPT-2 (same as used during training)
- **Training Data**: [teyler/epstein-files-20k](https://huggingface.co/datasets/teyler/epstein-files-20k) – 20,000 documents related to the Epstein files.

## Intended Use

This model is primarily for research and experimentation. It can generate continuations of text given a prompt, especially on topics related to the Epstein files.

## How to Use

### Installation

Make sure you have `torch` and `transformers` installed.
If you want to run inference, install the required packages:

```bash
pip install torch transformers
```

### Loading the Model and Tokenizer

```python
import math
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from huggingface_hub import snapshot_download
from transformers import GPT2TokenizerFast

# Download the checkpoint from the Hugging Face Hub.
model_path = snapshot_download(repo_id="liminerity/tiny-epstein-100m")

# The model was trained with the GPT-2 tokenizer; GPT-2 defines no pad token,
# so reuse EOS for padding.
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token


# ------------------------------------------------------------------------------
# Configuration (~100M parameters, sized to fit an L4 GPU)
# ------------------------------------------------------------------------------
class ModelConfig:
    """Hyper-parameters for the tiny-epstein-100m architecture."""

    vocab_size = 50257            # GPT-2 vocabulary size (updated from tokenizer at training time)
    emb_dim = 768                 # embedding dimension
    hidden_dim = 2048             # intermediate (FFN) size
    num_layers = 12               # number of transformer layers
    num_heads = 12                # number of query heads
    num_kv_heads = 4              # number of key/value heads (GQA)
    max_seq_len = 1024            # maximum sequence length
    window_size = 1024            # sliding-window size (matches max_seq_len)
    sliding_window_ratio = 0.75   # fraction of layers that use sliding-window attention
    rope_theta = 10000.0          # base frequency for RoPE
    dtype = torch.float16         # dtype used for mixed-precision training
    bias = False                  # no bias in linear layers
    dropout = 0.0                 # no dropout
    gradient_checkpointing = True # saves memory during training; no-op at inference


# ------------------------------------------------------------------------------
# Helper modules
# ------------------------------------------------------------------------------
class CohereLayerNorm(nn.Module):
    """LayerNorm with a learned scale but no bias; statistics computed in fp32."""

    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(emb_dim))

    def forward(self, x):
        input_dtype = x.dtype
        x = x.to(torch.float32)  # normalize in fp32 for numerical stability
        mean = x.mean(dim=-1, keepdim=True)
        variance = (x - mean).pow(2).mean(dim=-1, keepdim=True)
        x = (x - mean) * torch.rsqrt(variance + self.eps)
        return (self.weight.to(torch.float32) * x).to(input_dtype)


class FeedForward(nn.Module):
    """SwiGLU MLP: silu(fc1(x)) * fc2(x), projected back to emb_dim by fc3."""

    def __init__(self, config):
        super().__init__()
        self.fc1 = nn.Linear(config.emb_dim, config.hidden_dim, bias=config.bias)
        self.fc2 = nn.Linear(config.emb_dim, config.hidden_dim, bias=config.bias)
        self.fc3 = nn.Linear(config.hidden_dim, config.emb_dim, bias=config.bias)

    def forward(self, x):
        return self.fc3(F.silu(self.fc1(x)) * self.fc2(x))


def precompute_rope_freqs(dim, max_seq_len, theta=10000.0, dtype=torch.float32):
    """Precompute rotary position embedding tables.

    Returns (sin, cos), each of shape (max_seq_len, dim).
    """
    assert dim % 2 == 0, "dim must be even"
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=dtype)[:(dim // 2)] / dim))
    t = torch.arange(max_seq_len, dtype=dtype)
    freqs = torch.outer(t, freqs)            # (max_seq_len, dim // 2)
    emb = torch.cat((freqs, freqs), dim=-1)  # (max_seq_len, dim)
    return emb.sin(), emb.cos()


def rotate_half(x):
    """Rotate half the hidden dims of the input: (x1, x2) -> (-x2, x1)."""
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_emb(x, cos, sin):
    """Apply rotary embeddings to the input tensor.

    x: (batch, seq_len, num_heads, head_dim); cos/sin: (seq_len, head_dim).
    """
    cos = cos.unsqueeze(0).unsqueeze(2)  # (1, seq_len, 1, head_dim)
    sin = sin.unsqueeze(0).unsqueeze(2)  # (1, seq_len, 1, head_dim)
    return (x * cos) + (rotate_half(x) * sin)


class GroupedQueryAttention(nn.Module):
    """Multi-head attention with GQA and an optional sliding-window mask.

    The first `sliding_window_ratio` fraction of layers use sliding-window
    attention with RoPE; the remaining layers use full causal attention
    with no positional encoding (NoPE).
    """

    def __init__(self, config, layer_id):
        super().__init__()
        self.num_heads = config.num_heads
        self.num_kv_heads = config.num_kv_heads
        self.head_dim = config.emb_dim // config.num_heads
        assert self.num_heads % self.num_kv_heads == 0
        self.num_queries_per_kv = self.num_heads // self.num_kv_heads

        self.wq = nn.Linear(config.emb_dim, config.num_heads * self.head_dim, bias=config.bias)
        self.wk = nn.Linear(config.emb_dim, config.num_kv_heads * self.head_dim, bias=config.bias)
        self.wv = nn.Linear(config.emb_dim, config.num_kv_heads * self.head_dim, bias=config.bias)
        self.wo = nn.Linear(config.num_heads * self.head_dim, config.emb_dim, bias=config.bias)

        num_sliding = int(config.num_layers * config.sliding_window_ratio)
        self.use_sliding = (layer_id < num_sliding)
        self.window_size = config.window_size
        self.max_seq_len = config.max_seq_len
        self.rope_theta = config.rope_theta
        self.rope_sin, self.rope_cos = None, None

    def init_rope(self, max_seq_len, device):
        # Lazily (re)build the RoPE tables when a longer sequence is seen.
        if self.rope_sin is not None and self.rope_sin.shape[0] >= max_seq_len:
            return
        sin, cos = precompute_rope_freqs(
            self.head_dim, max_seq_len, theta=self.rope_theta, dtype=torch.float32
        )
        self.rope_sin = sin.to(device)
        self.rope_cos = cos.to(device)

    def forward(self, x, mask=None):
        batch, seq_len, _ = x.shape
        device = x.device
        if self.use_sliding:
            self.init_rope(seq_len, device)

        xq = self.wq(x).view(batch, seq_len, self.num_heads, self.head_dim)
        xk = self.wk(x).view(batch, seq_len, self.num_kv_heads, self.head_dim)
        xv = self.wv(x).view(batch, seq_len, self.num_kv_heads, self.head_dim)

        if self.use_sliding:
            xq = apply_rotary_emb(xq, self.rope_cos[:seq_len], self.rope_sin[:seq_len])
            xk = apply_rotary_emb(xk, self.rope_cos[:seq_len], self.rope_sin[:seq_len])

        # Expand KV heads so every query head has a matching key/value head.
        xk = xk.repeat_interleave(self.num_queries_per_kv, dim=2)
        xv = xv.repeat_interleave(self.num_queries_per_kv, dim=2)

        xq = xq.transpose(1, 2)
        xk = xk.transpose(1, 2)
        xv = xv.transpose(1, 2)

        scores = torch.matmul(xq, xk.transpose(-2, -1)) / math.sqrt(self.head_dim)
        if mask is not None:
            scores = scores + mask
        else:
            # Causal mask: position i may attend only to positions j <= i.
            mask = torch.full((seq_len, seq_len), float('-inf'), device=device)
            mask = torch.triu(mask, diagonal=1)
            if self.use_sliding:
                # FIX (perf): vectorized sliding window instead of an O(n^2)
                # Python loop. tril(diagonal=-window_size) masks exactly
                # j <= i - window_size, identical to the original per-row loop.
                window = torch.full((seq_len, seq_len), float('-inf'), device=device)
                mask = mask + torch.tril(window, diagonal=-self.window_size)
            scores = scores + mask

        probs = F.softmax(scores, dim=-1, dtype=torch.float32).to(xq.dtype)
        out = torch.matmul(probs, xv)
        out = out.transpose(1, 2).contiguous().view(batch, seq_len, -1)
        return self.wo(out)


class ParallelTransformerBlock(nn.Module):
    """Decoder block: attention and MLP run in parallel off one shared norm."""

    def __init__(self, config, layer_id):
        super().__init__()
        self.norm = CohereLayerNorm(config.emb_dim)
        self.attn = GroupedQueryAttention(config, layer_id)
        self.mlp = FeedForward(config)

    def forward(self, x, mask=None):
        normed = self.norm(x)
        return x + self.attn(normed, mask=mask) + self.mlp(normed)


class TinyAya(nn.Module):
    """Tiny Aya-style ~100M decoder-only language model."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.token_embedding = nn.Embedding(config.vocab_size, config.emb_dim)
        self.layers = nn.ModuleList([
            ParallelTransformerBlock(config, i) for i in range(config.num_layers)
        ])
        self.norm = CohereLayerNorm(config.emb_dim)
        self.lm_head = nn.Linear(config.emb_dim, config.vocab_size, bias=False)
        self.lm_head.weight = self.token_embedding.weight  # weight tying
        if config.gradient_checkpointing:
            self.gradient_checkpointing_enable()

    def gradient_checkpointing_enable(self):
        self._gradient_checkpointing = True

    def forward(self, input_ids, mask=None):
        x = self.token_embedding(input_ids)
        for layer in self.layers:
            if self.training and getattr(self, '_gradient_checkpointing', False):
                # use_reentrant=False is the recommended (and future-default)
                # checkpointing mode; only active during training.
                x = torch.utils.checkpoint.checkpoint(layer, x, mask, use_reentrant=False)
            else:
                x = layer(x, mask=mask)
        x = self.norm(x)
        return self.lm_head(x)

    @torch.no_grad()
    def generate(self, input_ids, max_new_tokens=50, temperature=1.0):
        """Autoregressively sample `max_new_tokens` continuation tokens."""
        self.eval()
        for _ in range(max_new_tokens):
            # Crop the context to the model's maximum sequence length.
            logits = self(input_ids[:, -self.config.max_seq_len:])
            next_token_logits = logits[:, -1, :] / temperature
            probs = F.softmax(next_token_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            input_ids = torch.cat([input_ids, next_token], dim=-1)
        return input_ids


model = TinyAya(ModelConfig())
state_dict = torch.load(
    os.path.join(model_path, "pytorch_model.bin"),
    map_location="cpu",
    weights_only=True,  # FIX: never unpickle arbitrary objects from a downloaded file
)
model.load_state_dict(state_dict)
model.eval()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # FIX: the model must live on the same device as the inputs
```

### Text Generation Example

```python
prompt = """Was Jeffrey a good guy?"""
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

with torch.no_grad():
    output = model.generate(input_ids, max_new_tokens=50, temperature=0.8)

print("Generated text:")
print(tokenizer.decode(output[0]))
```

## Training Details

The model was trained for one epoch on the full dataset using an L4 GPU in Google Colab.

- **Optimizer**: AdamW (lr=1e-4) with gradient clipping (max norm=1.0).
- **Precision**: Mixed precision (float16).

## Limitations

- The model is small and was trained on a limited dataset; it may produce repetitive or nonsensical outputs.
- It has not undergone any safety fine-tuning; use with caution.

## License

MIT