|
|
--- |
|
|
license: apache-2.0 |
|
|
datasets: |
|
|
- bookcorpus/bookcorpus |
|
|
language: |
|
|
- en |
|
|
pipeline_tag: text-generation |
|
|
tags: |
|
|
- mini-gpt |
|
|
- gpt-1 |
|
|
- gpt |
|
|
- generative-pretrained-transformer |
|
|
- decoder-only-transformer |
|
|
--- |
|
|
# Introducing Mini GPT-1 (~55M parameters)
|
|
This is a custom decoder-only transformer model (GPT-1-style) trained from scratch on the BookCorpus dataset using PyTorch by Dilip Pokhrel.
|
|
|
|
|
## Model Details |
|
|
|
|
|
- **Architecture**: Decoder-only Transformer |
|
|
- **Layers**: 6 |
|
|
- **Embedding Size**: 512 |
|
|
- **Heads**: 8 |
|
|
- **Feedforward Dim**: 2048 |
|
|
- **Sequence Length**: 128 |
|
|
- **Vocab Size**: 35,000 |
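
As a sanity check on the "~55M parameters" figure, the listed dimensions roughly account for it. The sketch below is an estimate, assuming untied input/output embeddings and ignoring biases and LayerNorm weights; it is not read from the checkpoint.

```python
# Rough parameter-count estimate from the hyperparameters above.
# Assumption: untied input/output embeddings; biases and LayerNorm
# weights (well under 1M parameters) are ignored.
vocab_size, max_len, embed_dim, ff_dim, depth = 35_000, 128, 512, 2048, 6

token_emb = vocab_size * embed_dim   # 17.92M token embeddings
pos_emb   = max_len * embed_dim      # ~0.07M positional embeddings
attn      = 4 * embed_dim ** 2       # Q, K, V and output projections
ffn       = 2 * embed_dim * ff_dim   # the two feedforward projections
blocks    = depth * (attn + ffn)     # ~18.87M across the 6 layers
lm_head   = embed_dim * vocab_size   # 17.92M output projection (untied)

total = token_emb + pos_emb + blocks + lm_head
print(f"~{total / 1e6:.1f}M parameters")  # ~54.8M
```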
|
|
|
|
|
## Tokenizer |
|
|
|
|
|
The tokenizer was trained using `ByteLevelBPETokenizer` from the `tokenizers` library.
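
For reference, a byte-level BPE tokenizer of this size can be trained in a few lines. The corpus path and special tokens below are illustrative assumptions, not the exact settings used for this model; only `vocab_size` is taken from the model details above:

```python
from tokenizers import ByteLevelBPETokenizer

# Illustrative sketch: the corpus file and special tokens are
# assumptions; only vocab_size comes from the model card.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files=["bookcorpus.txt"],  # hypothetical path to the raw training text
    vocab_size=35_000,
    min_frequency=2,
    special_tokens=["[PAD]", "[BOS]", "[EOS]"],
)
tokenizer.save_model(".")  # writes vocab.json and merges.txt
```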
|
|
|
|
|
## Inference Example |
|
|
|
|
|
You can run the example below in Google Colab (https://colab.research.google.com):
|
|
|
|
|
```python
# Clone the repo only if it is not already present
import os
if not os.path.exists("mini-gpt1"):
    !git clone https://huggingface.co/dilip025/mini-gpt1

# Install dependencies (uncomment if you haven't installed them yet)
# !pip install torch tokenizers

# Make the repo importable
import sys
sys.path.append("mini-gpt1")

# Imports
import torch
from tokenizers import ByteLevelBPETokenizer
from model_code.decoder_only_transformer import DecoderOnlyTransformer

# Load the tokenizer from the repo's vocab and merges files
tokenizer = ByteLevelBPETokenizer(
    "mini-gpt1/vocab.json",
    "mini-gpt1/merges.txt",
)

# Model config (must match the training configuration listed above)
vocab_size = 35000
max_len = 128
embed_dim = 512
num_heads = 8
depth = 6
ff_dim = 2048

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the model and load the trained weights
model = DecoderOnlyTransformer(
    vocab_size=vocab_size,
    max_len=max_len,
    embed_dim=embed_dim,
    num_heads=num_heads,
    depth=depth,
    ff_dim=ff_dim,
).to(device)

state_dict = torch.load("mini-gpt1/pytorch_model.bin", map_location=device)
model.load_state_dict(state_dict)
model.eval()

# 💡 Generation function with temperature and top-k sampling
@torch.no_grad()
def generate(model, tokenizer, prompt, max_length=50, temperature=1.0, top_k=50):
    model.eval()
    device = next(model.parameters()).device

    encoding = tokenizer.encode(prompt)
    input_ids = torch.tensor([encoding.ids], dtype=torch.long).to(device)
    generated = input_ids.clone()

    for _ in range(max_length):
        # Feed at most the last max_len tokens so the input never
        # exceeds the model's positional embedding table
        context = generated[:, -max_len:]
        logits = model(context)  # [1, T, vocab_size]
        next_token_logits = logits[:, -1, :] / temperature

        # Keep only the top_k highest-scoring tokens; mask out the rest
        if top_k is not None:
            values, indices = torch.topk(next_token_logits, top_k)
            mask = torch.full_like(next_token_logits, float('-inf'))
            mask.scatter_(1, indices, values)
            next_token_logits = mask

        # Sample the next token from the filtered distribution
        probs = torch.softmax(next_token_logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)

        generated = torch.cat((generated, next_token), dim=1)

        # Optional: stop on the [EOS] token if the tokenizer defines one
        eos_id = tokenizer.token_to_id('[EOS]')
        if eos_id is not None and next_token.item() == eos_id:
            break

    return tokenizer.decode(generated[0].tolist())


# 🔥 Example inference -- run this in a second cell to see gibberish ;)
prompt = "He told me a story"
output = generate(model, tokenizer, prompt, max_length=100, temperature=1.2, top_k=40)
print("Generated Output:\n", output)
```
|
|
|