# bayes_mini
`bayes_mini` is a custom GPT-2 (124M) language model trained from scratch on ~20 GB of English Wikipedia data.
## Architecture
- Based on GPT-2 small (124M parameters)
- 12 layers, 12 attention heads
- Hidden size: 768
- Context length: 1024
- Vocabulary size: 50257
- Dropout: 0.1
## Training Configuration
- Dataset: Cleaned English Wikipedia (~20 GB)
- Architecture: GPT-2 Small (124M parameters)
- Optimizer settings: `Foundation better_quality`
- Hardware: NVIDIA GeForce RTX 4060 (8 GB VRAM)
- Epochs: 50
- Batch size: 4 (gradient accumulation steps: 8 -> effective batch size: 32)
- Learning rate: 2e-4
- Warmup steps: 2000
- Weight decay: 0.01
## Install required packages
```bash
pip install torch transformers tiktoken huggingface_hub
```
## Example Usage
```python
import os
import torch
import json
import tiktoken
import importlib.util
from huggingface_hub import hf_hub_download
# === CONFIG ===
REPO_ID = "faizack/bayes_mini_custom"

# === Step 1: Download the model artifacts from the Hugging Face Hub ===
config_path = hf_hub_download(repo_id=REPO_ID, filename="config.json")
model_path = hf_hub_download(repo_id=REPO_ID, filename="pytorch_model.bin")
modeling_path = hf_hub_download(repo_id=REPO_ID, filename="modeling_gpt2_custom.py")

# === Step 2: Dynamically import modeling_gpt2_custom.py ===
# The repo ships its own model definition, so load it as a module at runtime.
spec = importlib.util.spec_from_file_location("modeling_gpt2_custom", modeling_path)
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
GPTModel = mod.GPTModel  # custom GPT-2 implementation provided by the repo

# === Step 3: Load config and map HF-style keys to the names GPTModel expects ===
with open(config_path, "r") as f:
    config = json.load(f)

model_config = {
    "vocab_size": config["vocab_size"],
    "context_length": config["n_positions"],
    "emb_dim": config["n_embd"],
    "n_heads": config["n_head"],
    "n_layers": config["n_layer"],
    "drop_rate": config["dropout"],
    "qkv_bias": config["qkv_bias"],
}

# === Step 4: Load the tokenizer and encode the prompt ===
tokenizer = tiktoken.get_encoding("gpt2")
prompt = "The rise of artificial intelligence"
input_ids = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)  # add batch dim

# === Step 5: Build the model and load the trained weights ===
model = GPTModel(model_config)
model.load_state_dict(torch.load(model_path, map_location="cpu"))
model.eval()  # disable dropout for inference
# === Step 6: Generate ===
def generate(model, idx, max_new_tokens=50, context_length=None):
    """Autoregressively sample ``max_new_tokens`` tokens from ``model``.

    Args:
        model: Callable mapping a (batch, seq) LongTensor of token ids to
            (batch, seq, vocab) logits.
        idx: (batch, seq) tensor of prompt token ids; not modified in place.
        max_new_tokens: Number of tokens to sample and append.
        context_length: Maximum context window fed to the model. Defaults to
            the globally loaded ``model_config["context_length"]`` for
            backward compatibility with the original snippet.

    Returns:
        A (batch, seq + max_new_tokens) tensor: the prompt followed by the
        sampled continuation.
    """
    if context_length is None:
        context_length = model_config["context_length"]
    for _ in range(max_new_tokens):
        # Crop the running sequence to the model's maximum context window.
        idx_cond = idx[:, -context_length:]
        with torch.no_grad():
            logits = model(idx_cond)
        # Only the last position's logits describe the next token.
        logits = logits[:, -1, :]
        probs = torch.softmax(logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        idx = torch.cat([idx, next_token], dim=1)
    return idx
# Sample a continuation of the prompt and print it as decoded text.
output = generate(model, input_ids)
generated_ids = output[0].tolist()
print(tokenizer.decode(generated_ids))
```