# bayes_mini

`bayes_mini` is a custom GPT-2 (124M) language model trained from scratch on ~20 GB of English Wikipedia text.

## Architecture

- Based on GPT-2 Small (124M parameters)
- 12 layers, 12 attention heads
- Hidden size: 768
- Context length: 1024 tokens
- Vocabulary size: 50257 (GPT-2 BPE)
- Dropout: 0.1
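
These hyperparameters map onto the configuration dict consumed by the model class in the usage example below. A minimal sketch, with the key names taken from that example; the `qkv_bias` value is an assumption here (GPT-2 itself uses biased QKV projections), and the authoritative values ship in the repo's `config.json`:

```python
model_config = {
    "vocab_size": 50257,     # GPT-2 BPE vocabulary
    "context_length": 1024,  # maximum sequence length in tokens
    "emb_dim": 768,          # hidden size
    "n_heads": 12,           # attention heads per layer
    "n_layers": 12,          # transformer blocks
    "drop_rate": 0.1,        # dropout
    "qkv_bias": True,        # assumption; actual value comes from config.json
}
```
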
## Training Configuration

- Dataset: cleaned English Wikipedia (~20 GB)
- Architecture: GPT-2 Small (124M parameters)
- Optimizer settings: `Foundation better_quality`
- Hardware: NVIDIA GeForce RTX 4060 (8 GB VRAM)
- Epochs: 50
- Batch size: 4, with 8 gradient-accumulation steps (effective batch size: 32; see the sketch after this list)
- Learning rate: 2e-4
- Warmup steps: 2000
- Weight decay: 0.01
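
A minimal sketch of how the batch size, gradient accumulation, and warmup settings above fit together in a standard PyTorch loop. `model` and `train_loader` are placeholders for the repo's (unpublished) training script; AdamW is assumed from the listed weight decay, and a constant post-warmup schedule is assumed as well:

```python
import torch

ACCUM_STEPS = 8     # micro-batch 4 x 8 accumulation steps -> effective batch 32
WARMUP_STEPS = 2000

# Assumed optimizer/schedule; `model` and `train_loader` are placeholders.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=0.01)
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer, lambda step: min(1.0, (step + 1) / WARMUP_STEPS)  # linear warmup
)

optimizer.zero_grad()
for step, (inputs, targets) in enumerate(train_loader):
    logits = model(inputs)  # (batch, seq_len, vocab)
    loss = torch.nn.functional.cross_entropy(
        logits.flatten(0, 1), targets.flatten()
    )
    (loss / ACCUM_STEPS).backward()    # accumulate averaged gradients
    if (step + 1) % ACCUM_STEPS == 0:  # weight update every 8 micro-batches
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
```
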
## Install required packages

```bash
pip install torch transformers tiktoken huggingface_hub
```

## Example Usage

```python
import importlib.util
import json

import tiktoken
import torch
from huggingface_hub import hf_hub_download

# === CONFIG ===
REPO_ID = "faizack/bayes_mini_custom"

# === Step 1: Download necessary files ===
config_path = hf_hub_download(repo_id=REPO_ID, filename="config.json")
model_path = hf_hub_download(repo_id=REPO_ID, filename="pytorch_model.bin")
modeling_path = hf_hub_download(repo_id=REPO_ID, filename="modeling_gpt2_custom.py")

# === Step 2: Dynamically import modeling_gpt2_custom.py ===
spec = importlib.util.spec_from_file_location("modeling_gpt2_custom", modeling_path)
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
GPTModel = mod.GPTModel
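# Note: the dynamic import above keeps the example self-contained; it is
# roughly the manual equivalent of what `transformers`' `trust_remote_code=True`
# does for Hub repos that ship custom modeling code.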

# === Step 3: Load config ===
with open(config_path, "r") as f:
    config = json.load(f)

model_config = {
    "vocab_size": config["vocab_size"],
    "context_length": config["n_positions"],
    "emb_dim": config["n_embd"],
    "n_heads": config["n_head"],
    "n_layers": config["n_layer"],
    "drop_rate": config["dropout"],
    "qkv_bias": config["qkv_bias"],
}

# === Step 4: Load tokenizer ===
tokenizer = tiktoken.get_encoding("gpt2")  # standard GPT-2 BPE
prompt = "The rise of artificial intelligence"
input_ids = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)  # (1, seq_len)

# === Step 5: Load model ===
model = GPTModel(model_config)
model.load_state_dict(torch.load(model_path, map_location="cpu"))
model.eval()

# === Step 6: Generate ===
def generate(model, idx, max_new_tokens=50):
    for _ in range(max_new_tokens):
        # Crop the running sequence to the model's context window
        idx_cond = idx[:, -model_config["context_length"]:]
        with torch.no_grad():
            logits = model(idx_cond)
        # Sample the next token from the distribution at the last position
        logits = logits[:, -1, :]
        probs = torch.softmax(logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        idx = torch.cat([idx, next_token], dim=1)
    return idx

output = generate(model, input_ids)
print(tokenizer.decode(output[0].tolist()))
```
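
The `generate` helper above samples from the full, untempered distribution, so outputs can be noisy. A common refinement is temperature scaling plus top-k filtering; the sketch below is a drop-in alternative (not part of the repo) that reuses `model_config` and the objects from the example:

```python
def generate_topk(model, idx, max_new_tokens=50, temperature=0.8, top_k=40):
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -model_config["context_length"]:]
        with torch.no_grad():
            # Lower temperature sharpens the distribution
            logits = model(idx_cond)[:, -1, :] / temperature
        # Keep only the top_k most likely tokens; mask the rest out
        topk_vals, _ = torch.topk(logits, top_k)
        logits[logits < topk_vals[:, [-1]]] = float("-inf")
        probs = torch.softmax(logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        idx = torch.cat([idx, next_token], dim=1)
    return idx

output = generate_topk(model, input_ids)
print(tokenizer.decode(output[0].tolist()))
```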