In [None]:
# cell 1: install
!pip install -q transformers datasets accelerate torch torchvision sentencepiece

# For progress bars
!pip install -q tqdm


In [None]:
# cell 2: upload your train.jsonl if not already present
# The print below is an instruction to the reader: files.upload() is called
# unconditionally, so to "skip the upload step" you must skip running this cell
# (or cancel the upload widget).
# NOTE(review): the recorded output of this cell shows a file named
# train-50000.jsonl being saved, while the next cell reads "train.jsonl" --
# confirm the two names agree before a fresh Restart & Run All.
from google.colab import files
print("If you already uploaded train.jsonl, skip the upload step.")
uploaded = files.upload()  # expects a JSON-lines file whose records have "input" and "output" fields


If you already uploaded train.jsonl, skip the upload step.


Saving train-50000.jsonl to train-50000.jsonl


In [None]:
# cell 3: load HF DatasetDict (expects your train.jsonl in /content)
from datasets import load_dataset, DatasetDict

# If user uploaded as 'train.jsonl', otherwise change filename.
# The path is relative, so it is resolved against the kernel's current
# working directory.
ds = load_dataset("json", data_files={"train":"train.jsonl"})["train"]

# create a small validation split for quick checks
# NOTE: `ds` is rebound here from the single loaded split to the result of
# train_test_split (indexed below by "train" and "test"). The fixed seed keeps
# the 92% / 8% partition reproducible across runs.
ds = ds.train_test_split(test_size=0.08, seed=42)
# Re-key the held-out part as "validation"; later cells read
# dataset["train"] and dataset["validation"].
dataset = DatasetDict({"train": ds["train"], "validation": ds["test"]})
print(dataset)


Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 46000
    })
    validation: Dataset({
        features: ['input', 'output'],
        num_rows: 4000
    })
})


In [None]:
# cell 4: imports and helper functions
import math, time, os, random
from typing import Optional
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

# Notebook-wide globals used by every later cell: `device`, `tokenizer`, `vocab_size`.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# gpt2 tokenizer has no pad token; set it
# A dedicated "<|pad|>" token is added so padding gets its own id; the loss
# code later passes tokenizer.pad_token_id as ignore_index.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
# len(tokenizer) includes the added pad token (50258 in the recorded run), so
# embedding / output layers sized with vocab_size can index the pad id.
vocab_size = len(tokenizer)

def compute_param_count(n_layers, d_model, d_ff, n_heads, max_seq_len=1024,
                        untied_head=False, vocab=None):
    """Return a rough parameter-count estimate for a GPT-style decoder.

    Only the large weight matrices are counted; biases and LayerNorm
    parameters are ignored.

    Args:
        n_layers: number of transformer blocks.
        d_model: hidden width.
        d_ff: inner width of the feed-forward sublayer.
        n_heads: accepted for signature symmetry with the model config but
            unused -- splitting d_model across heads does not change the
            number of attention weights.
        max_seq_len: rows in the learned positional table. Defaults to 1024,
            which is what this helper always assumed.
        untied_head: when True, add a separate vocab x d_model output
            projection. The default (False) counts the vocabulary matrix
            once, matching the helper's historical output; TinyGPT in this
            notebook uses a separate `head` layer, so pass True for an
            estimate closer to that model.
        vocab: vocabulary size; falls back to the notebook-level
            `vocab_size` when omitted.

    Returns:
        The estimated parameter count as an int.
    """
    n_vocab = vocab_size if vocab is None else vocab
    # embeddings: vocab*d_model + pos*d_model
    emb = n_vocab * d_model + d_model * max_seq_len
    # per layer: attn (qkv + out) ~ 4 * d_model * d_model  (approx),
    # feedforward ~ 2 * d_model * d_ff
    per_layer = 4 * d_model * d_model + 2 * d_model * d_ff
    total = emb + n_layers * per_layer
    if untied_head:
        total += n_vocab * d_model
    return total


Device: cuda


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# cell 5: tiny transformer blocks
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention in which each position attends only to itself and earlier positions.

    Args:
        d_model: input/output feature width; must be divisible by `n_heads`.
        n_heads: number of attention heads.
        max_seq_len: side length of the pre-built lower-triangular causal
            mask, i.e. the longest sequence `forward` accepts. Defaults to
            1024, the size this module previously hard-coded.
    """

    def __init__(self, d_model, n_heads, max_seq_len=1024):
        super().__init__()
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads
        # Scaled dot-product attention: divide scores by sqrt(head_dim).
        self.scale = self.head_dim ** -0.5
        # One fused projection produces queries, keys and values together.
        self.qkv = nn.Linear(d_model, 3 * d_model)
        self.out = nn.Linear(d_model, d_model)
        # Lower-triangular ones; registered as a buffer so it follows the
        # module across devices. It stays a regular (persistent) buffer so
        # the state_dict keeps its existing "mask" entry.
        self.register_buffer("mask", torch.tril(torch.ones(1, 1, max_seq_len, max_seq_len)))

    def forward(self, x):
        """Apply causal self-attention to `x` of shape (B, T, C) and return a tensor of the same shape.

        Raises:
            ValueError: if T exceeds the size of the causal mask.
        """
        B, T, C = x.size()
        limit = self.mask.size(-1)
        if T > limit:
            # Without this check the sliced mask would be smaller than the
            # (T, T) score matrix and masked_fill would fail to broadcast.
            raise ValueError(f"sequence length {T} exceeds causal mask size {limit}")
        qkv = self.qkv(x)  # (B,T,3C)
        q, k, v = qkv.chunk(3, dim=2)
        q = q.view(B, T, self.n_heads, self.head_dim).transpose(1, 2)  # B,heads,T,hd
        k = k.view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
        v = v.view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
        att = (q @ k.transpose(-2, -1)) * self.scale  # B,heads,T,T
        # apply causal mask (only use first T x T part of mask)
        mask = self.mask[:, :, :T, :T]
        # -inf scores become exactly zero weight after the softmax.
        att = att.masked_fill(mask == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        out = att @ v  # B,heads,T,head
        # Merge the heads back into a single feature dimension.
        out = out.transpose(1, 2).contiguous().view(B, T, C)
        return self.out(out)

class FeedForward(nn.Module):
    """Position-wise MLP: expand d_model -> d_ff, apply GELU, project back to d_model."""

    def __init__(self, d_model, d_ff):
        super().__init__()
        # Layers are created in forward order so the Sequential indices
        # (0 = expand, 1 = activation, 2 = project) stay as they were.
        expand = nn.Linear(d_model, d_ff)
        project = nn.Linear(d_ff, d_model)
        self.net = nn.Sequential(expand, nn.GELU(), project)

    def forward(self, x):
        """Run the MLP over the last dimension of `x`."""
        return self.net(x)

class TransformerBlock(nn.Module):
    """One decoder block: LayerNorm is applied before each sublayer and the sublayer output is added back residually."""

    def __init__(self, d_model, n_heads, d_ff):
        super().__init__()
        # Attention sublayer and its input norm.
        self.ln1 = nn.LayerNorm(d_model)
        self.attn = CausalSelfAttention(d_model, n_heads)
        # Feed-forward sublayer and its input norm.
        self.ln2 = nn.LayerNorm(d_model)
        self.mlp = FeedForward(d_model, d_ff)

    def forward(self, x):
        """Return `x` after the attention and feed-forward residual updates."""
        attended = self.attn(self.ln1(x))
        hidden = x + attended
        transformed = self.mlp(self.ln2(hidden))
        return hidden + transformed


In [None]:
# cell 6: full TinyGPT class
class TinyGPT(nn.Module):
    """Small GPT-style decoder-only language model.

    Token and learned positional embeddings are summed, passed through
    dropout and a stack of `TransformerBlock`s, normalised, and projected to
    vocabulary logits by a separate (untied) linear head.

    Args:
        vocab_size: number of token ids the embedding and output head cover.
        n_layers: number of transformer blocks.
        n_heads: attention heads per block.
        d_model: hidden width.
        d_ff: inner width of each block's feed-forward sublayer.
        max_seq_len: rows in the positional table, i.e. the longest sequence
            `forward` accepts. The blocks are built with their own default
            settings, so values above the attention blocks' causal-mask size
            are not supported here.
    """

    def __init__(self, vocab_size, n_layers=4, n_heads=4, d_model=256, d_ff=1024, max_seq_len=512):
        super().__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        # Kept so callers (and the forward-time check) can read the context length.
        self.max_seq_len = max_seq_len
        self.token_emb = nn.Embedding(vocab_size, d_model)
        self.pos_emb = nn.Embedding(max_seq_len, d_model)
        self.drop = nn.Dropout(0.1)
        self.blocks = nn.ModuleList([TransformerBlock(d_model, n_heads, d_ff) for _ in range(n_layers)])
        self.ln_f = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size, bias=False)

    def forward(self, idx):
        """Map token ids `idx` of shape (B, T) to logits of shape (B, T, vocab_size).

        Raises:
            ValueError: if T exceeds the positional table size.
        """
        B, T = idx.shape
        limit = self.pos_emb.num_embeddings
        if T > limit:
            # Fail early with a readable message: an out-of-range position
            # lookup otherwise surfaces as an opaque index error on CPU or a
            # device-side assert on CUDA.
            raise ValueError(f"sequence length {T} exceeds max_seq_len {limit}")
        positions = torch.arange(0, T, device=idx.device).unsqueeze(0).expand(B, T)
        x = self.token_emb(idx) + self.pos_emb(positions)
        x = self.drop(x)
        for block in self.blocks:
            x = block(x)
        x = self.ln_f(x)
        logits = self.head(x)
        return logits


In [None]:
# cell 7: create a small model (configurable)
# recommended safe small config for Colab Free: n_layers=4, d_model=256, n_heads=4, d_ff=1024
# max_seq_len here is the size of the model's positional table; it matches the
# max_len=256 used when tokenizing in the dataset cell.
config = dict(n_layers=4, n_heads=4, d_model=256, d_ff=1024, max_seq_len=256)
# NOTE: the estimate is built from the four arguments passed here only -- it
# assumes 1024 positions rather than config["max_seq_len"], and it counts the
# vocabulary matrix once although TinyGPT has a separate output head, so it
# understates this model's true size.
approx = compute_param_count(config['n_layers'], config['d_model'], config['d_ff'], config['n_heads'])
print("Approx param count (very rough):", int(approx))
# `model` is the notebook-global instance trained, sampled from and saved by later cells.
model = TinyGPT(vocab_size, **config).to(device)
print(model)


Approx param count (very rough): 16273920
TinyGPT(
  (token_emb): Embedding(50258, 256)
  (pos_emb): Embedding(256, 256)
  (drop): Dropout(p=0.1, inplace=False)
  (blocks): ModuleList(
    (0-3): 4 x TransformerBlock(
      (ln1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (attn): CausalSelfAttention(
        (qkv): Linear(in_features=256, out_features=768, bias=True)
        (out): Linear(in_features=256, out_features=256, bias=True)
      )
      (ln2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (mlp): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=256, out_features=1024, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=1024, out_features=256, bias=True)
        )
      )
    )
  )
  (ln_f): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (head): Linear(in_features=256, out_features=50258, bias=False)
)


In [None]:
# cell 8: build text examples (combine input+output)
def build_text(example):
    """Join a record's whitespace-trimmed "input" and "output" fields with a single newline."""
    prompt_part = example["input"].strip()
    answer_part = example["output"].strip()
    return f"{prompt_part}\n{answer_part}"

# Fixed token length every example is truncated / padded to.
max_len = 256

def encode_examples(batch):
    """Tokenize a collection of records into fixed-length tensors for causal-LM training.

    Iterating `batch` must yield individual records carrying "input" and
    "output" fields (each one is passed to `build_text`).

    Returns:
        A dict with "input_ids" (the padded token-id tensor) and "labels"
        (an independent copy of it; the next-token shift is applied later
        by the loss code, not here).
    """
    joined = list(map(build_text, batch))
    encoded = tokenizer(
        joined,
        truncation=True,
        padding="max_length",
        max_length=max_len,
        return_tensors="pt",
    )
    token_ids = encoded["input_ids"]
    return {"input_ids": token_ids, "labels": token_ids.clone()}

# Torch map-style wrapper around a Hugging Face dataset split.
class HFDataset(torch.utils.data.Dataset):
    """Serves one tokenized example at a time.

    Each record is flattened to its training string when the dataset is
    built; tokenization happens lazily on every item access.
    """

    def __init__(self, hf_dataset):
        self.examples = list(map(build_text, hf_dataset))

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        encoded = tokenizer(
            self.examples[idx],
            truncation=True,
            padding="max_length",
            max_length=max_len,
            return_tensors="pt",
        )
        # The tokenizer returns a (1, max_len) tensor; drop the leading batch dimension.
        ids = encoded["input_ids"]
        return {"input_ids": ids.squeeze(0), "labels": ids.squeeze(0)}

# Wrap both splits; each wrapper materialises its split's text strings up front.
train_ds = HFDataset(dataset["train"])
val_ds = HFDataset(dataset["validation"])

# Training batches are reshuffled every epoch; validation order is left fixed.
# Batch size 8 is used for both loaders.
train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=8)


In [None]:
# cell 9: training helpers
def evaluate(model, data_loader):
    """Return the mean next-token cross-entropy of `model` over `data_loader`.

    The loss is averaged over all non-padding target tokens in the whole
    loader (token-weighted), so batches that contain little real text do not
    count as much as full ones, and a batch made entirely of padding
    contributes nothing instead of producing NaN.

    Args:
        model: module mapping (B, T) token ids to (B, T, vocab) logits.
        data_loader: iterable of dicts with "input_ids" and "labels" tensors.

    Returns:
        Average loss per target token as a float; 0.0 if there were no
        non-padding targets.

    Side effects:
        Switches `model` to eval mode and leaves it there.
    """
    model.eval()
    pad_id = tokenizer.pad_token_id
    total_loss = 0.0
    total_tokens = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)
            logits = model(input_ids)
            # shift so predictions at t predict t+1 token (standard LM loss)
            shift_logits = logits[:, :-1, :]
            shift_labels = labels[:, 1:]
            # Sum (not mean) per batch so the final division weights every
            # real token equally across batches.
            batch_loss = F.cross_entropy(
                shift_logits.reshape(-1, shift_logits.size(-1)),
                shift_labels.reshape(-1),
                ignore_index=pad_id,
                reduction="sum",
            )
            total_loss += batch_loss.item()
            total_tokens += int((shift_labels != pad_id).sum().item())
    return total_loss / max(1, total_tokens)

# training
# Plain AdamW with a constant learning rate; no scheduler or gradient clipping is used.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
num_epochs = 1  # start with 1 epoch for fast iteration
print("Starting training...")
start = time.time()
for epoch in range(num_epochs):
    # evaluate() at the end of each epoch leaves the model in eval mode, so
    # training mode (dropout on) is re-enabled at the top of every epoch.
    model.train()
    # NOTE: running_loss is accumulated below but never read or reported.
    running_loss = 0.0
    for i, batch in enumerate(train_loader):
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)
        logits = model(input_ids)
        # Next-token objective: the logits at position t are scored against
        # the token at position t+1, so drop the last logit and the first label.
        shift_logits = logits[:, :-1, :].contiguous()
        shift_labels = labels[:, 1:].contiguous()
        # Padding positions are excluded from the loss via ignore_index.
        loss = F.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1), ignore_index=tokenizer.pad_token_id)
        loss.backward()
        optimizer.step()
        # Gradients are cleared after the step so the next backward starts clean.
        optimizer.zero_grad()
        running_loss += loss.item()
        # Log the current batch's loss (not a running average) every 50 steps.
        if i % 50 == 0:
            print(f"Epoch {epoch} Step {i} Loss {loss.item():.4f}")
    val_loss = evaluate(model, val_loader)
    print(f"Epoch {epoch} completed. Validation loss: {val_loss:.4f}")
end = time.time()
print("Training finished in", round(end-start, 1), "seconds")


Starting training...
Epoch 0 Step 0 Loss 11.0609
Epoch 0 Step 50 Loss 1.3375
Epoch 0 Step 100 Loss 0.5094
Epoch 0 Step 150 Loss 0.2872
Epoch 0 Step 200 Loss 0.2452
Epoch 0 Step 250 Loss 0.2308
Epoch 0 Step 300 Loss 0.1905
Epoch 0 Step 350 Loss 0.1591
Epoch 0 Step 400 Loss 0.1171
Epoch 0 Step 450 Loss 0.1049
Epoch 0 Step 500 Loss 0.1037
Epoch 0 Step 550 Loss 0.1163
Epoch 0 Step 600 Loss 0.1156
Epoch 0 Step 650 Loss 0.1019
Epoch 0 Step 700 Loss 0.1147
Epoch 0 Step 750 Loss 0.1103
Epoch 0 Step 800 Loss 0.1040
Epoch 0 Step 850 Loss 0.1255
Epoch 0 Step 900 Loss 0.1070
Epoch 0 Step 950 Loss 0.1180
Epoch 0 Step 1000 Loss 0.1136
Epoch 0 Step 1050 Loss 0.1090
Epoch 0 Step 1100 Loss 0.1055
Epoch 0 Step 1150 Loss 0.1099
Epoch 0 Step 1200 Loss 0.0996
Epoch 0 Step 1250 Loss 0.0955
Epoch 0 Step 1300 Loss 0.1113
Epoch 0 Step 1350 Loss 0.1094
Epoch 0 Step 1400 Loss 0.1115
Epoch 0 Step 1450 Loss 0.1148
Epoch 0 Step 1500 Loss 0.1057
Epoch 0 Step 1550 Loss 0.0997
Epoch 0 Step 1600 Loss 0.1090
Epoch 0 Ste

In [None]:
# cell 10: generation helper (greedy / top-k)
@torch.no_grad()
def generate(model, prompt, max_new_tokens=80, temperature=1.0, top_k=50):
    """Autoregressively extend `prompt` with `model` and return the decoded text.

    Args:
        model: module mapping (B, T) token ids to (B, T, vocab) logits.
        prompt: text to continue; it is truncated to `max_len` tokens.
        max_new_tokens: number of tokens to append.
        temperature: softmax temperature. Values <= 0 select greedy
            (argmax) decoding instead of sampling.
        top_k: when sampling, restrict the choice to the `top_k` highest
            scoring tokens. None or <= 0 disables the restriction; values
            larger than the vocabulary are clamped.

    Returns:
        The prompt plus generated continuation, decoded with special tokens removed.

    Raises:
        ValueError: if `prompt` tokenizes to zero tokens.

    Side effects:
        Switches `model` to eval mode and leaves it there.
    """
    model.eval()
    tokens = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_len).input_ids.to(device)
    if tokens.size(1) == 0:
        raise ValueError("prompt must contain at least one token")
    # The model can only embed as many positions as its positional table has
    # rows, so feed it a sliding window of the most recent tokens. Without
    # this, a long prompt plus new tokens indexes past the table.
    pos_table = getattr(model, "pos_emb", None)
    context_len = pos_table.num_embeddings if pos_table is not None else max_len
    for _ in range(max_new_tokens):
        context = tokens[:, -context_len:]
        next_logits = model(context)[:, -1, :]
        if temperature is None or temperature <= 0:
            # Greedy decoding: always take the most likely token.
            next_token = next_logits.argmax(dim=-1, keepdim=True)
        else:
            next_logits = next_logits / temperature
            if top_k is not None and top_k > 0:
                k = min(int(top_k), next_logits.size(-1))
                # Smallest logit still inside the top-k set, per batch row.
                kth_best = torch.topk(next_logits, k=k).values[:, -1:]
                # Everything below it gets zero probability after softmax.
                next_logits = next_logits.masked_fill(next_logits < kth_best, float("-inf"))
            probs = F.softmax(next_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
        tokens = torch.cat([tokens, next_token], dim=1)
    return tokenizer.decode(tokens[0], skip_special_tokens=True)

# Quick qualitative check: sample a continuation from the model trained above.
# No random seed is set, so the sampled text differs from run to run.
prompt = "Write a short paragraph about renewable energy adoption in the UK."
print(generate(model, prompt, max_new_tokens=80, temperature=0.9, top_k=40))


Write a short paragraph about renewable energy adoption in the UK.
This passage provides a clear, and neutral explanation about renewable energy. It offers context, structured detail, and meaningful insight while avoiding biased or inappropriate content. It offers context, and meaningful insight while avoiding biased or inappropriate content. It offers context, and meaningful insight while avoiding biased or inappropriate content. It offers context, and meaningful insight while avoiding biased or inappropriate content. It offers context, structured detail,


In [None]:
# cell 11: multi-config runner (quick-and-dirty)
# Trains a fresh TinyGPT per configuration for one epoch on the shared loaders,
# then records its validation loss and one generated sample for comparison.
configs = [
    {"name":"small","n_layers":2,"n_heads":2,"d_model":128,"d_ff":512},
    {"name":"base","n_layers":4,"n_heads":4,"d_model":256,"d_ff":1024},
    # Only attempt large if you have RAM/GPU
    # {"name":"wide","n_layers":6,"n_heads":6,"d_model":384,"d_ff":1536},
]

results = []

for cfg in configs:
    print("Running config:", cfg["name"])
    approx = compute_param_count(cfg["n_layers"], cfg["d_model"], cfg["d_ff"], cfg["n_heads"])
    print("Approx params:", int(approx))
    # The positional table is sized to the tokenization length used by the loaders.
    m = TinyGPT(vocab_size, n_layers=cfg["n_layers"], n_heads=cfg["n_heads"],
                d_model=cfg["d_model"], d_ff=cfg["d_ff"], max_seq_len=max_len).to(device)
    opt = torch.optim.AdamW(m.parameters(), lr=3e-4)
    # small 1-epoch train
    for epoch in range(1):
        m.train()
        for i, batch in enumerate(train_loader):
            inp = batch["input_ids"].to(device)
            lab = batch["labels"].to(device)
            logits = m(inp)
            # Next-token loss: drop the last logit and the first label; padding is ignored.
            loss = F.cross_entropy(logits[:, :-1, :].reshape(-1, logits.size(-1)), lab[:,1:].reshape(-1), ignore_index=tokenizer.pad_token_id)
            loss.backward()
            opt.step()
            opt.zero_grad()
            if i%100==0:
                print(cfg["name"], "step", i, "loss", loss.item())
    val_loss = evaluate(m, val_loader)
    sample = generate(m, "Describe the future of renewable energy in the UK.", max_new_tokens=60)
    results.append({"config":cfg["name"], "val_loss":val_loss, "sample":sample})
    # free up memory
    # Deleting the model alone releases nothing: the optimizer still references
    # every parameter (plus its own per-parameter state), and the last batch's
    # tensors -- notably the (batch, seq, vocab) logits -- are still bound to
    # loop variables. Drop all of them before asking the allocator to return
    # cached blocks.
    del m, opt
    inp = lab = logits = loss = None
    torch.cuda.empty_cache()

print("Summary:")
for r in results: print(r)


Running config: small
Approx params: 6957312
small step 0 loss 10.936142921447754
small step 100 loss 2.0066487789154053
small step 200 loss 0.7831152081489563
small step 300 loss 0.4042394757270813
small step 400 loss 0.27594664692878723
small step 500 loss 0.23986797034740448
small step 600 loss 0.2129729688167572
small step 700 loss 0.17271828651428223
small step 800 loss 0.1482691466808319
small step 900 loss 0.13474945724010468
small step 1000 loss 0.13562719523906708
small step 1100 loss 0.1322852075099945
small step 1200 loss 0.12768428027629852
small step 1300 loss 0.13074539601802826
small step 1400 loss 0.11563324928283691
small step 1500 loss 0.11895468831062317
small step 1600 loss 0.108633853495121
small step 1700 loss 0.10433203727006912
small step 1800 loss 0.10576470196247101
small step 1900 loss 0.10991769284009933
small step 2000 loss 0.1076023131608963
small step 2100 loss 0.11549754440784454
small step 2200 loss 0.10694943368434906
small step 2300 loss 0.10303946584

In [None]:
# cell 12: save small model example (for base model)
import json

save_dir = "./tinygpt_base"
os.makedirs(save_dir, exist_ok=True)
# Weights only: a state_dict stores tensors, not the architecture.
torch.save(model.state_dict(), os.path.join(save_dir, "pytorch_model.bin"))
tokenizer.save_pretrained(save_dir)

# TinyGPT is a custom class, so the checkpoint can only be reloaded if the
# constructor arguments are known. Store them next to the weights so the
# folder is self-describing: rebuild with TinyGPT(**saved) and then
# load_state_dict(...).
with open(os.path.join(save_dir, "tinygpt_config.json"), "w", encoding="utf-8") as fh:
    json.dump({"vocab_size": vocab_size, **config}, fh, indent=2)

# Optional: convert to HF format (simple script)
# Note: this will not automatically create a Transformers AutoModel repo, but saves checkpoint + tokenizer.
print("Saved to", save_dir)


Saved to ./tinygpt_base


In [None]:
# Cell 13: Push TinyGPT model to Hugging Face Hub
# Uploads the folder written by the save cell (weights + tokenizer files) as-is.
# NOTE: HfFolder is imported but not used anywhere in this cell.
from huggingface_hub import HfApi, HfFolder, upload_folder

# Step 1: Login (only needs to be done once per session)
# notebook_login() renders an interactive widget, so this cell cannot run unattended.
from huggingface_hub import notebook_login
notebook_login()

# Step 2: Define repo name
# The id is "<account>/<repo>"; the account part below is hard-coded and must
# be replaced with one you can write to.
hf_repo_name = "Abdurrahmanesc/tinygpt-base-model"   # <-- change this to your desired repo

# Step 3: Create repo (skip if already exists)
# exist_ok=True makes re-running this cell safe when the repo is already there.
api = HfApi()
api.create_repo(repo_id=hf_repo_name, exist_ok=True)

# Step 4: Upload local folder containing model + tokenizer
# folder_path must match the save_dir used by the save cell.
upload_folder(
    folder_path="./tinygpt_base",
    repo_id=hf_repo_name,
    commit_message="Upload TinyGPT base model"
)

print(f"🚀 TinyGPT model uploaded successfully to: https://huggingface.co/{hf_repo_name}")


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...pt_base/pytorch_model.bin:   0%|          |  643kB /  133MB            

🚀 TinyGPT model uploaded successfully to: https://huggingface.co/Abdurrahmanesc/tinygpt-base-model
