Upload folder using huggingface_hub
Browse files- .gitattributes +2 -0
- __pycache__/dataset.cpython-312.pyc +0 -0
- __pycache__/filter.cpython-312.pyc +0 -0
- __pycache__/model.cpython-312.pyc +0 -0
- colab-scripts/dataset.py +274 -0
- customgen.py +165 -0
- data/filtered_data.jsonl +3 -0
- data/overfit_data.jsonl +10 -0
- data/reason_data.jsonl +0 -0
- data/reasoned2_data.jsonl +0 -0
- data/reasoned_data.jsonl +0 -0
- data/unfiltered_data.jsonl +3 -0
- dataset.py +142 -90
- datasetgen-synthetic.py +75 -0
- datasetgen.py +35 -10
- datasetgen2.py +64 -0
- datasets/5k_synthetic_dataset.jsonl +0 -0
- filter.py +6 -5
- minigpt.py +11 -7
- model.py +18 -5
- train_custom.py +44 -13
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
data/filtered_data.jsonl filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
data/unfiltered_data.jsonl filter=lfs diff=lfs merge=lfs -text
|
__pycache__/dataset.cpython-312.pyc
CHANGED
|
Binary files a/__pycache__/dataset.cpython-312.pyc and b/__pycache__/dataset.cpython-312.pyc differ
|
|
|
__pycache__/filter.cpython-312.pyc
CHANGED
|
Binary files a/__pycache__/filter.cpython-312.pyc and b/__pycache__/filter.cpython-312.pyc differ
|
|
|
__pycache__/model.cpython-312.pyc
CHANGED
|
Binary files a/__pycache__/model.cpython-312.pyc and b/__pycache__/model.cpython-312.pyc differ
|
|
|
colab-scripts/dataset.py
ADDED
|
@@ -0,0 +1,274 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
from torch.utils.data import Dataset, DataLoader
|
| 6 |
+
from tokenizers import Tokenizer
|
| 7 |
+
from tqdm import tqdm
|
| 8 |
+
import os
|
| 9 |
+
import re
|
| 10 |
+
from collections import Counter
|
| 11 |
+
import multiprocessing
|
| 12 |
+
from torch.utils.data import random_split
|
| 13 |
+
# Force the "spawn" start method for worker processes.
# NOTE(review): presumably so CUDA works safely inside DataLoader workers — confirm.
multiprocessing.set_start_method("spawn", force=True)
|
| 14 |
+
|
| 15 |
+
class ChatDataset(Dataset):
    """Dataset of fixed-length token windows built from an instruction/output JSONL file.

    Each record is rendered as "^User: ... MiniGPT: ... <END>", tokenized with
    the supplied tokenizer, and sliced into non-overlapping windows of
    ``block_size`` tokens.  ``__getitem__`` yields (input, target) pairs for
    next-token prediction.
    """

    def __init__(self, data, tokenizer, block_size=64):
        self.tokenizer = tokenizer
        self.block_size = block_size
        self.data = self.tokenize_data(data)

    def tokenize_data(self, data):
        """Read the JSONL file at *data* and return a list of token windows."""
        windows = []
        with open(data, "r", encoding="utf-8") as handle:
            for raw in handle:
                record = json.loads(raw.strip())
                # Fix duplicated instruction
                text = "^User: " + record["instruction"].strip() + " MiniGPT: " + record["output"].strip() + " <END>"
                ids = self.tokenizer.encode(text).ids
                # Samples shorter than one full window are dropped entirely.
                if len(ids) < self.block_size:
                    continue
                limit = len(ids) - self.block_size + 1
                # Non-overlapping windows; the range bound guarantees full windows.
                windows.extend(
                    ids[start:start + self.block_size]
                    for start in range(0, limit, self.block_size)
                )
        return windows

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        window = self.data[idx]
        # Inputs are the window minus its last token; targets are the window
        # shifted left by one (teacher forcing for next-token prediction).
        return torch.tensor(window[:-1]), torch.tensor(window[1:])
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class MiniBPETokenizr:
    """Minimal byte-pair-encoding style tokenizer.

    Words are split into characters terminated by an end-of-word marker
    '</w>'; training repeatedly fuses the most frequent adjacent symbol pair
    until ``merge_limit`` merges have been applied or no pairs remain.
    Encoding uses greedy longest-match against the learned vocabulary.
    """

    def __init__(self):
        self.stoi = {}
        self.itos = {}
        self.vocab_size = 0

    def tokenize(self, text):
        """Lowercase *text* and split it into per-word symbol lists."""
        text = text.lower().strip()
        words = re.findall(r"[a-zA-Z0-9]+|[^\w\s]", text)
        pieces = []
        for word in words:
            if word.isalnum():
                pieces.append(list(word) + ['</w>'])
            else:
                pieces.append([word])
        return pieces

    def get_stats(self, corpus):
        """Count adjacent symbol pairs across every sequence in *corpus*."""
        counts = Counter()
        for seq in corpus:
            counts.update(zip(seq, seq[1:]))
        return counts

    def merge_vocab(self, corpus, pair_to_merge):
        """Return *corpus* with every occurrence of *pair_to_merge* fused."""
        fused = ''.join(pair_to_merge)
        bigram = re.escape(' '.join(pair_to_merge))
        pattern = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
        return [pattern.sub(fused, ' '.join(seq)).split() for seq in corpus]

    def train(self, texts, merge_limit=1000):
        """Learn merges from *texts* and build the id mappings."""
        corpus = [sum(self.tokenize(t), []) for t in texts]
        progress = tqdm(total=merge_limit, desc="Training BPE")
        for _ in range(merge_limit):
            stats = self.get_stats(corpus)
            if not stats:
                break
            corpus = self.merge_vocab(corpus, max(stats, key=stats.get))
            progress.update(1)

        symbols = set(tok for seq in corpus for tok in seq)
        # Reserved control tokens are always part of the vocabulary.
        symbols.update(["<PAD>", "<UNK>", "<END>", "^user:", "minigpt:"])
        self.stoi = {tok: i for i, tok in enumerate(sorted(symbols))}
        self.itos = {i: tok for tok, i in self.stoi.items()}
        self.vocab_size = len(self.stoi)

    def encode(self, text):
        """Greedy longest-match encoding of *text* into token ids."""
        symbols = sum(self.tokenize(text), [])
        ids = []
        pos = 0
        while pos < len(symbols):
            end = len(symbols)
            while end > pos:
                piece = ''.join(symbols[pos:end])
                if piece in self.stoi:
                    ids.append(self.stoi[piece])
                    pos = end
                    break
                end -= 1
            else:
                # No prefix matched: emit <UNK> and advance one symbol.
                ids.append(self.stoi.get("<UNK>", 1))
                pos += 1
        return ids

    def decode(self, token_ids):
        """Map ids back to text, dropping control tokens and word markers."""
        pieces = [self.itos.get(i, "<UNK>") for i in token_ids]
        kept = (p.replace('</w>', '') for p in pieces if p not in {"<PAD>", "<END>", "<UNK>"})
        text = ' '.join(kept)
        # Remove the space the join inserts before punctuation.
        text = re.sub(r'\s([?.!,:;])', r'\1', text)
        return text.strip()

    def save(self, path):
        """Serialize the vocabulary mappings to *path* as JSON."""
        with open(path, "w", encoding="utf-8") as fh:
            json.dump({"stoi": self.stoi, "itos": self.itos}, fh)

    def load(self, path):
        """Restore the vocabulary mappings written by :meth:`save`."""
        with open(path, "r", encoding="utf-8") as fh:
            payload = json.load(fh)
        self.stoi = {tok: int(idx) for tok, idx in payload["stoi"].items()}
        self.itos = {idx: tok for tok, idx in self.stoi.items()}
        self.vocab_size = len(self.stoi)
|
| 131 |
+
|
| 132 |
+
class SimpleTokenizr:
    """Word-level tokenizer with a fixed vocabulary learned from training text."""

    def __init__(self):
        self.stoi = {}
        self.itos = {}

    def tokenize(self, text):
        """Split lowercased *text* into words (with apostrophes), numbers, and punctuation."""
        return re.findall(r"[a-zA-Z']+|\d+|[^\w\s]", text.lower())

    def train(self, texts):
        """Build the id mappings from every token seen in *texts*."""
        vocab = set()
        for sample in texts:
            vocab.update(self.tokenize(sample))
        # Reserved/control tokens are always included.
        vocab.update(["<PAD>", "<UNK>", "<END>", "^user :", "minigpt :", "MiniGPT :", ":"])
        ordered = sorted(vocab)
        self.stoi = {tok: i for i, tok in enumerate(ordered)}
        self.itos = {i: tok for tok, i in self.stoi.items()}

    def encode(self, text):
        """Encode *text* to ids, mapping unknowns to <UNK> and appending <END>."""
        unk = self.stoi["<UNK>"]
        ids = [self.stoi.get(tok, unk) for tok in self.tokenize(text)]
        ids.append(self.stoi["<END>"])
        return ids

    def decode(self, token_ids):
        """Render ids as text, gluing punctuation to the preceding word and capitalizing."""
        words = [self.itos.get(i, "<UNK>") for i in token_ids]
        words = [w for w in words if w not in {"<PAD>", "<UNK>", "<END>"}]
        parts = []
        for pos, word in enumerate(words):
            if re.match(r"[.,!?;:]", word):
                parts.append(word)          # punctuation attaches to the previous word
            elif pos > 0:
                parts.append(' ' + word)
            else:
                parts.append(word)
        return ''.join(parts).strip().capitalize()

    def save(self, path):
        """Write the stoi/itos mappings to *path* as JSON."""
        with open(path, "w", encoding="utf-8") as fh:
            json.dump({"stoi": self.stoi, "itos": self.itos}, fh)

    def load(self, path):
        """Load mappings written by :meth:`save`, rebuilding itos from stoi."""
        with open(path, "r", encoding="utf-8") as fh:
            payload = json.load(fh)
        self.stoi = {tok: int(idx) for tok, idx in payload["stoi"].items()}
        self.itos = {idx: tok for tok, idx in self.stoi.items()}

    def __len__(self):
        return len(self.stoi)

    @property
    def vocab_size(self):
        """Number of entries in the vocabulary."""
        return len(self.stoi)
|
| 183 |
+
|
| 184 |
+
def validate(model, dataloader, device):
    """Evaluate *model* on *dataloader* without gradient tracking.

    Args:
        model: module mapping a (B, T) long tensor to (B, T, V) logits.
        dataloader: iterable of (x, y) batch tuples; must support ``len()``.
        device: torch device the batches are moved to.

    Returns:
        (average cross-entropy loss per batch, token-level accuracy in percent).
        An empty dataloader yields (0.0, 0.0) instead of raising
        ZeroDivisionError (fix: previously divided by len(dataloader)/total
        unconditionally).
    """
    model.eval()
    total_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            logits = model(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y.view(-1))
            total_loss += loss.item()

            preds = torch.argmax(logits, dim=-1)
            correct += (preds == y).sum().item()
            total += y.numel()

    if total == 0:
        # Empty validation set: nothing to average over.
        return 0.0, 0.0
    avg_loss = total_loss / len(dataloader)
    accuracy = 100 * correct / total
    return avg_loss, accuracy
|
| 201 |
+
|
| 202 |
+
def train(model, dataset, tokenizer, epochs, filepathh, start_epoch=0, start_step=0):
    """Train *model* on *dataset* with a 90/10 train/validation split.

    Resumes from ./trained-mini-gpt/checkpoint-mini-gpt.pth when present
    (accepting both full checkpoint dicts and bare state_dicts), validates
    after every epoch, saves a resumable checkpoint per epoch, and writes a
    final weights-only file on completion.

    NOTE(review): *tokenizer* and *filepathh* are currently unused; kept for
    interface compatibility with existing callers.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # 🔀 Proper train/val split
    val_size = int(0.1 * len(dataset))
    train_size = len(dataset) - val_size
    train_set, val_set = random_split(dataset, [train_size, val_size])

    train_loader = DataLoader(train_set, batch_size=10, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_set, batch_size=10, shuffle=False, num_workers=2)

    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

    # Fix: create the checkpoint directory up front so torch.save cannot fail.
    checkpoint_dir = "./trained-mini-gpt"
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_path = os.path.join(checkpoint_dir, "checkpoint-mini-gpt.pth")
    if os.path.exists(checkpoint_path):
        # map_location lets a GPU-saved checkpoint load on a CPU-only host.
        checkpoint = torch.load(checkpoint_path, map_location=device)
        if "model_state_dict" in checkpoint:
            model.load_state_dict(checkpoint["model_state_dict"])
            optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
            start_epoch = checkpoint["epoch"]
            start_step = checkpoint["step"]
        else:
            # Legacy checkpoint: bare state_dict with no optimizer/progress info.
            model.load_state_dict(checkpoint)
    else:
        print("🚀 Starting from scratch.")

    total_steps = start_step

    for epoch in range(start_epoch, epochs):
        model.train()
        total_loss, correct, total = 0, 0, 0

        loop = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch+1}/{epochs}")
        for step, (x, y) in loop:
            x, y = x.to(device), y.to(device)
            logits = model(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y.view(-1))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Fix: total_steps was never incremented, so the "step" recorded
            # in checkpoints was always the stale start value.
            total_steps += 1

            total_loss += loss.item()
            preds = torch.argmax(logits, dim=-1)
            correct += (preds == y).sum().item()
            total += y.numel()
            acc = 100 * correct / total

            loop.set_postfix(loss=loss.item(), acc=acc)

        # 🔍 Validate after each epoch
        val_loss, val_acc = validate(model, val_loader, device)
        print(f"✅ Val Loss: {val_loss:.4f} | Val Accuracy: {val_acc:.2f}%")

        # 💾 Save checkpoint
        torch.save({
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "epoch": epoch,
            "step": total_steps
        }, checkpoint_path)

    torch.save(model.state_dict(), os.path.join(checkpoint_dir, "mini-gpt.pth"))
    print("🎉 Training complete.")
|
customgen.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import random
|
| 3 |
+
from tqdm import tqdm
|
| 4 |
+
from transformers import AutoTokenizer
|
| 5 |
+
|
| 6 |
+
# CONFIG
# GPT-2 BPE tokenizer, used only to measure how many tokens a sample takes.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reject any generated sample whose combined Q/A text exceeds this token budget.
MAX_TOKENS = 27
# Total number of Q&A records to generate.
NUM_SAMPLES = 50000
# Output file, one JSON record per line (NOTE(review): directory must exist).
SAVE_PATH = "./customgens/mini_qna_dataset.jsonl"
|
| 11 |
+
|
| 12 |
+
# Extended Templates with Paraphrasing
# Each entry is a (question_template, answer_template) pair; the {placeholder}
# slots are filled from a topic's value lists by fill_template().
TEMPLATES = [
    # WHY
    ("Why do {subject} {action}?", "Because {reason}."),
    ("What makes {subject} {action}?", "It's because {reason}."),
    ("Explain why {subject} {action}.", "{reason} is the reason."),

    # WHAT IS
    ("What is {thing}?", "{thing} is {definition}."),
    ("Define {thing}.", "{thing} refers to {definition}."),
    ("Can you tell me what {thing} means?", "Sure! It's {definition}."),

    # HOW
    ("How does {thing} work?", "It works by {mechanism}."),
    ("What's the mechanism behind {thing}?", "It involves {mechanism}."),
    ("Explain how {thing} functions.", "{mechanism} is how it works."),

    # WHEN / CONDITION
    ("What happens when {condition}?", "{result}."),
    ("Describe what occurs if {condition}.", "Usually, {result}."),
    ("When {condition}, what takes place?", "The result is {result}."),

    # IMPORTANCE
    ("Why is {thing} important?", "Because {importance}."),
    ("What makes {thing} important?", "{importance} is why."),
    ("Is {thing} important? Why?", "Yes, because {importance}."),
]

# Knowledge Bank
# Per-topic value lists consumed by the templates above; topics may omit keys,
# in which case fill_template() falls back to generic defaults.
DATA = {
    "animals": {
        "subjects": ["cats", "dogs", "birds", "fish"],
        "actions": ["sleep a lot", "bark", "fly", "swim"],
        "reasons": [
            "they conserve energy",
            "they are nocturnal",
            "it's in their nature",
            "they communicate that way"
        ]
    },
    "science": {
        "things": ["gravity", "photosynthesis", "a star", "an atom"],
        "definitions": [
            "a force that pulls objects together",
            "the process plants use to make food",
            "a burning ball of gas",
            "the smallest unit of matter"
        ],
        "mechanisms": [
            "converting sunlight into energy",
            "attracting objects with mass",
            "splitting light into colors",
            "colliding particles"
        ],
        "conditions": ["you heat ice", "a star dies"],
        "results": ["it melts", "it becomes a black hole"],
        "importance": [
            "it keeps us on Earth",
            "it enables life on Earth"
        ]
    },
    "food": {
        "things": ["a waffle", "chocolate", "rice", "milk"],
        "definitions": [
            "a sweet, crispy batter cake",
            "a sweet made from cocoa",
            "a grain eaten daily in Asia",
            "a white liquid from cows"
        ],
        "importance": [
            "it provides energy",
            "it’s part of daily nutrition"
        ]
    }
}

# Quota bookkeeping: each topic is capped at an equal share of NUM_SAMPLES.
TOPIC_COUNT = {k: 0 for k in DATA}
MAX_PER_TOPIC = NUM_SAMPLES // len(DATA)
|
| 90 |
+
|
| 91 |
+
def sample_topic():
    """Pick a random topic that has not yet reached its per-topic quota.

    Returns None once every topic's quota (MAX_PER_TOPIC) is exhausted.
    """
    available = [topic for topic in DATA if TOPIC_COUNT[topic] < MAX_PER_TOPIC]
    if not available:
        return None
    return random.choice(available)
|
| 94 |
+
|
| 95 |
+
def fill_template(template_pair, topic_data):
    """Instantiate a (question, answer) template pair with random topic values.

    Every known placeholder is resolved up front — using a generic fallback
    when *topic_data* lacks the matching list — and then substituted into both
    template strings.  Returns the stripped (question, answer) pair.
    """
    question_tpl, answer_tpl = template_pair

    def pick(key, fallback):
        # Draw one value for *key*, falling back when the topic omits it.
        return random.choice(topic_data.get(key, fallback))

    replacements = {
        "{subject}": pick("subjects", topic_data.get("things", ["something"])),
        "{action}": pick("actions", ["do things"]),
        "{reason}": pick("reasons", ["that’s how they survive"]),
        "{thing}": pick("things", ["a thing"]),
        "{definition}": pick("definitions", ["an object used every day"]),
        "{mechanism}": pick("mechanisms", ["processing energy"]),
        "{condition}": pick("conditions", ["a change occurs"]),
        "{result}": pick("results", ["it transforms"]),
        "{importance}": pick("importance", ["it is vital to survival"])
    }

    question, answer = question_tpl, answer_tpl
    for placeholder, value in replacements.items():
        question = question.replace(placeholder, value)
        answer = answer.replace(placeholder, value)
    return question.strip(), answer.strip()
|
| 115 |
+
|
| 116 |
+
def maybe_add_noise(q, a):
    """Occasionally perturb a Q/A pair to add variety.

    A single uniform draw decides the outcome: ~5% of pairs get an
    "I'm not sure." answer, the next ~5% get a chatty suffix/prefix, and the
    remaining ~90% pass through unchanged.
    """
    roll = random.random()
    if roll < 0.05:
        return q, "I'm not sure."
    if roll < 0.10:
        return q + " Just wondering.", "Well, " + a
    return q, a
|
| 124 |
+
|
| 125 |
+
def token_count(text):
    """Number of GPT-2 tokens in *text*, per the module-level tokenizer."""
    encoded = tokenizer.encode(text)
    return len(encoded)
|
| 127 |
+
|
| 128 |
+
def main():
    """Generate up to NUM_SAMPLES templated Q&A records and write them as JSONL.

    Topics are drawn under a per-topic quota (sample_topic); each record is a
    filled template, optionally noised, and kept only when its combined text
    fits within MAX_TOKENS.  A progress sample is printed every 5000 accepted
    records.
    """
    # Fix: SAVE_PATH points into ./customgens/, which may not exist yet —
    # previously open() would raise FileNotFoundError on a fresh checkout.
    import os
    os.makedirs(os.path.dirname(SAVE_PATH) or ".", exist_ok=True)

    with open(SAVE_PATH, "w", encoding="utf-8") as f:
        total = 0
        pbar = tqdm(total=NUM_SAMPLES)

        while total < NUM_SAMPLES:
            topic = sample_topic()
            if not topic:
                # Every topic quota exhausted before reaching NUM_SAMPLES.
                break
            template = random.choice(TEMPLATES)
            topic_data = DATA[topic]

            question, answer = fill_template(template, topic_data)
            question, answer = maybe_add_noise(question, answer)

            combined = f"Q: {question} A: {answer}"
            if token_count(combined) <= MAX_TOKENS:
                record = {
                    "question": question,
                    "answer": answer,
                    "text": combined
                }
                f.write(json.dumps(record, ensure_ascii=False) + "\n")
                total += 1
                TOPIC_COUNT[topic] += 1
                pbar.update(1)

                if total % 5000 == 0:
                    print(f"\n[Sample {total}]")
                    print("Q:", question)
                    print("A:", answer)
                    print("Tokens:", token_count(combined))

        pbar.close()
        print(f"\n✅ Saved {total} samples to {SAVE_PATH}")

if __name__ == "__main__":
    main()
|
data/filtered_data.jsonl
CHANGED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:32618c1059ded33dbd43d56601000555b92ddd39e256aa1347c283720060ab59
|
| 3 |
+
size 12276354
|
data/overfit_data.jsonl
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"instruction": "Describe the structure of an atom.", "input": "", "output": "An atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutrons have a positive charge, while the electrons have a negative charge, resulting in an overall neutral atom. The number of each particle determines the atomic number and the type of atom.", "text": "Describe the structure of an atom.\nAn atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutrons have a positive charge, while the electrons have a negative charge, resulting in an overall neutral atom. The number of each particle determines the atomic number and the type of atom."}
|
| 2 |
+
{"instruction": "Describe the structure of an atom.", "input": "", "output": "An atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutrons have a positive charge, while the electrons have a negative charge, resulting in an overall neutral atom. The number of each particle determines the atomic number and the type of atom.", "text": "Describe the structure of an atom.\nAn atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutrons have a positive charge, while the electrons have a negative charge, resulting in an overall neutral atom. The number of each particle determines the atomic number and the type of atom."}
|
| 3 |
+
{"instruction": "Describe the structure of an atom.", "input": "", "output": "An atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutrons have a positive charge, while the electrons have a negative charge, resulting in an overall neutral atom. The number of each particle determines the atomic number and the type of atom.", "text": "Describe the structure of an atom.\nAn atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutrons have a positive charge, while the electrons have a negative charge, resulting in an overall neutral atom. The number of each particle determines the atomic number and the type of atom."}
|
| 4 |
+
{"instruction": "Describe the structure of an atom.", "input": "", "output": "An atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutrons have a positive charge, while the electrons have a negative charge, resulting in an overall neutral atom. The number of each particle determines the atomic number and the type of atom.", "text": "Describe the structure of an atom.\nAn atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutrons have a positive charge, while the electrons have a negative charge, resulting in an overall neutral atom. The number of each particle determines the atomic number and the type of atom."}
|
| 5 |
+
{"instruction": "Describe the structure of an atom.", "input": "", "output": "An atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutrons have a positive charge, while the electrons have a negative charge, resulting in an overall neutral atom. The number of each particle determines the atomic number and the type of atom.", "text": "Describe the structure of an atom.\nAn atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutrons have a positive charge, while the electrons have a negative charge, resulting in an overall neutral atom. The number of each particle determines the atomic number and the type of atom."}
|
| 6 |
+
{"instruction": "Describe the structure of an atom.", "input": "", "output": "An atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutrons have a positive charge, while the electrons have a negative charge, resulting in an overall neutral atom. The number of each particle determines the atomic number and the type of atom.", "text": "Describe the structure of an atom.\nAn atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutrons have a positive charge, while the electrons have a negative charge, resulting in an overall neutral atom. The number of each particle determines the atomic number and the type of atom."}
|
| 7 |
+
{"instruction": "Describe the structure of an atom.", "input": "", "output": "An atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutrons have a positive charge, while the electrons have a negative charge, resulting in an overall neutral atom. The number of each particle determines the atomic number and the type of atom.", "text": "Describe the structure of an atom.\nAn atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutrons have a positive charge, while the electrons have a negative charge, resulting in an overall neutral atom. The number of each particle determines the atomic number and the type of atom."}
|
| 8 |
+
{"instruction": "Describe the structure of an atom.", "input": "", "output": "An atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutrons have a positive charge, while the electrons have a negative charge, resulting in an overall neutral atom. The number of each particle determines the atomic number and the type of atom.", "text": "Describe the structure of an atom.\nAn atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutrons have a positive charge, while the electrons have a negative charge, resulting in an overall neutral atom. The number of each particle determines the atomic number and the type of atom."}
|
| 9 |
+
{"instruction": "Describe the structure of an atom.", "input": "", "output": "An atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutrons have a positive charge, while the electrons have a negative charge, resulting in an overall neutral atom. The number of each particle determines the atomic number and the type of atom.", "text": "Describe the structure of an atom.\nAn atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutrons have a positive charge, while the electrons have a negative charge, resulting in an overall neutral atom. The number of each particle determines the atomic number and the type of atom."}
|
| 10 |
+
{"instruction": "Describe the structure of an atom.", "input": "", "output": "An atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutrons have a positive charge, while the electrons have a negative charge, resulting in an overall neutral atom. The number of each particle determines the atomic number and the type of atom.", "text": "Describe the structure of an atom.\nAn atom is made up of a nucleus, which contains protons and neutrons, surrounded by electrons that travel in orbits around the nucleus. The protons and neutrons have a positive charge, while the electrons have a negative charge, resulting in an overall neutral atom. The number of each particle determines the atomic number and the type of atom."}
|
data/reason_data.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/reasoned2_data.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/reasoned_data.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/unfiltered_data.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0cb453e936376c150b34bc9424acb98744c669e9a8f9373aefab26c03a50b691
|
| 3 |
+
size 40249782
|
dataset.py
CHANGED
|
@@ -1,46 +1,67 @@
|
|
| 1 |
-
from concurrent.futures import thread
|
| 2 |
import json
|
| 3 |
-
import threading
|
| 4 |
import torch
|
|
|
|
| 5 |
import torch.nn.functional as F
|
| 6 |
from torch.utils.data import Dataset, DataLoader
|
| 7 |
-
from
|
| 8 |
from tqdm import tqdm
|
| 9 |
-
import re
|
| 10 |
-
import time
|
| 11 |
import os
|
|
|
|
| 12 |
from collections import Counter
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
class ChatDataset(Dataset):
|
| 15 |
-
def __init__(self,
|
| 16 |
-
self.
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
def __len__(self):
|
| 29 |
-
return len(self.
|
| 30 |
|
| 31 |
def __getitem__(self, idx):
|
| 32 |
-
|
| 33 |
-
|
|
|
|
|
|
|
| 34 |
|
|
|
|
| 35 |
class MiniBPETokenizr:
|
| 36 |
def __init__(self):
|
| 37 |
-
self.stoi = {}
|
| 38 |
-
self.itos = {}
|
| 39 |
self.vocab_size = 0
|
| 40 |
|
| 41 |
-
def __len__(self):
|
| 42 |
-
return len(self.stoi)
|
| 43 |
-
|
| 44 |
def tokenize(self, text):
|
| 45 |
text = text.lower().strip()
|
| 46 |
words = re.findall(r"[a-zA-Z0-9]+|[^\w\s]", text)
|
|
@@ -49,21 +70,20 @@ class MiniBPETokenizr:
|
|
| 49 |
def get_stats(self, corpus):
|
| 50 |
pairs = Counter()
|
| 51 |
for tokens in corpus:
|
| 52 |
-
for i in range(len(tokens)-1):
|
| 53 |
-
pairs[(tokens[i], tokens[i+1])] += 1
|
| 54 |
return pairs
|
| 55 |
|
| 56 |
def merge_vocab(self, corpus, pair_to_merge):
|
| 57 |
-
merged = []
|
| 58 |
bigram = re.escape(' '.join(pair_to_merge))
|
| 59 |
pattern = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
|
| 60 |
-
|
| 61 |
for tokens in corpus:
|
| 62 |
token_str = ' '.join(tokens)
|
| 63 |
token_str = pattern.sub(''.join(pair_to_merge), token_str)
|
| 64 |
merged.append(token_str.split())
|
| 65 |
return merged
|
| 66 |
-
|
| 67 |
def train(self, texts, merge_limit=1000):
|
| 68 |
corpus = [sum(self.tokenize(t), []) for t in texts]
|
| 69 |
merges_done = 0
|
|
@@ -72,22 +92,16 @@ class MiniBPETokenizr:
|
|
| 72 |
while merges_done < merge_limit:
|
| 73 |
pairs = self.get_stats(corpus)
|
| 74 |
if not pairs:
|
| 75 |
-
tqdm.write("⚠️ No more pairs to merge.")
|
| 76 |
break
|
| 77 |
best = max(pairs, key=pairs.get)
|
| 78 |
corpus = self.merge_vocab(corpus, best)
|
| 79 |
merges_done += 1
|
| 80 |
-
loop.
|
| 81 |
-
loop.refresh()
|
| 82 |
-
#tqdm.write(f"best: {best}")
|
| 83 |
-
#tqdm.write(f"corpus: {corpus}")
|
| 84 |
|
| 85 |
vocab = set(tok for seq in corpus for tok in seq)
|
| 86 |
-
vocab.update(
|
| 87 |
self.stoi = {tok: i for i, tok in enumerate(sorted(vocab))}
|
| 88 |
self.itos = {i: tok for tok, i in self.stoi.items()}
|
| 89 |
-
print(f"stoi: {len(self.stoi)}")
|
| 90 |
-
print(f"itos: {len(self.itos)}")
|
| 91 |
self.vocab_size = len(self.stoi)
|
| 92 |
|
| 93 |
def encode(self, text):
|
|
@@ -107,12 +121,11 @@ class MiniBPETokenizr:
|
|
| 107 |
output.append(self.stoi.get("<UNK>", 1))
|
| 108 |
i += 1
|
| 109 |
return output
|
| 110 |
-
|
| 111 |
def decode(self, token_ids):
|
| 112 |
tokens = [self.itos.get(i, "<UNK>") for i in token_ids]
|
| 113 |
-
# Join tokens and remove </w> markers, then fix spacing before punctuation
|
| 114 |
text = ' '.join(t.replace('</w>', '') for t in tokens if t not in {"<PAD>", "<END>", "<UNK>"})
|
| 115 |
-
text = re.sub(r'\s([?.!,:;])', r'\1', text)
|
| 116 |
return text.strip()
|
| 117 |
|
| 118 |
def save(self, path):
|
|
@@ -125,24 +138,21 @@ class MiniBPETokenizr:
|
|
| 125 |
self.stoi = {k: int(v) for k, v in data["stoi"].items()}
|
| 126 |
self.itos = {int(v): k for k, v in self.stoi.items()}
|
| 127 |
self.vocab_size = len(self.stoi)
|
| 128 |
-
|
| 129 |
class SimpleTokenizr:
|
| 130 |
def __init__(self):
|
| 131 |
self.stoi = {}
|
| 132 |
self.itos = {}
|
| 133 |
|
| 134 |
def tokenize(self, text):
|
| 135 |
-
|
| 136 |
-
#return re.findall(r"[a-zA-Z]+|\d+|[^\w\s]", text.lower()) -- somewhat good
|
| 137 |
-
return re.findall(r"[a-zA-Z']+|\d+|[^\w\s]",text.lower())
|
| 138 |
|
| 139 |
def train(self, texts):
|
| 140 |
vocab = set()
|
| 141 |
for text in texts:
|
| 142 |
tokens = self.tokenize(text)
|
| 143 |
vocab.update(tokens)
|
| 144 |
-
|
| 145 |
-
vocab.update(["<PAD>", "<UNK>", "<END>","^user :","minigpt :","Minigpt :","MiniGPT :",":","Minigpt"])
|
| 146 |
sorted_vocab = sorted(vocab)
|
| 147 |
self.stoi = {token: idx for idx, token in enumerate(sorted_vocab)}
|
| 148 |
self.itos = {idx: token for token, idx in self.stoi.items()}
|
|
@@ -153,13 +163,10 @@ class SimpleTokenizr:
|
|
| 153 |
|
| 154 |
def decode(self, token_ids):
|
| 155 |
tokens = [self.itos.get(i, "<UNK>") for i in token_ids]
|
| 156 |
-
|
| 157 |
-
clean_tokens = [tok for tok in tokens if tok not in {"<PAD>", "<UNK>", "<END>","^user :","minigpt :","Minigpt :","MiniGPT :",":"}]
|
| 158 |
-
|
| 159 |
-
# Join with proper formatting
|
| 160 |
text = ''
|
| 161 |
for i, tok in enumerate(clean_tokens):
|
| 162 |
-
if re.match(r"[.,!?;:]", tok):
|
| 163 |
text += tok
|
| 164 |
elif i > 0:
|
| 165 |
text += ' ' + tok
|
|
@@ -184,16 +191,44 @@ class SimpleTokenizr:
|
|
| 184 |
def vocab_size(self):
|
| 185 |
return len(self.stoi)
|
| 186 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
|
| 188 |
-
|
|
|
|
| 189 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 190 |
model.to(device)
|
| 191 |
|
| 192 |
-
|
| 193 |
-
|
|
|
|
|
|
|
| 194 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 195 |
|
| 196 |
-
|
|
|
|
|
|
|
|
|
|
| 197 |
if os.path.exists(checkpoint_path):
|
| 198 |
checkpoint = torch.load(checkpoint_path)
|
| 199 |
if "model_state_dict" in checkpoint:
|
|
@@ -202,19 +237,34 @@ def train(model, dataset, tokenizer, epochs, filepathh, start_epoch=0, start_ste
|
|
| 202 |
start_epoch = checkpoint["epoch"]
|
| 203 |
start_step = checkpoint["step"]
|
| 204 |
else:
|
| 205 |
-
print("⚠️ Legacy checkpoint detected. Loading only model weights.")
|
| 206 |
model.load_state_dict(checkpoint)
|
| 207 |
else:
|
| 208 |
print("🚀 Starting from scratch.")
|
| 209 |
|
| 210 |
total_steps = start_step
|
| 211 |
-
|
| 212 |
-
#scheduler = OneCycleLR(optimizer,max_lr=1e-4,total_steps=epochs * len(dataloader),pct_start=0.1,anneal_strategy="linear")
|
| 213 |
for epoch in range(start_epoch, epochs):
|
| 214 |
-
|
| 215 |
-
|
|
|
|
|
|
|
| 216 |
for step, (x, y) in loop:
|
| 217 |
x, y = x.to(device), y.to(device)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
logits = model(x)
|
| 219 |
loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y.view(-1))
|
| 220 |
|
|
@@ -223,32 +273,34 @@ def train(model, dataset, tokenizer, epochs, filepathh, start_epoch=0, start_ste
|
|
| 223 |
optimizer.step()
|
| 224 |
|
| 225 |
total_loss += loss.item()
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import json
|
|
|
|
| 2 |
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
import torch.nn.functional as F
|
| 5 |
from torch.utils.data import Dataset, DataLoader
|
| 6 |
+
from tokenizers import Tokenizer
|
| 7 |
from tqdm import tqdm
|
|
|
|
|
|
|
| 8 |
import os
|
| 9 |
+
import re
|
| 10 |
from collections import Counter
|
| 11 |
+
import multiprocessing
|
| 12 |
+
from torch.utils.data import random_split
|
| 13 |
+
|
| 14 |
+
multiprocessing.set_start_method("spawn", force=True)
|
| 15 |
|
| 16 |
class ChatDataset(Dataset):
|
| 17 |
+
def __init__(self, data, tokenizer, block_size=64):
|
| 18 |
+
self.tokenizer = tokenizer
|
| 19 |
+
self.block_size = block_size
|
| 20 |
+
self.data = self.tokenize_data(data)
|
| 21 |
+
|
| 22 |
+
def tokenize_data(self, data):
|
| 23 |
+
chunks = []
|
| 24 |
+
with open(data, "r", encoding="utf-8") as f:
|
| 25 |
+
for d in f:
|
| 26 |
+
line = json.loads(d.strip())
|
| 27 |
+
# Fix duplicated instruction
|
| 28 |
+
text = "^User: " + line["instruction"].strip() + " MiniGPT: " + line["output"].strip() + " <END>"
|
| 29 |
+
encoding = self.tokenizer.encode(text)
|
| 30 |
+
tokens = encoding.ids
|
| 31 |
+
|
| 32 |
+
# You confirmed your 10 examples are long enough, so no change to this filter.
|
| 33 |
+
# If you were to use shorter data later, you'd need to reconsider this.
|
| 34 |
+
if len(tokens) < self.block_size:
|
| 35 |
+
print(f"Skipping short example (length {len(tokens)} < block_size {self.block_size}): {text[:50]}...")
|
| 36 |
+
continue
|
| 37 |
+
|
| 38 |
+
# 🎯 CHANGE 3: Use overlapping chunks (stride = 1)
|
| 39 |
+
# This drastically increases the effective number of training samples
|
| 40 |
+
# derived from your limited raw data.
|
| 41 |
+
stride = 1 # Change this to 1 for max overlap, or self.block_size // 2 for moderate
|
| 42 |
+
for i in range(0, len(tokens) - self.block_size + 1, stride):
|
| 43 |
+
chunk = tokens[i:i + self.block_size]
|
| 44 |
+
if len(chunk) == self.block_size: # Ensures only full blocks are added
|
| 45 |
+
chunks.append(chunk)
|
| 46 |
+
print(f"Dataset created with {len(chunks)} total training chunks.") # Added print
|
| 47 |
+
return chunks
|
| 48 |
|
| 49 |
def __len__(self):
|
| 50 |
+
return len(self.data)
|
| 51 |
|
| 52 |
def __getitem__(self, idx):
|
| 53 |
+
chunk = self.data[idx]
|
| 54 |
+
x = torch.tensor(chunk[:-1], dtype=torch.long) # Ensure dtype is long
|
| 55 |
+
y = torch.tensor(chunk[1:], dtype=torch.long) # Ensure dtype is long
|
| 56 |
+
return x, y
|
| 57 |
|
| 58 |
+
# MiniBPETokenizr and SimpleTokenizr classes (no changes, but included for completeness)
|
| 59 |
class MiniBPETokenizr:
|
| 60 |
def __init__(self):
|
| 61 |
+
self.stoi = {}
|
| 62 |
+
self.itos = {}
|
| 63 |
self.vocab_size = 0
|
| 64 |
|
|
|
|
|
|
|
|
|
|
| 65 |
def tokenize(self, text):
|
| 66 |
text = text.lower().strip()
|
| 67 |
words = re.findall(r"[a-zA-Z0-9]+|[^\w\s]", text)
|
|
|
|
| 70 |
def get_stats(self, corpus):
|
| 71 |
pairs = Counter()
|
| 72 |
for tokens in corpus:
|
| 73 |
+
for i in range(len(tokens) - 1):
|
| 74 |
+
pairs[(tokens[i], tokens[i + 1])] += 1
|
| 75 |
return pairs
|
| 76 |
|
| 77 |
def merge_vocab(self, corpus, pair_to_merge):
|
|
|
|
| 78 |
bigram = re.escape(' '.join(pair_to_merge))
|
| 79 |
pattern = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
|
| 80 |
+
merged = []
|
| 81 |
for tokens in corpus:
|
| 82 |
token_str = ' '.join(tokens)
|
| 83 |
token_str = pattern.sub(''.join(pair_to_merge), token_str)
|
| 84 |
merged.append(token_str.split())
|
| 85 |
return merged
|
| 86 |
+
|
| 87 |
def train(self, texts, merge_limit=1000):
|
| 88 |
corpus = [sum(self.tokenize(t), []) for t in texts]
|
| 89 |
merges_done = 0
|
|
|
|
| 92 |
while merges_done < merge_limit:
|
| 93 |
pairs = self.get_stats(corpus)
|
| 94 |
if not pairs:
|
|
|
|
| 95 |
break
|
| 96 |
best = max(pairs, key=pairs.get)
|
| 97 |
corpus = self.merge_vocab(corpus, best)
|
| 98 |
merges_done += 1
|
| 99 |
+
loop.update(1)
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
vocab = set(tok for seq in corpus for tok in seq)
|
| 102 |
+
vocab.update(["<PAD>", "<UNK>", "<END>", "^user:", "minigpt:"])
|
| 103 |
self.stoi = {tok: i for i, tok in enumerate(sorted(vocab))}
|
| 104 |
self.itos = {i: tok for tok, i in self.stoi.items()}
|
|
|
|
|
|
|
| 105 |
self.vocab_size = len(self.stoi)
|
| 106 |
|
| 107 |
def encode(self, text):
|
|
|
|
| 121 |
output.append(self.stoi.get("<UNK>", 1))
|
| 122 |
i += 1
|
| 123 |
return output
|
| 124 |
+
|
| 125 |
def decode(self, token_ids):
|
| 126 |
tokens = [self.itos.get(i, "<UNK>") for i in token_ids]
|
|
|
|
| 127 |
text = ' '.join(t.replace('</w>', '') for t in tokens if t not in {"<PAD>", "<END>", "<UNK>"})
|
| 128 |
+
text = re.sub(r'\s([?.!,:;])', r'\1', text)
|
| 129 |
return text.strip()
|
| 130 |
|
| 131 |
def save(self, path):
|
|
|
|
| 138 |
self.stoi = {k: int(v) for k, v in data["stoi"].items()}
|
| 139 |
self.itos = {int(v): k for k, v in self.stoi.items()}
|
| 140 |
self.vocab_size = len(self.stoi)
|
| 141 |
+
|
| 142 |
class SimpleTokenizr:
|
| 143 |
def __init__(self):
|
| 144 |
self.stoi = {}
|
| 145 |
self.itos = {}
|
| 146 |
|
| 147 |
def tokenize(self, text):
|
| 148 |
+
return re.findall(r"[a-zA-Z']+|\d+|[^\w\s]", text.lower())
|
|
|
|
|
|
|
| 149 |
|
| 150 |
def train(self, texts):
|
| 151 |
vocab = set()
|
| 152 |
for text in texts:
|
| 153 |
tokens = self.tokenize(text)
|
| 154 |
vocab.update(tokens)
|
| 155 |
+
vocab.update(["<PAD>", "<UNK>", "<END>", "^user :", "minigpt :", "MiniGPT :", ":"])
|
|
|
|
| 156 |
sorted_vocab = sorted(vocab)
|
| 157 |
self.stoi = {token: idx for idx, token in enumerate(sorted_vocab)}
|
| 158 |
self.itos = {idx: token for token, idx in self.stoi.items()}
|
|
|
|
| 163 |
|
| 164 |
def decode(self, token_ids):
|
| 165 |
tokens = [self.itos.get(i, "<UNK>") for i in token_ids]
|
| 166 |
+
clean_tokens = [tok for tok in tokens if tok not in {"<PAD>", "<UNK>", "<END>"}]
|
|
|
|
|
|
|
|
|
|
| 167 |
text = ''
|
| 168 |
for i, tok in enumerate(clean_tokens):
|
| 169 |
+
if re.match(r"[.,!?;:]", tok):
|
| 170 |
text += tok
|
| 171 |
elif i > 0:
|
| 172 |
text += ' ' + tok
|
|
|
|
| 191 |
def vocab_size(self):
|
| 192 |
return len(self.stoi)
|
| 193 |
|
| 194 |
+
def validate(model, dataloader, device):
|
| 195 |
+
model.eval()
|
| 196 |
+
total_loss, correct, total = 0, 0, 0
|
| 197 |
+
with torch.no_grad():
|
| 198 |
+
for x, y in dataloader:
|
| 199 |
+
x, y = x.to(device), y.to(device)
|
| 200 |
+
logits = model(x)
|
| 201 |
+
loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y.view(-1))
|
| 202 |
+
total_loss += loss.item()
|
| 203 |
+
|
| 204 |
+
preds = torch.argmax(logits, dim=-1)
|
| 205 |
+
correct += (preds == y).sum().item()
|
| 206 |
+
total += y.numel()
|
| 207 |
+
|
| 208 |
+
avg_loss = total_loss / len(dataloader)
|
| 209 |
+
accuracy = 100 * correct / total
|
| 210 |
+
return avg_loss, accuracy
|
| 211 |
|
| 212 |
+
# 🎯 CHANGE 4: Add learning_rate parameter to the train function
|
| 213 |
+
def train(model, dataset, tokenizer, epochs, filepathh, start_epoch=0, start_step=0, learning_rate=5e-5):
|
| 214 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 215 |
model.to(device)
|
| 216 |
|
| 217 |
+
# 🔀 Proper train/val split
|
| 218 |
+
val_size = int(0.1 * len(dataset))
|
| 219 |
+
train_size = len(dataset) - val_size
|
| 220 |
+
train_set, val_set = random_split(dataset, [train_size, val_size])
|
| 221 |
|
| 222 |
+
# 🎯 CHANGE 5: Reduce batch_size and num_workers for debugging tiny datasets
|
| 223 |
+
# Batch size 1 or equal to len(train_set) is ideal for testing memorization
|
| 224 |
+
# num_workers=0 simplifies debugging.
|
| 225 |
+
train_loader = DataLoader(train_set, batch_size=1, shuffle=True, num_workers=0)
|
| 226 |
+
val_loader = DataLoader(val_set, batch_size=1, shuffle=False, num_workers=0)
|
| 227 |
|
| 228 |
+
# Use the passed learning_rate
|
| 229 |
+
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
|
| 230 |
+
|
| 231 |
+
checkpoint_path = "./trained-mini-gpt/checkpoint-mini-gpt.pth"
|
| 232 |
if os.path.exists(checkpoint_path):
|
| 233 |
checkpoint = torch.load(checkpoint_path)
|
| 234 |
if "model_state_dict" in checkpoint:
|
|
|
|
| 237 |
start_epoch = checkpoint["epoch"]
|
| 238 |
start_step = checkpoint["step"]
|
| 239 |
else:
|
|
|
|
| 240 |
model.load_state_dict(checkpoint)
|
| 241 |
else:
|
| 242 |
print("🚀 Starting from scratch.")
|
| 243 |
|
| 244 |
total_steps = start_step
|
| 245 |
+
|
|
|
|
| 246 |
for epoch in range(start_epoch, epochs):
|
| 247 |
+
model.train()
|
| 248 |
+
total_loss, correct, total = 0, 0, 0
|
| 249 |
+
|
| 250 |
+
loop = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch+1}/{epochs}")
|
| 251 |
for step, (x, y) in loop:
|
| 252 |
x, y = x.to(device), y.to(device)
|
| 253 |
+
|
| 254 |
+
# 🎯 CHANGE 6: Add detailed print statements to observe learning
|
| 255 |
+
# This is CRUCIAL for debugging underfitting on tiny data.
|
| 256 |
+
if step % 1 == 0: # Print every step for tiny datasets
|
| 257 |
+
input_ids_cpu = x[0].cpu().tolist()
|
| 258 |
+
target_ids_cpu = y[0].cpu().tolist()
|
| 259 |
+
|
| 260 |
+
decoded_input = tokenizer.decode(input_ids_cpu)
|
| 261 |
+
decoded_target = tokenizer.decode(target_ids_cpu)
|
| 262 |
+
|
| 263 |
+
print(f"\n--- Epoch {epoch+1}, Step {step} ---")
|
| 264 |
+
print(f"Input (decoded): '{decoded_input}'")
|
| 265 |
+
print(f"Target (decoded): '{decoded_target}'")
|
| 266 |
+
|
| 267 |
+
|
| 268 |
logits = model(x)
|
| 269 |
loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y.view(-1))
|
| 270 |
|
|
|
|
| 273 |
optimizer.step()
|
| 274 |
|
| 275 |
total_loss += loss.item()
|
| 276 |
+
preds = torch.argmax(logits, dim=-1)
|
| 277 |
+
correct += (preds == y).sum().item()
|
| 278 |
+
total += y.numel()
|
| 279 |
+
acc = 100 * correct / total
|
| 280 |
+
|
| 281 |
+
loop.set_postfix(loss=loss.item(), acc=acc)
|
| 282 |
+
|
| 283 |
+
# After optimizer.step(), print predicted output to see if it matches target
|
| 284 |
+
if step % 1 == 0:
|
| 285 |
+
predicted_logits_cpu = logits[0, :, :].cpu() # For first example in batch
|
| 286 |
+
predicted_ids = torch.argmax(predicted_logits_cpu, dim=-1).tolist()
|
| 287 |
+
decoded_predicted = tokenizer.decode(predicted_ids)
|
| 288 |
+
print(f"Predicted (decoded): '{decoded_predicted}'")
|
| 289 |
+
print(f"Current Batch Loss: {loss.item():.4f}")
|
| 290 |
+
print(f"Current Batch Accuracy: {100 * (preds == y).float().mean().item():.2f}%") # Accuracy for current batch
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
# 🔍 Validate after each epoch
|
| 294 |
+
val_loss, val_acc = validate(model, val_loader, device)
|
| 295 |
+
print(f"✅ Val Loss: {val_loss:.4f} | Val Accuracy: {val_acc:.2f}%")
|
| 296 |
+
|
| 297 |
+
# 💾 Save checkpoint
|
| 298 |
+
torch.save({
|
| 299 |
+
"model_state_dict": model.state_dict(),
|
| 300 |
+
"optimizer_state_dict": optimizer.state_dict(),
|
| 301 |
+
"epoch": epoch,
|
| 302 |
+
"step": total_steps
|
| 303 |
+
}, checkpoint_path)
|
| 304 |
+
|
| 305 |
+
torch.save(model.state_dict(), "./trained-mini-gpt/mini-gpt.pth")
|
| 306 |
+
print("🎉 Training complete.")
|
datasetgen-synthetic.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
import json
|
| 3 |
+
|
| 4 |
+
topics = {
|
| 5 |
+
"Math Reasoning": [
|
| 6 |
+
("What is {a} + {b}?", "{a} + {b} is {sum}."),
|
| 7 |
+
("If you have {a} apples and get {b} more, how many?", "{a} + {b} = {sum} apples."),
|
| 8 |
+
("Solve: {a} + {b}", "The answer is {sum}.")
|
| 9 |
+
],
|
| 10 |
+
"Causality": [
|
| 11 |
+
("If it rains, what might happen?", "If it rains, the ground may become wet."),
|
| 12 |
+
("Why do plants grow towards light?", "Because light is a stimulus and plants respond by growing towards it."),
|
| 13 |
+
("What happens if you drop a glass?", "It will likely break due to gravity.")
|
| 14 |
+
],
|
| 15 |
+
"Grammar Correction": [
|
| 16 |
+
("Correct this: 'He go to school everyday.'", "'He goes to school every day.'"),
|
| 17 |
+
("Fix this sentence: 'I has two cat.'", "'I have two cats.'"),
|
| 18 |
+
("Can you fix this sentence: 'he have two taco.'", "'He has two tacos.'"),
|
| 19 |
+
("What’s the correct form of: 'She don't like it.'", "'She doesn't like it.'")
|
| 20 |
+
],
|
| 21 |
+
"Common Sense": [
|
| 22 |
+
("Can a person eat soup with a fork?", "No, it is impractical to eat soup with a fork."),
|
| 23 |
+
("Should you touch fire?", "No, touching fire can cause burns."),
|
| 24 |
+
("If you're tired, what should you do?", "You should rest or sleep.")
|
| 25 |
+
],
|
| 26 |
+
"World Knowledge": [
|
| 27 |
+
("What is the capital of France?", "Paris is the capital of France."),
|
| 28 |
+
("Who was the first president of the USA?", "George Washington."),
|
| 29 |
+
("What currency is used in Japan?", "The Japanese Yen.")
|
| 30 |
+
],
|
| 31 |
+
"Instruction Following": [
|
| 32 |
+
("Open the window and turn off the light.", "Opening the window. Turning off the light."),
|
| 33 |
+
("Sort these numbers in ascending order: 5, 2, 8.", "2, 5, 8."),
|
| 34 |
+
("Sort these numbers in descending order: 5, 2, 8.", "8, 5, 2."),
|
| 35 |
+
("Describe how to make a sandwich.", "Take two slices of bread, add your fillings, and place one slice on top.")
|
| 36 |
+
]
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
def generate_sample(id, topic):
|
| 40 |
+
pattern = random.choice(topics[topic])
|
| 41 |
+
if topic == "Math Reasoning":
|
| 42 |
+
a = random.randint(1, 20)
|
| 43 |
+
b = random.randint(1, 20)
|
| 44 |
+
sum_ab = a + b
|
| 45 |
+
input_str = pattern[0].format(a=a, b=b, sum=sum_ab)
|
| 46 |
+
output_str = pattern[1].format(a=a, b=b, sum=sum_ab)
|
| 47 |
+
else:
|
| 48 |
+
input_str = pattern[0]
|
| 49 |
+
output_str = pattern[1]
|
| 50 |
+
return {
|
| 51 |
+
"id": id,
|
| 52 |
+
"topic": topic,
|
| 53 |
+
"input": input_str,
|
| 54 |
+
"output": output_str
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
def generate_dataset(n=10000):
|
| 58 |
+
dataset = []
|
| 59 |
+
topic_list = list(topics.keys())
|
| 60 |
+
for i in range(n):
|
| 61 |
+
topic = random.choice(topic_list)
|
| 62 |
+
sample = generate_sample(i, topic)
|
| 63 |
+
dataset.append(sample)
|
| 64 |
+
return dataset
|
| 65 |
+
|
| 66 |
+
def save_as_jsonl(data, path="./data/reasoned_data.jsonl"):
|
| 67 |
+
with open(path, "w", encoding="utf-8") as f:
|
| 68 |
+
for item in data:
|
| 69 |
+
json.dump(item, f, ensure_ascii=False)
|
| 70 |
+
f.write("\n")
|
| 71 |
+
|
| 72 |
+
if __name__ == "__main__":
|
| 73 |
+
data = generate_dataset(10000)
|
| 74 |
+
save_as_jsonl(data)
|
| 75 |
+
print("Saved to ./data/reasoned_data.jsonl")
|
datasetgen.py
CHANGED
|
@@ -2,19 +2,44 @@ from datasets import load_dataset
|
|
| 2 |
import json
|
| 3 |
import re
|
| 4 |
from tqdm import tqdm
|
| 5 |
-
from filter import filterdata
|
| 6 |
|
| 7 |
-
|
|
|
|
|
|
|
| 8 |
|
| 9 |
convo = []
|
| 10 |
-
buffer = {}
|
| 11 |
|
| 12 |
-
print("
|
| 13 |
-
for entry in tqdm(ds):
|
| 14 |
-
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import json
|
| 3 |
import re
|
| 4 |
from tqdm import tqdm
|
| 5 |
+
from filter import filterdata # Custom filtering logic
|
| 6 |
|
| 7 |
+
# Load 110k samples from OpenWebText
|
| 8 |
+
print("📦 Loading dataset (110k samples)...")
|
| 9 |
+
ds = load_dataset("OpenAssistant/oasst1",split="train")
|
| 10 |
|
| 11 |
convo = []
|
|
|
|
| 12 |
|
| 13 |
+
print("⚙️ Processing dataset into Q&A pairs...")
|
| 14 |
+
for entry in tqdm(ds, unit='samples'):
|
| 15 |
+
if entry.get("role") == "assistant" and entry.get("text") and entry.get("parent_id"):
|
| 16 |
+
parent = next((x for x in ds if x["message_id"] == entry["parent_id"]), None)
|
| 17 |
+
if parent and parent.get("role") == "user":
|
| 18 |
+
convo.append({
|
| 19 |
+
"input": parent["text"],
|
| 20 |
+
"output": entry["text"]
|
| 21 |
+
})
|
| 22 |
|
| 23 |
+
#convo.append({
|
| 24 |
+
# "instruction": instruction,
|
| 25 |
+
# "input": user_input,
|
| 26 |
+
# "output": bot_response,
|
| 27 |
+
# "text": full_instruction + "\n" + bot_response
|
| 28 |
+
#})
|
| 29 |
+
|
| 30 |
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
print(f"✅ Got {len(convo)} usable Q&A pairs.")
|
| 34 |
+
|
| 35 |
+
# Save unfiltered data
|
| 36 |
+
unfiltered_path = "./data/unfiltered_data.jsonl"
|
| 37 |
+
with open(unfiltered_path, "w", encoding="utf-8") as f:
|
| 38 |
+
for line in convo:
|
| 39 |
+
f.write(json.dumps(line, ensure_ascii=False) + "\n")
|
| 40 |
+
|
| 41 |
+
print(f"📝 Saved unfiltered data to {unfiltered_path}")
|
| 42 |
+
|
| 43 |
+
# Run filtering
|
| 44 |
+
print("🚿 Starting filtering...")
|
| 45 |
+
filterdata(convo)
|
datasetgen2.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# datasetgen.py
|
| 2 |
+
import json
|
| 3 |
+
import random
|
| 4 |
+
from faker import Faker
|
| 5 |
+
from tqdm import tqdm
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
fake = Faker()
|
| 9 |
+
|
| 10 |
+
OUTPUT_PATH = "data/filtered_data.jsonl"
|
| 11 |
+
#os.makedirs("datasets", exist_ok=True)
|
| 12 |
+
|
| 13 |
+
def generate_example():
|
| 14 |
+
"""Generates a single GPT-like QA pair"""
|
| 15 |
+
q_templates = [
|
| 16 |
+
"What is {}?",
|
| 17 |
+
"How do you {}?",
|
| 18 |
+
"Why is {} important?",
|
| 19 |
+
"Give me an example of {}.",
|
| 20 |
+
"Explain {} in simple terms.",
|
| 21 |
+
"Compare {} and {}.",
|
| 22 |
+
"What happens if {}?",
|
| 23 |
+
"Can you summarize {}?"
|
| 24 |
+
]
|
| 25 |
+
|
| 26 |
+
concepts = [
|
| 27 |
+
"machine learning", "quantum physics", "natural selection",
|
| 28 |
+
"photosynthesis", "neural networks", "global warming",
|
| 29 |
+
"black holes", "economic inflation", "probability", "blockchain"
|
| 30 |
+
]
|
| 31 |
+
|
| 32 |
+
actions = [
|
| 33 |
+
"train a neural network", "reduce carbon emissions", "make bread",
|
| 34 |
+
"calculate probability", "grow tomatoes", "optimize code",
|
| 35 |
+
"write a resume", "design a logo", "encrypt data", "learn Python"
|
| 36 |
+
]
|
| 37 |
+
|
| 38 |
+
concept = random.choice(concepts)
|
| 39 |
+
action = random.choice(actions)
|
| 40 |
+
|
| 41 |
+
template = random.choice(q_templates)
|
| 42 |
+
|
| 43 |
+
if '{}' in template and template.count('{}') == 1:
|
| 44 |
+
question = template.format(random.choice([concept, action]))
|
| 45 |
+
else:
|
| 46 |
+
question = template.format(concept, random.choice(concepts))
|
| 47 |
+
|
| 48 |
+
# Simulate an answer (in real GPT training you'd use real completions)
|
| 49 |
+
answer = f"{fake.paragraph(nb_sentences=4)}"
|
| 50 |
+
|
| 51 |
+
return {
|
| 52 |
+
"text": "^User: "+ question + "\nMiniGPT: " + answer + " <END>",
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
def generate_dataset(n=5000):
|
| 56 |
+
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
|
| 57 |
+
for _ in tqdm(range(n), desc="Generating Examples"):
|
| 58 |
+
example = generate_example()
|
| 59 |
+
f.write(json.dumps(example, ensure_ascii=False) + "\n")
|
| 60 |
+
|
| 61 |
+
print(f"\n✅ Dataset saved to: {OUTPUT_PATH}")
|
| 62 |
+
|
| 63 |
+
if __name__ == "__main__":
|
| 64 |
+
generate_dataset(5000)
|
datasets/5k_synthetic_dataset.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
filter.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import json
|
| 2 |
import re
|
| 3 |
from dataset import SimpleTokenizr
|
|
|
|
| 4 |
|
| 5 |
tokenizer = SimpleTokenizr()
|
| 6 |
|
|
@@ -13,7 +14,7 @@ def filterdata(data):
|
|
| 13 |
unused_lines = 0
|
| 14 |
low_quality_lines = 0
|
| 15 |
long_lines = 0
|
| 16 |
-
for line in data:
|
| 17 |
decoded = json.dumps(line)
|
| 18 |
data = json.loads(decoded)
|
| 19 |
text = data.get("text","")
|
|
@@ -23,20 +24,20 @@ def filterdata(data):
|
|
| 23 |
unused_lines += 1
|
| 24 |
unused.append(line)
|
| 25 |
else:
|
| 26 |
-
if len(encoded)
|
| 27 |
filtered_lines += 1
|
| 28 |
filtered.append(line)
|
| 29 |
-
if len(encoded)
|
| 30 |
long_lines += 1
|
| 31 |
long.append(text)
|
| 32 |
|
| 33 |
print(f"Filtered {filtered_lines} successfully!")
|
| 34 |
print(f"Removed {unused_lines} from data.")
|
| 35 |
-
print(f"Removed {long_lines} from data (too
|
| 36 |
#print(f"Removed {low_quality} from data (low quality).")
|
| 37 |
|
| 38 |
|
| 39 |
-
with open("./
|
| 40 |
for lines in filtered:
|
| 41 |
dump = json.dumps(lines)
|
| 42 |
decoded = json.loads(dump)
|
|
|
|
| 1 |
import json
|
| 2 |
import re
|
| 3 |
from dataset import SimpleTokenizr
|
| 4 |
+
from tqdm import tqdm
|
| 5 |
|
| 6 |
tokenizer = SimpleTokenizr()
|
| 7 |
|
|
|
|
| 14 |
unused_lines = 0
|
| 15 |
low_quality_lines = 0
|
| 16 |
long_lines = 0
|
| 17 |
+
for line in tqdm(data, unit='B', unit_scale=True, unit_divisor=1024):
|
| 18 |
decoded = json.dumps(line)
|
| 19 |
data = json.loads(decoded)
|
| 20 |
text = data.get("text","")
|
|
|
|
| 24 |
unused_lines += 1
|
| 25 |
unused.append(line)
|
| 26 |
else:
|
| 27 |
+
if len(encoded) >= 64:
|
| 28 |
filtered_lines += 1
|
| 29 |
filtered.append(line)
|
| 30 |
+
if len(encoded) < 64:
|
| 31 |
long_lines += 1
|
| 32 |
long.append(text)
|
| 33 |
|
| 34 |
print(f"Filtered {filtered_lines} successfully!")
|
| 35 |
print(f"Removed {unused_lines} from data.")
|
| 36 |
+
print(f"Removed {long_lines} from data (too short).")
|
| 37 |
#print(f"Removed {low_quality} from data (low quality).")
|
| 38 |
|
| 39 |
|
| 40 |
+
with open("./data/filtered_data.jsonl", "w", encoding="utf-8") as f:
|
| 41 |
for lines in filtered:
|
| 42 |
dump = json.dumps(lines)
|
| 43 |
decoded = json.loads(dump)
|
minigpt.py
CHANGED
|
@@ -4,16 +4,20 @@ from model import MiniGPT
|
|
| 4 |
from dataset import MiniBPETokenizr,SimpleTokenizr
|
| 5 |
import json
|
| 6 |
import os
|
|
|
|
| 7 |
|
| 8 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 9 |
|
| 10 |
# Load tokenizer
|
| 11 |
-
tokenizer = SimpleTokenizr()
|
| 12 |
-
tokenizer.load("./customchatbot-v1/trained-mini-gpt/tokenizer.json")
|
|
|
|
| 13 |
|
| 14 |
# Load model
|
| 15 |
-
model = MiniGPT(vocab_size=
|
| 16 |
-
model.load_state_dict(torch.load("./customchatbot-v1/trained-mini-gpt/mini-gpt.pth", map_location=device) if os.path.exists("./customchatbot-v1/trained-mini-gpt/mini-gpt.pth") else torch.load("./customchatbot-v1/trained-mini-gpt/checkpoint-mini-gpt.pth", map_location=device)["model_state_dict"] )
|
|
|
|
|
|
|
| 17 |
model.eval().to(device)
|
| 18 |
totalparams = sum(p.numel() for p in model.parameters())
|
| 19 |
print(f"Model total params: {totalparams:,}")
|
|
@@ -30,7 +34,7 @@ def sample_token(logits, temperature=1.0):
|
|
| 30 |
return torch.multinomial(probs, num_samples=1).item()
|
| 31 |
|
| 32 |
def generate_reply(prompt, max_tokens=100):
|
| 33 |
-
tokens = tokenizer.encode(prompt)
|
| 34 |
if not tokens:
|
| 35 |
print("⚠️ Empty prompt after encoding.")
|
| 36 |
return
|
|
@@ -44,8 +48,8 @@ def generate_reply(prompt, max_tokens=100):
|
|
| 44 |
next_token = sample_token(logits)
|
| 45 |
generated.append(next_token)
|
| 46 |
|
| 47 |
-
next_str = tokenizer.
|
| 48 |
-
encoded_text = tokenizer.encode(next_str)
|
| 49 |
decoded_text = tokenizer.decode(encoded_text)
|
| 50 |
print(decoded_text, end=" ", flush=True)
|
| 51 |
|
|
|
|
| 4 |
from dataset import MiniBPETokenizr,SimpleTokenizr
|
| 5 |
import json
|
| 6 |
import os
|
| 7 |
+
from tokenizers import Tokenizer
|
| 8 |
|
| 9 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 10 |
|
| 11 |
# Load tokenizer
|
| 12 |
+
#tokenizer = SimpleTokenizr()
|
| 13 |
+
#tokenizer.load("./customchatbot-v1/trained-mini-gpt/tokenizer.json")
|
| 14 |
+
tokenizer = Tokenizer.from_file("./trained-mini-gpt/tokenizer.json")
|
| 15 |
|
| 16 |
# Load model
|
| 17 |
+
model = MiniGPT(vocab_size=tokenizer.get_vocab_size())
|
| 18 |
+
#model.load_state_dict(torch.load("./customchatbot-v1/trained-mini-gpt/mini-gpt.pth", map_location=device) if os.path.exists("./customchatbot-v1/trained-mini-gpt/mini-gpt.pth") else torch.load("./customchatbot-v1/trained-mini-gpt/checkpoint-mini-gpt.pth", map_location=device)["model_state_dict"] )
|
| 19 |
+
checkpoint = torch.load("./trained-mini-gpt/mini-gpt.pth", map_location=device)
|
| 20 |
+
model.load_state_dict(checkpoint)
|
| 21 |
model.eval().to(device)
|
| 22 |
totalparams = sum(p.numel() for p in model.parameters())
|
| 23 |
print(f"Model total params: {totalparams:,}")
|
|
|
|
| 34 |
return torch.multinomial(probs, num_samples=1).item()
|
| 35 |
|
| 36 |
def generate_reply(prompt, max_tokens=100):
|
| 37 |
+
tokens = tokenizer.encode(prompt).ids
|
| 38 |
if not tokens:
|
| 39 |
print("⚠️ Empty prompt after encoding.")
|
| 40 |
return
|
|
|
|
| 48 |
next_token = sample_token(logits)
|
| 49 |
generated.append(next_token)
|
| 50 |
|
| 51 |
+
next_str = tokenizer.id_to_token(next_token)
|
| 52 |
+
encoded_text = tokenizer.encode(next_str).ids
|
| 53 |
decoded_text = tokenizer.decode(encoded_text)
|
| 54 |
print(decoded_text, end=" ", flush=True)
|
| 55 |
|
model.py
CHANGED
|
@@ -2,26 +2,39 @@ import torch
|
|
| 2 |
import torch.nn as nn
|
| 3 |
|
| 4 |
class MiniGPT(nn.Module):
|
| 5 |
-
def __init__(self, vocab_size, d_model=
|
| 6 |
super().__init__()
|
|
|
|
| 7 |
self.token_embed = nn.Embedding(vocab_size, d_model)
|
| 8 |
self.pos_embed = nn.Embedding(max_len, d_model)
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
| 10 |
self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
|
|
|
|
| 11 |
self.ln = nn.LayerNorm(d_model)
|
| 12 |
self.fc_out = nn.Linear(d_model, vocab_size)
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
def forward(self, input_ids):
|
| 15 |
B, T = input_ids.shape
|
| 16 |
pos = torch.arange(0, T, device=input_ids.device).unsqueeze(0)
|
| 17 |
x = self.token_embed(input_ids) + self.pos_embed(pos)
|
| 18 |
x = x.transpose(0, 1) # [T, B, D]
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
x = x.transpose(0, 1) # [B, T, D]
|
| 21 |
x = self.ln(x)
|
| 22 |
return self.fc_out(x)
|
| 23 |
-
|
| 24 |
def reset_params(self):
|
| 25 |
for layer in self.children():
|
| 26 |
-
if hasattr(layer,'reset_parameters'):
|
| 27 |
layer.reset_parameters()
|
|
|
|
| 2 |
import torch.nn as nn
|
| 3 |
|
| 4 |
class MiniGPT(nn.Module):
    """Minimal GPT-style language model.

    Token + positional embeddings feed a causally masked
    ``nn.TransformerEncoder`` stack, followed by a LayerNorm and a linear
    projection back to vocabulary logits.
    """

    def __init__(self, vocab_size, d_model=1024, n_heads=16, n_layers=24, max_len=512):
        """
        Args:
            vocab_size: number of tokens in the vocabulary (logit dimension).
            d_model: embedding / hidden width.
            n_heads: attention heads per layer (must divide d_model).
            n_layers: number of transformer encoder layers.
            max_len: maximum sequence length the positional embedding supports.
        """
        super().__init__()

        self.token_embed = nn.Embedding(vocab_size, d_model)
        self.pos_embed = nn.Embedding(max_len, d_model)

        # Dropout is deliberately 0.0: this configuration is used to verify
        # the model can memorize a tiny dataset (overfitting sanity check).
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=n_heads, dropout=0.0, batch_first=False)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)

        self.ln = nn.LayerNorm(d_model)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def generate_causal_mask(self, T, device):
        """Return a [T, T] bool mask where True marks future positions to block.

        This is the convention nn.TransformerEncoder expects for a boolean
        attention mask: True entries are disallowed.
        """
        return torch.triu(torch.ones(T, T, device=device), diagonal=1).bool()

    def forward(self, input_ids):
        """Map [B, T] token ids to [B, T, vocab_size] next-token logits.

        Note: T must be <= max_len, or the positional embedding lookup
        will raise an index error.
        """
        B, T = input_ids.shape
        pos = torch.arange(0, T, device=input_ids.device).unsqueeze(0)
        x = self.token_embed(input_ids) + self.pos_embed(pos)  # [B, T, D]
        x = x.transpose(0, 1)  # [T, B, D] (the encoder was built with batch_first=False)

        # Causal mask so position t only attends to positions <= t.
        mask = self.generate_causal_mask(T, input_ids.device)
        x = self.transformer(x, mask=mask)

        x = x.transpose(0, 1)  # back to [B, T, D]
        x = self.ln(x)
        return self.fc_out(x)

    def reset_params(self):
        """Re-initialize every submodule that supports re-initialization.

        BUGFIX: the original iterated ``self.children()``, which only sees
        top-level modules; ``nn.TransformerEncoder`` itself has no
        ``reset_parameters``, so the entire transformer stack was silently
        never re-initialized. ``self.modules()`` recurses into the encoder
        layers' Linear/LayerNorm submodules. (MultiheadAttention's fused
        in-projection weights are still not covered — it only exposes a
        private ``_reset_parameters``.)
        """
        for module in self.modules():
            if hasattr(module, 'reset_parameters'):
                module.reset_parameters()
|
train_custom.py
CHANGED
|
@@ -1,19 +1,50 @@
|
|
| 1 |
import torch
|
| 2 |
-
from dataset import MiniBPETokenizr, ChatDataset, train,SimpleTokenizr
|
| 3 |
from model import MiniGPT
|
| 4 |
import json
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
#
|
| 7 |
-
|
| 8 |
-
texts = [json.loads(line)["text"] for line in f if line.strip()]
|
| 9 |
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
model.reset_params()
|
| 16 |
-
#model.load_state_dict(torch.load(ch_path))
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import torch
from dataset import MiniBPETokenizr, ChatDataset, train, SimpleTokenizr  # SimpleTokenizr might be unused now
from model import MiniGPT
import json
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, normalizers
from tokenizers.trainers import BpeTrainer
from tokenizers.normalizers import Lowercase, NFD, StripAccents
from tokenizers.pre_tokenizers import Whitespace

# Anomaly detection helps pinpoint NaN/Inf gradients while debugging,
# but it is slow — disable for real training runs.
torch.autograd.set_detect_anomaly(True)

# Load training data as one "<input> <output>" string per JSONL record.
# NOTE: for the underfitting/overfitting sanity check, ensure this file
# *only* contains the small (e.g. 10-example) dataset.
with open("./data/overfit_data.jsonl", "r", encoding="utf-8") as f:
    # BUGFIX: the original called json.loads(line) twice per line;
    # parse each record exactly once.
    texts = [
        rec["input"] + ' ' + rec["output"]
        for rec in (json.loads(line) for line in f if line.strip())
    ]
|
|
|
|
|
|
|
| 18 |
|
| 19 |
+
def main():
    """Train a BPE tokenizer on the loaded corpus, then overfit MiniGPT on it."""
    # 🧠 Fresh HuggingFace BPE tokenizer with lowercasing and accent stripping.
    tok = Tokenizer(models.BPE(unk_token="<UNK>"))
    tok.normalizer = normalizers.Sequence([Lowercase(), NFD(), StripAccents()])
    tok.pre_tokenizer = Whitespace()

    # 🛠️ Learn the BPE merges from the in-memory corpus.
    bpe_trainer = BpeTrainer(
        vocab_size=28517,
        special_tokens=["<PAD>", "<UNK>", "<END>", "^User:", "MiniGPT:"]
    )
    tok.train_from_iterator(texts, bpe_trainer)

    # 💾 Persist the tokenizer, then reload it so training uses exactly
    # what was written to disk.
    tok.save("./trained-mini-gpt/tokenizer.json")
    hf_tokenizer = Tokenizer.from_file("./trained-mini-gpt/tokenizer.json")

    # 🧾 Dataset and a freshly re-initialized model sized to the vocabulary.
    chat_dataset = ChatDataset(
        data="./data/overfit_data.jsonl",  # point this at the small test dataset
        tokenizer=hf_tokenizer
    )
    gpt = MiniGPT(vocab_size=hf_tokenizer.get_vocab_size())
    gpt.reset_params()

    # 🚂 Train with a raised learning rate (1e-4) and a high epoch count so
    # memorization of the tiny dataset is unambiguous.
    # ("filepathh" is the keyword spelling the train() API expects.)
    train(gpt, chat_dataset, hf_tokenizer, epochs=200, filepathh="./data/merged_data.jsonl", learning_rate=1e-4)


if __name__ == "__main__":
    main()
|