CreatedNull committed on
Commit 4de3b20 · verified · 1 Parent(s): a56d6d4

Upload folder using huggingface_hub

__pycache__/dataset.cpython-312.pyc ADDED
Binary file (16.3 kB)

__pycache__/filter.cpython-312.pyc ADDED
Binary file (1.97 kB)

__pycache__/mergelines.cpython-312.pyc ADDED
Binary file (1.13 kB)

__pycache__/model.cpython-312.pyc ADDED
Binary file (2.36 kB)

__pycache__/tokenizer.cpython-312.pyc ADDED
Binary file (3.71 kB)

__pycache__/train_custom.cpython-312.pyc ADDED
Binary file (668 Bytes)
 
data/backup_data.jsonl ADDED
The diff for this file is too large to render.

data/data.jsonl ADDED
The diff for this file is too large to render.

data/filtered_data.jsonl ADDED
File without changes

data/merged_data.jsonl ADDED
The diff for this file is too large to render.
 
data/tiny-gpt.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c6b0b9cc1fa9939f44d73b690551a670ccae70ed6b0a735e74e57d9a654ec4c3
+ size 2336301
data/tokenizer.json ADDED
@@ -0,0 +1 @@
+ {"stoi": {"'No": 0, "Because": 1, "Can": 2, "Hello": 3, "Hello!": 4, "How": 5, "I": 6, "I'll": 7, "I'm": 8, "My": 9, "Sure!": 10, "Tell": 11, "TinyGPT.": 12, "TinyGPT:": 13, "User:": 14, "What": 15, "What's": 16, "Why": 17, "a": 18, "and": 19, "are": 20, "asking!": 21, "atoms?": 22, "blue": 23, "break,": 24, "but": 25, "can": 26, "chicken": 27, "chickened": 28, "code,": 29, "color?": 30, "computer": 31, "cross": 32, "didn't": 33, "do": 34, "do?": 35, "don't": 36, "everything!": 37, "favorite": 38, "for": 39, "go": 40, "have": 41, "help": 42, "how": 43, "i": 44, "is": 45, "it": 46, "joke": 47, "just": 48, "like": 49, "make": 50, "me": 51, "me?": 52, "my": 53, "name": 54, "name?": 55, "need": 56, "needed": 57, "of": 58, "other": 59, "out!": 60, "preferences,": 61, "problem,": 62, "road?": 63, "said:": 64, "scientists": 65, "sleep!'": 66, "sort": 67, "text!": 68, "thanks": 69, "the": 70, "they": 71, "things.": 72, "to": 73, "told": 74, "trust": 75, "up": 76, "what": 77, "with?": 78, "you": 79, "you,": 80, "you?": 81, "your": 82}, "itos": {"0": "'No", "1": "Because", "2": "Can", "3": "Hello", "4": "Hello!", "5": "How", "6": "I", "7": "I'll", "8": "I'm", "9": "My", "10": "Sure!", "11": "Tell", "12": "TinyGPT.", "13": "TinyGPT:", "14": "User:", "15": "What", "16": "What's", "17": "Why", "18": "a", "19": "and", "20": "are", "21": "asking!", "22": "atoms?", "23": "blue", "24": "break,", "25": "but", "26": "can", "27": "chicken", "28": "chickened", "29": "code,", "30": "color?", "31": "computer", "32": "cross", "33": "didn't", "34": "do", "35": "do?", "36": "don't", "37": "everything!", "38": "favorite", "39": "for", "40": "go", "41": "have", "42": "help", "43": "how", "44": "i", "45": "is", "46": "it", "47": "joke", "48": "just", "49": "like", "50": "make", "51": "me", "52": "me?", "53": "my", "54": "name", "55": "name?", "56": "need", "57": "needed", "58": "of", "59": "other", "60": "out!", "61": "preferences,", "62": "problem,", "63": "road?", "64": "said:", "65": "scientists", "66": "sleep!'", "67": "sort", "68": "text!", "69": "thanks", "70": "the", "71": "they", "72": "things.", "73": "to", "74": "told", "75": "trust", "76": "up", "77": "what", "78": "with?", "79": "you", "80": "you,", "81": "you?", "82": "your"}}
data/unused_data.jsonl ADDED
@@ -0,0 +1,49 @@
+ {"text": "Hello is What your name Hi Hey Should"}
+ {"text": "Help You How I can My TinyGPT What's"}
+ {"text": "! ? . , do and other sort of things"}
+ {"text": "joke didn't chicken chickened cross"}
+ {"text": "the road Sure out Hello! how can i help you?"}
+ {"text": "What is your name? What's your name?"}
+ {"text": "My name is TinyGPT. What can you do?"}
+ {"text": "I can help you, and other sort of things."}
+ {"text": "Hello"}
+ {"text": "is"}
+ {"text": "What"}
+ {"text": "your"}
+ {"text": "name"}
+ {"text": "Hi"}
+ {"text": "Hey"}
+ {"text": "Should"}
+ {"text": "Help"}
+ {"text": "You"}
+ {"text": "How"}
+ {"text": "I"}
+ {"text": "can"}
+ {"text": "My"}
+ {"text": "TinyGPT"}
+ {"text": "What's"}
+ {"text": "!"}
+ {"text": "?"}
+ {"text": "."}
+ {"text": ","}
+ {"text": "do"}
+ {"text": "and"}
+ {"text": "other"}
+ {"text": "sort"}
+ {"text": "of"}
+ {"text": "things"}
+ {"text": "joke"}
+ {"text": "didn't"}
+ {"text": "chicken"}
+ {"text": "chickened"}
+ {"text": "cross"}
+ {"text": "the"}
+ {"text": "road"}
+ {"text": "Sure"}
+ {"text": "out"}
+ {"text": "Hello! how can i help you?"}
+ {"text": "What is your name?"}
+ {"text": "What's your name?"}
+ {"text": "My name is TinyGPT."}
+ {"text": "What can you do?"}
+ {"text": "I can help you, and other sort of things."}
dataset.py ADDED
@@ -0,0 +1,254 @@
+ from concurrent.futures import thread
+ import json
+ import threading
+ import torch
+ import torch.nn.functional as F
+ from torch.utils.data import Dataset, DataLoader
+ from torch.optim.lr_scheduler import OneCycleLR
+ from tqdm import tqdm
+ import re
+ import time
+ import os
+ from collections import Counter
+
+ class ChatDataset(Dataset):
+     def __init__(self, file_path, tokenizer, block_size=16):
+         self.samples = []
+         with open(file_path, "r", encoding="utf-8") as f:
+             for line in f:
+                 line = line.strip()
+                 if line:
+                     data = json.loads(line)
+                     tokens = tokenizer.encode(data["text"]) + [tokenizer.stoi["<END>"]]
+                     for i in range(0, len(tokens) - block_size):
+                         x = tokens[i:i + block_size]
+                         y = tokens[i + 1:i + block_size + 1]
+                         self.samples.append((x, y))
+
+     def __len__(self):
+         return len(self.samples)
+
+     def __getitem__(self, idx):
+         x, y = self.samples[idx]
+         return torch.tensor(x), torch.tensor(y)
+
+ class MiniBPETokenizr:
+     def __init__(self):
+         self.stoi = {} # string to index
+         self.itos = {} # index to string
+         self.vocab_size = 0
+
+     def __len__(self):
+         return len(self.stoi)
+
+     def tokenize(self, text):
+         text = text.lower().strip()
+         words = re.findall(r"[a-zA-Z0-9]+|[^\w\s]", text)
+         return [list(w) + ['</w>'] if w.isalnum() else [w] for w in words]
+
+     def get_stats(self, corpus):
+         pairs = Counter()
+         for tokens in corpus:
+             for i in range(len(tokens)-1):
+                 pairs[(tokens[i], tokens[i+1])] += 1
+         return pairs
+
+     def merge_vocab(self, corpus, pair_to_merge):
+         merged = []
+         bigram = re.escape(' '.join(pair_to_merge))
+         pattern = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
+
+         for tokens in corpus:
+             token_str = ' '.join(tokens)
+             token_str = pattern.sub(''.join(pair_to_merge), token_str)
+             merged.append(token_str.split())
+         return merged
+
+     def train(self, texts, merge_limit=1000):
+         corpus = [sum(self.tokenize(t), []) for t in texts]
+         merges_done = 0
+         loop = tqdm(total=merge_limit, desc="Training BPE")
+
+         while merges_done < merge_limit:
+             pairs = self.get_stats(corpus)
+             if not pairs:
+                 tqdm.write("⚠️ No more pairs to merge.")
+                 break
+             best = max(pairs, key=pairs.get)
+             corpus = self.merge_vocab(corpus, best)
+             merges_done += 1
+             loop.n = merges_done
+             loop.refresh()
+             #tqdm.write(f"best: {best}")
+             #tqdm.write(f"corpus: {corpus}")
+
+         vocab = set(tok for seq in corpus for tok in seq)
+         vocab.update({"<PAD>", "<UNK>", "<END>", "^user:", "minigpt:"})
+         self.stoi = {tok: i for i, tok in enumerate(sorted(vocab))}
+         self.itos = {i: tok for tok, i in self.stoi.items()}
+         print(f"stoi: {len(self.stoi)}")
+         print(f"itos: {len(self.itos)}")
+         self.vocab_size = len(self.stoi)
+
+     def encode(self, text):
+         tokens = sum(self.tokenize(text), [])
+         output = []
+         i = 0
+         while i < len(tokens):
+             j = len(tokens)
+             while j > i:
+                 candidate = ''.join(tokens[i:j])
+                 if candidate in self.stoi:
+                     output.append(self.stoi[candidate])
+                     i = j
+                     break
+                 j -= 1
+             else:
+                 output.append(self.stoi.get("<UNK>", 1))
+                 i += 1
+         return output
+
+     def decode(self, token_ids):
+         tokens = [self.itos.get(i, "<UNK>") for i in token_ids]
+         # Join tokens and remove </w> markers, then fix spacing before punctuation
+         text = ' '.join(t.replace('</w>', '') for t in tokens if t not in {"<PAD>", "<END>", "<UNK>"})
+         text = re.sub(r'\s([?.!,:;])', r'\1', text) # Remove space before punctuation
+         return text.strip()
+
+     def save(self, path):
+         with open(path, "w", encoding="utf-8") as f:
+             json.dump({"stoi": self.stoi, "itos": self.itos}, f)
+
+     def load(self, path):
+         with open(path, "r", encoding="utf-8") as f:
+             data = json.load(f)
+         self.stoi = {k: int(v) for k, v in data["stoi"].items()}
+         self.itos = {int(v): k for k, v in self.stoi.items()}
+         self.vocab_size = len(self.stoi)
+
+ class SimpleTokenizr:
+     def __init__(self):
+         self.stoi = {}
+         self.itos = {}
+
+     def tokenize(self, text):
+         # Lowercase and split into words, digits, and punctuation
+         #return re.findall(r"[a-zA-Z]+|\d+|[^\w\s]", text.lower()) -- somewhat good
+         return re.findall(r"[a-zA-Z']+|\d+|[^\w\s]", text.lower())
+
+     def train(self, texts):
+         vocab = set()
+         for text in texts:
+             tokens = self.tokenize(text)
+             vocab.update(tokens)
+         # Add special tokens
+         vocab.update(["<PAD>", "<UNK>", "<END>", "^user :", "minigpt :", "Minigpt :", "MiniGPT :", ":", "Minigpt"])
+         sorted_vocab = sorted(vocab)
+         self.stoi = {token: idx for idx, token in enumerate(sorted_vocab)}
+         self.itos = {idx: token for token, idx in self.stoi.items()}
+
+     def encode(self, text):
+         tokens = self.tokenize(text)
+         return [self.stoi.get(tok, self.stoi["<UNK>"]) for tok in tokens] + [self.stoi["<END>"]]
+
+     def decode(self, token_ids):
+         tokens = [self.itos.get(i, "<UNK>") for i in token_ids]
+         # Filter special/utility tokens
+         clean_tokens = [tok for tok in tokens if tok not in {"<PAD>", "<UNK>", "<END>", "^user :", "minigpt :", "Minigpt :", "MiniGPT :", ":"}]
+
+         # Join with proper formatting
+         text = ''
+         for i, tok in enumerate(clean_tokens):
+             if re.match(r"[.,!?;:]", tok): # no space before punctuation
+                 text += tok
+             elif i > 0:
+                 text += ' ' + tok
+             else:
+                 text += tok
+         return text.strip().capitalize()
+
+     def save(self, path):
+         with open(path, "w", encoding="utf-8") as f:
+             json.dump({"stoi": self.stoi, "itos": self.itos}, f)
+
+     def load(self, path):
+         with open(path, "r", encoding="utf-8") as f:
+             data = json.load(f)
+         self.stoi = {k: int(v) for k, v in data["stoi"].items()}
+         self.itos = {int(i): tok for tok, i in self.stoi.items()}
+
+     def __len__(self):
+         return len(self.stoi)
+
+     @property
+     def vocab_size(self):
+         return len(self.stoi)
+
+
+ def train(model, dataset, tokenizer, epochs, filepathh, start_epoch=0, start_step=0):
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     model.to(device)
+
+     dataloader = DataLoader(dataset, batch_size=8, shuffle=True)
+     optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.001)
+
+
+     checkpoint_path = "./customchatbot-v1/trained-mini-gpt/checkpoint-mini-gpt.pth"
+     if os.path.exists(checkpoint_path):
+         checkpoint = torch.load(checkpoint_path)
+         if "model_state_dict" in checkpoint:
+             model.load_state_dict(checkpoint["model_state_dict"])
+             optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
+             start_epoch = checkpoint["epoch"]
+             start_step = checkpoint["step"]
+         else:
+             print("⚠️ Legacy checkpoint detected. Loading only model weights.")
+             model.load_state_dict(checkpoint)
+     else:
+         print("🚀 Starting from scratch.")
+
+     total_steps = start_step
+     sreq = 0
+     #scheduler = OneCycleLR(optimizer,max_lr=1e-4,total_steps=epochs * len(dataloader),pct_start=0.1,anneal_strategy="linear")
+     for epoch in range(start_epoch, epochs):
+         total_loss = 0
+         loop = tqdm(enumerate(dataloader), total=len(dataloader), desc=f"Epoch {epoch+1}/{epochs} Training")
+         for step, (x, y) in loop:
+             x, y = x.to(device), y.to(device)
+             logits = model(x)
+             loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y.view(-1))
+
+             optimizer.zero_grad()
+             loss.backward()
+             optimizer.step()
+
+             total_loss += loss.item()
+             total_steps += 1
+             sreq += 1
+
+             # Save every 4 steps
+             if sreq >= 4:
+                 tqdm.write("💾 Saved checkpoint.")
+                 torch.save({
+                     "model_state_dict": model.state_dict(),
+                     "optimizer_state_dict": optimizer.state_dict(),
+                     "epoch": epoch,
+                     "step": total_steps
+                 }, "./customchatbot-v1/trained-mini-gpt/checkpoint-mini-gpt.pth")
+                 tokenizer.save("./customchatbot-v1/trained-mini-gpt/tokenizer.json")
+                 sreq = 0
+
+             loop.set_postfix(loss=loss.item())
+
+     print(f"✅ Final Loss: {total_loss / len(dataloader):.4f}") # average loss over the last epoch
+     torch.save(model.state_dict(), "./customchatbot-v1/trained-mini-gpt/mini-gpt.pth")
+     tokenizer.save("./customchatbot-v1/trained-mini-gpt/tokenizer.json")
+     print("🎉 Training complete.")
+
+
+ # 🔧 Example usage
+ # tokenizer = SimpleTokenizr()
+ # tokenizer.load("path/to/tokenizer.json")
+ # dataset = ChatDataset("your_dataset.jsonl", tokenizer)
+ # model = YourModelClass(...) # your GPT-like model
+ # train(model, dataset, tokenizer, epochs=2, filepathh="your_dataset.jsonl")
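
For reference, a minimal sketch of how the SimpleTokenizr defined above behaves on its own; the two training sentences are invented for illustration and are not part of the repository data:

from dataset import SimpleTokenizr

tokenizer = SimpleTokenizr()
tokenizer.train(["Hello! How can I help you?", "My name is TinyGPT."])

ids = tokenizer.encode("Hello! How can I help you?")  # word ids followed by the <END> id
print(ids)
print(tokenizer.decode(ids))                          # -> "Hello! how can i help you?"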
datasetgen.py ADDED
@@ -0,0 +1,20 @@
+ from datasets import load_dataset
+ import json
+ import re
+ from tqdm import tqdm
+ from filter import filterdata
+
+ ds = load_dataset("fka/awesome-chatgpt-prompts", split="train")
+
+ convo = []
+ buffer = {}
+
+ print("getting data...")
+ for entry in tqdm(ds):
+     print(entry)
+     #convo.append({"text": f"^User: {buffer['user']}\nMiniGPT:{buffer['assistant']} <END>"})
+
+ print(f"Got {len(convo)} pairs/amount of q&a")
+
+ print("Filtering data...")
+ filterdata(convo)
filter.py ADDED
@@ -0,0 +1,43 @@
+ import json
+ import re
+ from dataset import SimpleTokenizr
+
+ tokenizer = SimpleTokenizr()
+
+ def filterdata(data):
+     filtered = []
+     unused = []
+     low_quality = []
+     long = []
+     filtered_lines = 0
+     unused_lines = 0
+     low_quality_lines = 0
+     long_lines = 0
+     for line in data:
+         decoded = json.dumps(line)
+         data = json.loads(decoded)
+         text = data.get("text", "")
+
+         encoded = tokenizer.tokenize(text)
+         if re.search(r"\d", text):
+             unused_lines += 1
+             unused.append(line)
+         else:
+             if len(encoded) <= 27:
+                 filtered_lines += 1
+                 filtered.append(line)
+             if len(encoded) > 27:
+                 long_lines += 1
+                 long.append(text)
+
+     print(f"Filtered {filtered_lines} successfully!")
+     print(f"Removed {unused_lines} from data.")
+     print(f"Removed {long_lines} from data (too long).")
+     #print(f"Removed {low_quality} from data (low quality).")
+
+
+     with open("./customchatbot-v1/data/filtered_data.jsonl", "w", encoding="utf-8") as f:
+         for lines in filtered:
+             dump = json.dumps(lines)
+             decoded = json.loads(dump)
+             f.write(json.dumps(decoded, ensure_ascii=False) + "\n")
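
A minimal sketch of how filterdata might be driven from one of the JSONL files above; the choice of merged_data.jsonl as input is an assumption, and filterdata itself writes its output to the hard-coded filtered_data.jsonl path:

import json
from filter import filterdata

# Each element is a dict like {"text": "..."}, matching what filterdata expects.
with open("./customchatbot-v1/data/merged_data.jsonl", "r", encoding="utf-8") as f:
    lines = [json.loads(line) for line in f if line.strip()]

filterdata(lines)  # prints the filter counts and writes filtered_data.jsonl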
mergelines.py ADDED
@@ -0,0 +1,20 @@
+ import json
+ def merge_short_lines(file_path, min_length=32):
+     merged = []
+     buffer = ""
+
+     with open(file_path, "r", encoding="utf-8") as f:
+         for line in f:
+             line = line.strip()
+             data = json.loads(line)
+             text = data["text"]
+             buffer += " " + text.strip()
+             if len(buffer) >= min_length:
+                 merged.append({"text": buffer.strip()})
+                 buffer = ""
+
+     if buffer.strip():
+         merged.append({"text": buffer.strip()})
+
+     print(f"Merged {len(merged)} lines")
+     return merged
mergelines2.py ADDED
@@ -0,0 +1,12 @@
+ import json
+ from mergelines import merge_short_lines
+
+ merged_data = merge_short_lines("./customchatbot-v1/data/data.jsonl")
+ with open("./customchatbot-v1/data/merged_data.jsonl", "w", encoding="utf-8") as out:
+     for item in merged_data:
+         out.write(json.dumps(item) + "\n")
+
+ # with open("./customchatbot-v1/data/data.jsonl","r",encoding="utf-8") as out:
+ #     for item in out:
+ #         with open("./customchatbot-v1/data/backup_data.jsonl","w",encoding="utf-8") as out2:
+ #             out2.write(json.dumps(item) + "\n")
ml_tinygpt.py ADDED
@@ -0,0 +1,66 @@
+ import torch
+ import torch.nn.functional as F
+ from model import MiniGPT
+ from dataset import MiniBPETokenizr, SimpleTokenizr
+ import json
+ import os
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # Load tokenizer
+ tokenizer = SimpleTokenizr()
+ tokenizer.load("./customchatbot-v1/trained-mini-gpt/tokenizer.json")
+
+ # Load model
+ model = MiniGPT(vocab_size=len(tokenizer))
+ model.load_state_dict(torch.load("./customchatbot-v1/trained-mini-gpt/mini-gpt.pth", map_location=device) if os.path.exists("./customchatbot-v1/trained-mini-gpt/mini-gpt.pth") else torch.load("./customchatbot-v1/trained-mini-gpt/checkpoint-mini-gpt.pth", map_location=device)["model_state_dict"])
+ model.eval().to(device)
+ totalparams = sum(p.numel() for p in model.parameters())
+ print(f"Model total params: {totalparams:,}")
+
+ def sample_token(logits, temperature=1.0):
+     logits = logits / temperature
+     logits = torch.nan_to_num(logits, nan=-1e9)
+     probs = F.softmax(logits, dim=-1)
+
+     if torch.any(torch.isnan(probs)) or torch.any(probs < 0):
+         print("⚠️ Invalid probs detected. Using uniform fallback.")
+         probs = torch.ones_like(probs) / probs.size(-1)
+
+     return torch.multinomial(probs, num_samples=1).item()
+
+ def generate_reply(prompt, max_tokens=100):
+     tokens = tokenizer.encode(prompt)
+     if not tokens:
+         print("⚠️ Empty prompt after encoding.")
+         return
+     input_ids = torch.tensor(tokens, dtype=torch.long).unsqueeze(0).to(device)
+     generated = []
+
+     with torch.no_grad():
+         for _ in range(max_tokens):
+             logits = model(input_ids)
+             logits = logits[:, -1, :]
+             next_token = sample_token(logits)
+             generated.append(next_token)
+
+             next_str = tokenizer.itos.get(next_token, "")
+             encoded_text = tokenizer.encode(next_str)
+             decoded_text = tokenizer.decode(encoded_text)
+             print(decoded_text, end=" ", flush=True)
+
+             if next_str == "<END>":
+                 break
+
+             input_ids = torch.cat([input_ids, torch.tensor([[next_token]]).to(device)], dim=1)
+     print()
+
+ # Chat loop
+ print("🧠 MiniGPT Chat (type 'exit' to quit)")
+ while True:
+     user_input = input("User: ")
+     if user_input.lower() == "exit":
+         break
+     prompt = f"^User: {user_input}\nMiniGPT:"
+     print("MiniGPT: ", end="", flush=True)
+     generate_reply(prompt)
model.py ADDED
@@ -0,0 +1,27 @@
+ import torch
+ import torch.nn as nn
+
+ class MiniGPT(nn.Module):
+     def __init__(self, vocab_size, d_model=456, n_heads=8, n_layers=4, max_len=256):
+         super().__init__()
+         self.token_embed = nn.Embedding(vocab_size, d_model)
+         self.pos_embed = nn.Embedding(max_len, d_model)
+         encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=n_heads)
+         self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
+         self.ln = nn.LayerNorm(d_model)
+         self.fc_out = nn.Linear(d_model, vocab_size)
+
+     def forward(self, input_ids):
+         B, T = input_ids.shape
+         pos = torch.arange(0, T, device=input_ids.device).unsqueeze(0)
+         x = self.token_embed(input_ids) + self.pos_embed(pos)
+         x = x.transpose(0, 1) # [T, B, D]
+         x = self.transformer(x)
+         x = x.transpose(0, 1) # [B, T, D]
+         x = self.ln(x)
+         return self.fc_out(x)
+
+     def reset_params(self):
+         for layer in self.children():
+             if hasattr(layer, 'reset_parameters'):
+                 layer.reset_parameters()
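
A quick shape check for MiniGPT as defined above; the vocabulary size of 83 matches the small data/tokenizer.json shown earlier, and the batch size and sequence length are arbitrary:

import torch
from model import MiniGPT

model = MiniGPT(vocab_size=83)      # in the training scripts this is len(tokenizer)
x = torch.randint(0, 83, (2, 16))   # batch of 2 sequences, 16 token ids each
logits = model(x)
print(logits.shape)                 # torch.Size([2, 16, 83]): one logit vector per position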
tokenizer.py ADDED
@@ -0,0 +1,171 @@
+ import json
+ import torch
+ from torch.utils.data import Dataset
+ import re
+ from collections import Counter
+
+
+ class ChatTokenizer:
+     def __init__(self, vocab_size=1000):
+         self.num_merges = vocab_size # stored under a different name so it does not clash with the vocab_size property below
+         self.token2id = {}
+         self.id2token = {}
+         self.bpe_ranks = {}
+
+     def tokenize(self, text):
+         words = re.findall(r"\w+|\S", text.lower())
+         return [' '.join(list(word)) + ' </w>' for word in words]
+
+     def get_stats(self, tokens):
+         pairs = Counter()
+         for token in tokens:
+             symbols = token.split()
+             for i in range(len(symbols) - 1):
+                 pairs[(symbols[i], symbols[i+1])] += 1
+         return pairs
+
+     def merge_pairs(self, tokens, pair):
+         pattern = re.escape(' '.join(pair))
+         replacement = ''.join(pair)
+         return [re.sub(rf'\b{pattern}\b', replacement, token) for token in tokens]
+
+     def train(self, texts):
+         tokens = []
+         for text in texts:
+             tokens.extend(self.tokenize(text))
+         vocab = Counter(tokens)
+
+         for _ in range(self.num_merges):
+             pairs = self.get_stats(vocab)
+             if not pairs:
+                 break
+             best = pairs.most_common(1)[0][0]
+             vocab = Counter(self.merge_pairs(vocab.elements(), best))
+             self.bpe_ranks[best] = _
+
+         final_tokens = set()
+         for token in vocab:
+             final_tokens.update(token.split())
+         final_tokens.update(["<PAD>", "<UNK>", "<END>", "^user:", "minigpt:"])
+         self.token2id = {tok: i for i, tok in enumerate(sorted(final_tokens))}
+         self.id2token = {i: tok for tok, i in self.token2id.items()}
+
+     def encode(self, text):
+         tokenized = self.tokenize(text)
+         for pair, _ in sorted(self.bpe_ranks.items(), key=lambda x: x[1]):
+             tokenized = self.merge_pairs(tokenized, pair)
+         ids = []
+         for token in tokenized:
+             for part in token.split():
+                 ids.append(self.token2id.get(part, self.token2id["<UNK>"]))
+         ids.append(self.token2id["<END>"])
+         return ids
+
+     def decode(self, token_ids):
+         tokens = [self.id2token.get(tid, "<UNK>") for tid in token_ids]
+         sentence = ""
+         for tok in tokens:
+             if tok == "<END>":
+                 break
+             elif tok == "</w>":
+                 sentence += " "
+             elif tok in {"<PAD>", "<UNK>"}:
+                 continue
+             else:
+                 sentence += tok
+         return sentence.strip()
+
+     def save(self, path):
+         with open(path, "w", encoding="utf-8") as f:
+             json.dump({
+                 "token2id": self.token2id,
+                 "bpe_ranks": {f"{a} {b}": r for (a, b), r in self.bpe_ranks.items()}
+             }, f)
+
+     def load(self, path):
+         with open(path, "r", encoding="utf-8") as f:
+             data = json.load(f)
+         self.token2id = {k: int(v) for k, v in data["token2id"].items()}
+         self.id2token = {v: k for k, v in self.token2id.items()}
+         self.bpe_ranks = {tuple(k.split()): v for k, v in data["bpe_ranks"].items()}
+
+     def __len__(self):
+         return len(self.token2id)
+
+     @property
+     def stoi(self):
+         return self.token2id
+
+     @property
+     def itos(self):
+         return self.id2token
+
+     @property
+     def vocab_size(self):
+         return len(self.token2id)
+
+
+ class ChatDataset(Dataset):
+     def __init__(self, file_path, tokenizer, block_size=64):
+         self.samples = []
+         with open(file_path, "r", encoding="utf-8") as f:
+             for line in f:
+                 line = line.strip()
+                 if not line:
+                     continue
+                 data = json.loads(line)
+                 text = data["text"].strip()
+
+                 # Wrap in format: ^User: ... MiniGPT: ...
+                 if not text.lower().startswith("^user:"):
+                     text = "^User: " + text
+                 if "MiniGPT:" not in text:
+                     text += "\nMiniGPT:"
+
+                 tokens = tokenizer.encode(text)
+
+                 for i in range(0, len(tokens) - block_size):
+                     x = tokens[i:i + block_size]
+                     y = tokens[i + 1:i + block_size + 1]
+                     self.samples.append((x, y))
+
+     def __len__(self):
+         return len(self.samples)
+
+     def __getitem__(self, idx):
+         x, y = self.samples[idx]
+         return torch.tensor(x), torch.tensor(y)
+
+
+
+
+ class ChatDataset(Dataset):
+     def __init__(self, file_path, tokenizer, block_size=64):
+         self.samples = []
+         with open(file_path, "r", encoding="utf-8") as f:
+             for line in f:
+                 line = line.strip()
+                 if not line:
+                     continue
+                 data = json.loads(line)
+                 text = data["text"].strip()
+
+                 # Wrap in format: ^User: ... MiniGPT: ...
+                 if not text.lower().startswith("^user:"):
+                     text = "^User: " + text
+                 if "MiniGPT:" not in text:
+                     text += "\nMiniGPT:"
+
+                 tokens = tokenizer.encode(text) + [tokenizer.stoi["<END>"]]
+
+                 for i in range(0, len(tokens) - block_size):
+                     x = tokens[i:i + block_size]
+                     y = tokens[i + 1:i + block_size + 1]
+                     self.samples.append((x, y))
+
+     def __len__(self):
+         return len(self.samples)
+
+     def __getitem__(self, idx):
+         x, y = self.samples[idx]
+         return torch.tensor(x), torch.tensor(y)
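
A minimal sketch of the ChatTokenizer BPE flow above, using two invented sentences; vocab_size here is the number of merge rounds the train loop will attempt:

from tokenizer import ChatTokenizer

tok = ChatTokenizer(vocab_size=50)
tok.train(["hello how are you", "hello what is your name"])

ids = tok.encode("hello how are you")   # BPE pieces mapped to ids, ending with <END>
print(ids)
print(tok.decode(ids))                  # decode stops at <END> and turns standalone </w> tokens into spaces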
train_custom.py ADDED
@@ -0,0 +1,19 @@
+ import torch
+ from dataset import MiniBPETokenizr, ChatDataset, train, SimpleTokenizr
+ from model import MiniGPT
+ import json
+
+ # Load and prepare
+ with open("./customchatbot-v1/data/filtered_data.jsonl", "r", encoding="utf-8") as f:
+     texts = [json.loads(line)["text"] for line in f if line.strip()]
+
+ tokenizer = SimpleTokenizr()
+ tokenizer.train(texts)
+ ch_path = "./customchatbot-v1/trained-mini-gpt/checkpoint-mini-gpt.pth"
+ dataset = ChatDataset("./customchatbot-v1/data/filtered_data.jsonl", tokenizer)
+ model = MiniGPT(vocab_size=len(tokenizer))
+ model.reset_params()
+ #model.load_state_dict(torch.load(ch_path))
+
+ # Train
+ train(model, dataset, tokenizer, epochs=3, filepathh="./customchatbot-v1/data/merged_data.jsonl")
train_custommade.py ADDED
@@ -0,0 +1,38 @@
+ import json
+ import torch.nn as nn
+ import torch
+ from model import MiniGPT
+ from dataset import DataLoader, ChatDataset, SimpleTokenizr
+ from tqdm import tqdm
+
+ with open("./customchatbot-v1/data/merged_data.jsonl", "r", encoding="utf-8") as f:
+     texts = [json.loads(line)["text"] for line in f if line.strip()]
+
+ tokenizer = SimpleTokenizr()
+ tokenizer.train(texts)
+
+ model = MiniGPT(vocab_size=len(tokenizer)) # vocab size must match the tokenizer, not a fixed 100
+
+ criterion = nn.CrossEntropyLoss()
+ optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model.to(device)
+
+ dataset = ChatDataset("./customchatbot-v1/data/merged_data.jsonl", tokenizer)
+ dataloader = DataLoader(dataset, batch_size=100, shuffle=True)
+
+ def Train(epochs):
+     for epoch in range(epochs):
+         model.train()
+         loop = tqdm(dataloader, total=len(dataloader), desc="Training")
+         tloss = 0
+         for x, y in loop:
+             x, y = x.to(device), y.to(device)
+             optimizer.zero_grad()
+             outputs = model(x) # feed the token batch, not the batch index
+             loss = criterion(outputs.view(-1, outputs.size(-1)), y.view(-1))
+             loss.backward()
+             optimizer.step()
+             tloss += loss.item()
+
+ Train(epochs=1)
trained-tiny-gpt/checkpoint-tiny-gpt.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7dc6d3e7756554064ba1cb7785ae75395d3fb6b74362e212b0029da91c79c2f2
+ size 66253943
trained-tiny-gpt/tokenizer.json ADDED
@@ -0,0 +1 @@
+ {"stoi": {"'": 0, "+": 1, ",": 2, "-": 3, ".": 4, "/": 5, "1": 6, "10": 7, "100": 8, "101": 9, "102": 10, "103": 11, "104": 12, "105": 13, "106": 14, "107": 15, "108": 16, "109": 17, "11": 18, "110": 19, "111": 20, "112": 21, "113": 22, "114": 23, "115": 24, "116": 25, "117": 26, "118": 27, "119": 28, "12": 29, "120": 30, "121": 31, "122": 32, "123": 33, "124": 34, "125": 35, "126": 36, "127": 37, "128": 38, "129": 39, "13": 40, "130": 41, "131": 42, "132": 43, "133": 44, "134": 45, "135": 46, "136": 47, "137": 48, "138": 49, "139": 50, "14": 51, "140": 52, "141": 53, "142": 54, "143": 55, "144": 56, "145": 57, "146": 58, "147": 59, "148": 60, "149": 61, "15": 62, "150": 63, "151": 64, "152": 65, "153": 66, "154": 67, "155": 68, "156": 69, "157": 70, "158": 71, "159": 72, "16": 73, "160": 74, "161": 75, "162": 76, "163": 77, "165": 78, "166": 79, "167": 80, "168": 81, "169": 82, "17": 83, "170": 84, "171": 85, "172": 86, "173": 87, "177": 88, "178": 89, "179": 90, "18": 91, "180": 92, "181": 93, "182": 94, "183": 95, "184": 96, "185": 97, "186": 98, "187": 99, "188": 100, "19": 101, "190": 102, "193": 103, "194": 104, "2": 105, "20": 106, "21": 107, "22": 108, "23": 109, "24": 110, "25": 111, "26": 112, "27": 113, "28": 114, "29": 115, "3": 116, "30": 117, "31": 118, "32": 119, "33": 120, "34": 121, "35": 122, "36": 123, "37": 124, "38": 125, "39": 126, "4": 127, "40": 128, "41": 129, "42": 130, "43": 131, "44": 132, "45": 133, "46": 134, "47": 135, "48": 136, "49": 137, "5": 138, "50": 139, "51": 140, "52": 141, "53": 142, "54": 143, "55": 144, "56": 145, "57": 146, "58": 147, "59": 148, "6": 149, "60": 150, "61": 151, "62": 152, "63": 153, "64": 154, "65": 155, "66": 156, "67": 157, "68": 158, "69": 159, "7": 160, "70": 161, "71": 162, "72": 163, "73": 164, "74": 165, "75": 166, "76": 167, "77": 168, "78": 169, "79": 170, "8": 171, "80": 172, "81": 173, "82": 174, "83": 175, "84": 176, "85": 177, "86": 178, "87": 179, "88": 180, "89": 181, "9": 182, "90": 183, "91": 184, "92": 185, "93": 186, "94": 187, "95": 188, "96": 189, "97": 190, "98": 191, "99": 192, ":": 193, "<": 194, "<END>": 195, "<PAD>": 196, "<UNK>": 197, ">": 198, "?": 199, "^": 200, "^user:": 201, "a": 202, "about": 203, "actions": 204, "add": 205, "ai": 206, "algorithm": 207, "allows": 208, "an": 209, "and": 210, "are": 211, "array": 212, "artificial": 213, "as": 214, "based": 215, "block": 216, "blueprint": 217, "book": 218, "boolean": 219, "bras": 220, "brazil": 221, "by": 222, "calculate": 223, "can": 224, "canada": 225, "capital": 226, "change": 227, "city": 228, "class": 229, "code": 230, "conclusions": 231, "conditions": 232, "convert": 233, "correct": 234, "creating": 235, "decision": 236, "deduction": 237, "define": 238, "delhi": 239, "democracy": 240, "derive": 241, "different": 242, "do": 243, "does": 244, "doesn": 245, "don": 246, "during": 247, "each": 248, "else": 249, "end": 250, "energy": 251, "error": 252, "execution": 253, "explain": 254, "false": 255, "fix": 256, "for": 257, "force": 258, "france": 259, "from": 260, "function": 261, "general": 262, "give": 263, "go": 264, "government": 265, "gravity": 266, "handles": 267, "has": 268, "have": 269, "he": 270, "help": 271, "how": 272, "human": 273, "i": 274, "if": 275, "in": 276, "india": 277, "instructions": 278, "intelligence": 279, "into": 280, "is": 281, "it": 282, "japan": 283, "know": 284, "late": 285, "lia": 286, "like": 287, "logic": 288, "loop": 289, "machines": 290, "making": 291, "me": 292, "mean": 293, "meaning": 294, "minigpt": 295, 
"minigpt:": 296, "multiple": 297, "new": 298, "objects": 299, "of": 300, "on": 301, "one": 302, "organized": 303, "other": 304, "ottawa": 305, "paris": 306, "perform": 307, "photosynthesis": 308, "plants": 309, "playing": 310, "please": 311, "plus": 312, "population": 313, "problem": 314, "programming": 315, "pulls": 316, "purpose": 317, "python": 318, "repeating": 319, "reusable": 320, "s": 321, "school": 322, "sentence": 323, "serves": 324, "set": 325, "she": 326, "should": 327, "simulation": 328, "solve": 329, "specific": 330, "statements": 331, "stores": 332, "sum": 333, "sunlight": 334, "system": 335, "t": 336, "tell": 337, "term": 338, "that": 339, "the": 340, "there": 341, "they": 342, "this": 343, "to": 344, "tokyo": 345, "toward": 346, "true": 347, "use": 348, "used": 349, "useful": 350, "user": 351, "value": 352, "values": 353, "variable": 354, "version": 355, "want": 356, "was": 357, "we": 358, "went": 359, "were": 360, "what": 361, "when": 362, "which": 363, "whole": 364, "why": 365, "yesterday": 366, "you": 367, "\u2014": 368}, "itos": {"0": "'", "1": "+", "2": ",", "3": "-", "4": ".", "5": "/", "6": "1", "7": "10", "8": "100", "9": "101", "10": "102", "11": "103", "12": "104", "13": "105", "14": "106", "15": "107", "16": "108", "17": "109", "18": "11", "19": "110", "20": "111", "21": "112", "22": "113", "23": "114", "24": "115", "25": "116", "26": "117", "27": "118", "28": "119", "29": "12", "30": "120", "31": "121", "32": "122", "33": "123", "34": "124", "35": "125", "36": "126", "37": "127", "38": "128", "39": "129", "40": "13", "41": "130", "42": "131", "43": "132", "44": "133", "45": "134", "46": "135", "47": "136", "48": "137", "49": "138", "50": "139", "51": "14", "52": "140", "53": "141", "54": "142", "55": "143", "56": "144", "57": "145", "58": "146", "59": "147", "60": "148", "61": "149", "62": "15", "63": "150", "64": "151", "65": "152", "66": "153", "67": "154", "68": "155", "69": "156", "70": "157", "71": "158", "72": "159", "73": "16", "74": "160", "75": "161", "76": "162", "77": "163", "78": "165", "79": "166", "80": "167", "81": "168", "82": "169", "83": "17", "84": "170", "85": "171", "86": "172", "87": "173", "88": "177", "89": "178", "90": "179", "91": "18", "92": "180", "93": "181", "94": "182", "95": "183", "96": "184", "97": "185", "98": "186", "99": "187", "100": "188", "101": "19", "102": "190", "103": "193", "104": "194", "105": "2", "106": "20", "107": "21", "108": "22", "109": "23", "110": "24", "111": "25", "112": "26", "113": "27", "114": "28", "115": "29", "116": "3", "117": "30", "118": "31", "119": "32", "120": "33", "121": "34", "122": "35", "123": "36", "124": "37", "125": "38", "126": "39", "127": "4", "128": "40", "129": "41", "130": "42", "131": "43", "132": "44", "133": "45", "134": "46", "135": "47", "136": "48", "137": "49", "138": "5", "139": "50", "140": "51", "141": "52", "142": "53", "143": "54", "144": "55", "145": "56", "146": "57", "147": "58", "148": "59", "149": "6", "150": "60", "151": "61", "152": "62", "153": "63", "154": "64", "155": "65", "156": "66", "157": "67", "158": "68", "159": "69", "160": "7", "161": "70", "162": "71", "163": "72", "164": "73", "165": "74", "166": "75", "167": "76", "168": "77", "169": "78", "170": "79", "171": "8", "172": "80", "173": "81", "174": "82", "175": "83", "176": "84", "177": "85", "178": "86", "179": "87", "180": "88", "181": "89", "182": "9", "183": "90", "184": "91", "185": "92", "186": "93", "187": "94", "188": "95", "189": "96", "190": "97", "191": "98", "192": "99", "193": ":", 
"194": "<", "195": "<END>", "196": "<PAD>", "197": "<UNK>", "198": ">", "199": "?", "200": "^", "201": "^user:", "202": "a", "203": "about", "204": "actions", "205": "add", "206": "ai", "207": "algorithm", "208": "allows", "209": "an", "210": "and", "211": "are", "212": "array", "213": "artificial", "214": "as", "215": "based", "216": "block", "217": "blueprint", "218": "book", "219": "boolean", "220": "bras", "221": "brazil", "222": "by", "223": "calculate", "224": "can", "225": "canada", "226": "capital", "227": "change", "228": "city", "229": "class", "230": "code", "231": "conclusions", "232": "conditions", "233": "convert", "234": "correct", "235": "creating", "236": "decision", "237": "deduction", "238": "define", "239": "delhi", "240": "democracy", "241": "derive", "242": "different", "243": "do", "244": "does", "245": "doesn", "246": "don", "247": "during", "248": "each", "249": "else", "250": "end", "251": "energy", "252": "error", "253": "execution", "254": "explain", "255": "false", "256": "fix", "257": "for", "258": "force", "259": "france", "260": "from", "261": "function", "262": "general", "263": "give", "264": "go", "265": "government", "266": "gravity", "267": "handles", "268": "has", "269": "have", "270": "he", "271": "help", "272": "how", "273": "human", "274": "i", "275": "if", "276": "in", "277": "india", "278": "instructions", "279": "intelligence", "280": "into", "281": "is", "282": "it", "283": "japan", "284": "know", "285": "late", "286": "lia", "287": "like", "288": "logic", "289": "loop", "290": "machines", "291": "making", "292": "me", "293": "mean", "294": "meaning", "295": "minigpt", "296": "minigpt:", "297": "multiple", "298": "new", "299": "objects", "300": "of", "301": "on", "302": "one", "303": "organized", "304": "other", "305": "ottawa", "306": "paris", "307": "perform", "308": "photosynthesis", "309": "plants", "310": "playing", "311": "please", "312": "plus", "313": "population", "314": "problem", "315": "programming", "316": "pulls", "317": "purpose", "318": "python", "319": "repeating", "320": "reusable", "321": "s", "322": "school", "323": "sentence", "324": "serves", "325": "set", "326": "she", "327": "should", "328": "simulation", "329": "solve", "330": "specific", "331": "statements", "332": "stores", "333": "sum", "334": "sunlight", "335": "system", "336": "t", "337": "tell", "338": "term", "339": "that", "340": "the", "341": "there", "342": "they", "343": "this", "344": "to", "345": "tokyo", "346": "toward", "347": "true", "348": "use", "349": "used", "350": "useful", "351": "user", "352": "value", "353": "values", "354": "variable", "355": "version", "356": "want", "357": "was", "358": "we", "359": "went", "360": "were", "361": "what", "362": "when", "363": "which", "364": "whole", "365": "why", "366": "yesterday", "367": "you", "368": "\u2014"}}
trainer_data_maker.py ADDED
File without changes