CreatedNull committed on
Commit 4de3b20 · verified · 1 Parent(s): a56d6d4

Upload folder using huggingface_hub

__pycache__/dataset.cpython-312.pyc ADDED
Binary file (16.3 kB)

__pycache__/filter.cpython-312.pyc ADDED
Binary file (1.97 kB)

__pycache__/mergelines.cpython-312.pyc ADDED
Binary file (1.13 kB)

__pycache__/model.cpython-312.pyc ADDED
Binary file (2.36 kB)

__pycache__/tokenizer.cpython-312.pyc ADDED
Binary file (3.71 kB)

__pycache__/train_custom.cpython-312.pyc ADDED
Binary file (668 Bytes)
 
data/backup_data.jsonl ADDED
The diff for this file is too large to render.

data/data.jsonl ADDED
The diff for this file is too large to render.

data/filtered_data.jsonl ADDED
File without changes

data/merged_data.jsonl ADDED
The diff for this file is too large to render.
 
data/tiny-gpt.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c6b0b9cc1fa9939f44d73b690551a670ccae70ed6b0a735e74e57d9a654ec4c3
+ size 2336301
data/tokenizer.json ADDED
@@ -0,0 +1 @@
+ {"stoi": {"'No": 0, "Because": 1, "Can": 2, "Hello": 3, "Hello!": 4, "How": 5, "I": 6, "I'll": 7, "I'm": 8, "My": 9, "Sure!": 10, "Tell": 11, "TinyGPT.": 12, "TinyGPT:": 13, "User:": 14, "What": 15, "What's": 16, "Why": 17, "a": 18, "and": 19, "are": 20, "asking!": 21, "atoms?": 22, "blue": 23, "break,": 24, "but": 25, "can": 26, "chicken": 27, "chickened": 28, "code,": 29, "color?": 30, "computer": 31, "cross": 32, "didn't": 33, "do": 34, "do?": 35, "don't": 36, "everything!": 37, "favorite": 38, "for": 39, "go": 40, "have": 41, "help": 42, "how": 43, "i": 44, "is": 45, "it": 46, "joke": 47, "just": 48, "like": 49, "make": 50, "me": 51, "me?": 52, "my": 53, "name": 54, "name?": 55, "need": 56, "needed": 57, "of": 58, "other": 59, "out!": 60, "preferences,": 61, "problem,": 62, "road?": 63, "said:": 64, "scientists": 65, "sleep!'": 66, "sort": 67, "text!": 68, "thanks": 69, "the": 70, "they": 71, "things.": 72, "to": 73, "told": 74, "trust": 75, "up": 76, "what": 77, "with?": 78, "you": 79, "you,": 80, "you?": 81, "your": 82}, "itos": {"0": "'No", "1": "Because", "2": "Can", "3": "Hello", "4": "Hello!", "5": "How", "6": "I", "7": "I'll", "8": "I'm", "9": "My", "10": "Sure!", "11": "Tell", "12": "TinyGPT.", "13": "TinyGPT:", "14": "User:", "15": "What", "16": "What's", "17": "Why", "18": "a", "19": "and", "20": "are", "21": "asking!", "22": "atoms?", "23": "blue", "24": "break,", "25": "but", "26": "can", "27": "chicken", "28": "chickened", "29": "code,", "30": "color?", "31": "computer", "32": "cross", "33": "didn't", "34": "do", "35": "do?", "36": "don't", "37": "everything!", "38": "favorite", "39": "for", "40": "go", "41": "have", "42": "help", "43": "how", "44": "i", "45": "is", "46": "it", "47": "joke", "48": "just", "49": "like", "50": "make", "51": "me", "52": "me?", "53": "my", "54": "name", "55": "name?", "56": "need", "57": "needed", "58": "of", "59": "other", "60": "out!", "61": "preferences,", "62": "problem,", "63": "road?", "64": "said:", "65": "scientists", "66": "sleep!'", "67": "sort", "68": "text!", "69": "thanks", "70": "the", "71": "they", "72": "things.", "73": "to", "74": "told", "75": "trust", "76": "up", "77": "what", "78": "with?", "79": "you", "80": "you,", "81": "you?", "82": "your"}}
data/unused_data.jsonl ADDED
@@ -0,0 +1,49 @@
+ {"text": "Hello is What your name Hi Hey Should"}
+ {"text": "Help You How I can My TinyGPT What's"}
+ {"text": "! ? . , do and other sort of things"}
+ {"text": "joke didn't chicken chickened cross"}
+ {"text": "the road Sure out Hello! how can i help you?"}
+ {"text": "What is your name? What's your name?"}
+ {"text": "My name is TinyGPT. What can you do?"}
+ {"text": "I can help you, and other sort of things."}
+ {"text": "Hello"}
+ {"text": "is"}
+ {"text": "What"}
+ {"text": "your"}
+ {"text": "name"}
+ {"text": "Hi"}
+ {"text": "Hey"}
+ {"text": "Should"}
+ {"text": "Help"}
+ {"text": "You"}
+ {"text": "How"}
+ {"text": "I"}
+ {"text": "can"}
+ {"text": "My"}
+ {"text": "TinyGPT"}
+ {"text": "What's"}
+ {"text": "!"}
+ {"text": "?"}
+ {"text": "."}
+ {"text": ","}
+ {"text": "do"}
+ {"text": "and"}
+ {"text": "other"}
+ {"text": "sort"}
+ {"text": "of"}
+ {"text": "things"}
+ {"text": "joke"}
+ {"text": "didn't"}
+ {"text": "chicken"}
+ {"text": "chickened"}
+ {"text": "cross"}
+ {"text": "the"}
+ {"text": "road"}
+ {"text": "Sure"}
+ {"text": "out"}
+ {"text": "Hello! how can i help you?"}
+ {"text": "What is your name?"}
+ {"text": "What's your name?"}
+ {"text": "My name is TinyGPT."}
+ {"text": "What can you do?"}
+ {"text": "I can help you, and other sort of things."}
dataset.py ADDED
@@ -0,0 +1,254 @@
+ from concurrent.futures import thread
+ import json
+ import threading
+ import torch
+ import torch.nn.functional as F
+ from torch.utils.data import Dataset, DataLoader
+ from torch.optim.lr_scheduler import OneCycleLR
+ from tqdm import tqdm
+ import re
+ import time
+ import os
+ from collections import Counter
+
+ class ChatDataset(Dataset):
+     def __init__(self, file_path, tokenizer, block_size=16):
+         self.samples = []
+         with open(file_path, "r", encoding="utf-8") as f:
+             for line in f:
+                 line = line.strip()
+                 if line:
+                     data = json.loads(line)
+                     tokens = tokenizer.encode(data["text"]) + [tokenizer.stoi["<END>"]]
+                     for i in range(0, len(tokens) - block_size):
+                         x = tokens[i:i + block_size]
+                         y = tokens[i + 1:i + block_size + 1]
+                         self.samples.append((x, y))
+
+     def __len__(self):
+         return len(self.samples)
+
+     def __getitem__(self, idx):
+         x, y = self.samples[idx]
+         return torch.tensor(x), torch.tensor(y)
+
+ class MiniBPETokenizr:
+     def __init__(self):
+         self.stoi = {} # string to index
+         self.itos = {} # index to string
+         self.vocab_size = 0
+
+     def __len__(self):
+         return len(self.stoi)
+
+     def tokenize(self, text):
+         text = text.lower().strip()
+         words = re.findall(r"[a-zA-Z0-9]+|[^\w\s]", text)
+         return [list(w) + ['</w>'] if w.isalnum() else [w] for w in words]
+
+     def get_stats(self, corpus):
+         pairs = Counter()
+         for tokens in corpus:
+             for i in range(len(tokens)-1):
+                 pairs[(tokens[i], tokens[i+1])] += 1
+         return pairs
+
+     def merge_vocab(self, corpus, pair_to_merge):
+         merged = []
+         bigram = re.escape(' '.join(pair_to_merge))
+         pattern = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
+
+         for tokens in corpus:
+             token_str = ' '.join(tokens)
+             token_str = pattern.sub(''.join(pair_to_merge), token_str)
+             merged.append(token_str.split())
+         return merged
+
+     def train(self, texts, merge_limit=1000):
+         corpus = [sum(self.tokenize(t), []) for t in texts]
+         merges_done = 0
+         loop = tqdm(total=merge_limit, desc="Training BPE")
+
+         while merges_done < merge_limit:
+             pairs = self.get_stats(corpus)
+             if not pairs:
+                 tqdm.write("⚠️ No more pairs to merge.")
+                 break
+             best = max(pairs, key=pairs.get)
+             corpus = self.merge_vocab(corpus, best)
+             merges_done += 1
+             loop.n = merges_done
+             loop.refresh()
+             #tqdm.write(f"best: {best}")
+             #tqdm.write(f"corpus: {corpus}")
+
+         vocab = set(tok for seq in corpus for tok in seq)
+         vocab.update({"<PAD>", "<UNK>", "<END>", "^user:", "minigpt:"})
+         self.stoi = {tok: i for i, tok in enumerate(sorted(vocab))}
+         self.itos = {i: tok for tok, i in self.stoi.items()}
+         print(f"stoi: {len(self.stoi)}")
+         print(f"itos: {len(self.itos)}")
+         self.vocab_size = len(self.stoi)
+
+     def encode(self, text):
+         tokens = sum(self.tokenize(text), [])
+         output = []
+         i = 0
+         while i < len(tokens):
+             j = len(tokens)
+             while j > i:
+                 candidate = ''.join(tokens[i:j])
+                 if candidate in self.stoi:
+                     output.append(self.stoi[candidate])
+                     i = j
+                     break
+                 j -= 1
+             else:
+                 output.append(self.stoi.get("<UNK>", 1))
+                 i += 1
+         return output
+
+     def decode(self, token_ids):
+         tokens = [self.itos.get(i, "<UNK>") for i in token_ids]
+         # Join tokens and remove </w> markers, then fix spacing before punctuation
+         text = ' '.join(t.replace('</w>', '') for t in tokens if t not in {"<PAD>", "<END>", "<UNK>"})
+         text = re.sub(r'\s([?.!,:;])', r'\1', text) # Remove space before punctuation
+         return text.strip()
+
+     def save(self, path):
+         with open(path, "w", encoding="utf-8") as f:
+             json.dump({"stoi": self.stoi, "itos": self.itos}, f)
+
+     def load(self, path):
+         with open(path, "r", encoding="utf-8") as f:
+             data = json.load(f)
+         self.stoi = {k: int(v) for k, v in data["stoi"].items()}
+         self.itos = {int(v): k for k, v in self.stoi.items()}
+         self.vocab_size = len(self.stoi)
+
+ class SimpleTokenizr:
+     def __init__(self):
+         self.stoi = {}
+         self.itos = {}
+
+     def tokenize(self, text):
+         # Lowercase and split into words, digits, and punctuation
+         #return re.findall(r"[a-zA-Z]+|\d+|[^\w\s]", text.lower()) -- somewhat good
+         return re.findall(r"[a-zA-Z']+|\d+|[^\w\s]", text.lower())
+
+     def train(self, texts):
+         vocab = set()
+         for text in texts:
+             tokens = self.tokenize(text)
+             vocab.update(tokens)
+         # Add special tokens
+         vocab.update(["<PAD>", "<UNK>", "<END>", "^user :", "minigpt :", "Minigpt :", "MiniGPT :", ":", "Minigpt"])
+         sorted_vocab = sorted(vocab)
+         self.stoi = {token: idx for idx, token in enumerate(sorted_vocab)}
+         self.itos = {idx: token for token, idx in self.stoi.items()}
+
+     def encode(self, text):
+         tokens = self.tokenize(text)
+         return [self.stoi.get(tok, self.stoi["<UNK>"]) for tok in tokens] + [self.stoi["<END>"]]
+
+     def decode(self, token_ids):
+         tokens = [self.itos.get(i, "<UNK>") for i in token_ids]
+         # Filter special/utility tokens
+         clean_tokens = [tok for tok in tokens if tok not in {"<PAD>", "<UNK>", "<END>", "^user :", "minigpt :", "Minigpt :", "MiniGPT :", ":"}]
+
+         # Join with proper formatting
+         text = ''
+         for i, tok in enumerate(clean_tokens):
+             if re.match(r"[.,!?;:]", tok): # no space before punctuation
+                 text += tok
+             elif i > 0:
+                 text += ' ' + tok
+             else:
+                 text += tok
+         return text.strip().capitalize()
+
+     def save(self, path):
+         with open(path, "w", encoding="utf-8") as f:
+             json.dump({"stoi": self.stoi, "itos": self.itos}, f)
+
+     def load(self, path):
+         with open(path, "r", encoding="utf-8") as f:
+             data = json.load(f)
+         self.stoi = {k: int(v) for k, v in data["stoi"].items()}
+         self.itos = {int(i): tok for tok, i in self.stoi.items()}
+
+     def __len__(self):
+         return len(self.stoi)
+
+     @property
+     def vocab_size(self):
+         return len(self.stoi)
+
+
+ def train(model, dataset, tokenizer, epochs, filepathh, start_epoch=0, start_step=0):
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     model.to(device)
+
+     dataloader = DataLoader(dataset, batch_size=8, shuffle=True)
+     optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.001)
+
+
+     checkpoint_path = "./customchatbot-v1/trained-mini-gpt/checkpoint-mini-gpt.pth"
+     if os.path.exists(checkpoint_path):
+         checkpoint = torch.load(checkpoint_path)
+         if "model_state_dict" in checkpoint:
+             model.load_state_dict(checkpoint["model_state_dict"])
+             optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
+             start_epoch = checkpoint["epoch"]
+             start_step = checkpoint["step"]
+         else:
+             print("⚠️ Legacy checkpoint detected. Loading only model weights.")
+             model.load_state_dict(checkpoint)
+     else:
+         print("🚀 Starting from scratch.")
+
+     total_steps = start_step
+     sreq = 0
+     #scheduler = OneCycleLR(optimizer,max_lr=1e-4,total_steps=epochs * len(dataloader),pct_start=0.1,anneal_strategy="linear")
+     for epoch in range(start_epoch, epochs):
+         total_loss = 0
+         loop = tqdm(enumerate(dataloader), total=len(dataloader), desc=f"Epoch {epoch+1}/{epochs} Training")
+         for step, (x, y) in loop:
+             x, y = x.to(device), y.to(device)
+             logits = model(x)
+             loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y.view(-1))
+
+             optimizer.zero_grad()
+             loss.backward()
+             optimizer.step()
+
+             total_loss += loss.item()
+             total_steps += 1
+             sreq += 1
+
+             # Save every 4 steps
+             if sreq >= 4:
+                 tqdm.write("💾 Saved checkpoint.")
+                 torch.save({
+                     "model_state_dict": model.state_dict(),
+                     "optimizer_state_dict": optimizer.state_dict(),
+                     "epoch": epoch,
+                     "step": total_steps
+                 }, "./customchatbot-v1/trained-mini-gpt/checkpoint-mini-gpt.pth")
+                 tokenizer.save("./customchatbot-v1/trained-mini-gpt/tokenizer.json")
+                 sreq = 0
+
+             loop.set_postfix(loss=loss.item())
+
+     print(f"✅ Final Loss: {total_loss / len(dataloader):.4f}") # average loss over the last epoch
+     torch.save(model.state_dict(), "./customchatbot-v1/trained-mini-gpt/mini-gpt.pth")
+     tokenizer.save("./customchatbot-v1/trained-mini-gpt/tokenizer.json")
+     print("🎉 Training complete.")
+
+
+ # 🔧 Example usage
+ # tokenizer = SimpleTokenizr()
+ # tokenizer.load("path/to/tokenizer.json")
+ # dataset = ChatDataset("your_dataset.jsonl", tokenizer)
+ # model = YourModelClass(...) # your GPT-like model
+ # train(model, dataset, tokenizer, epochs=2, filepathh="your_dataset.jsonl")
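
For reference, a minimal sketch of how the SimpleTokenizr defined above behaves on its own; the two training sentences are invented for illustration and are not part of the repository data:

from dataset import SimpleTokenizr

tokenizer = SimpleTokenizr()
tokenizer.train(["Hello! How can I help you?", "My name is TinyGPT."])

ids = tokenizer.encode("Hello! How can I help you?")  # word ids followed by the <END> id
print(ids)
print(tokenizer.decode(ids))                          # -> "Hello! how can i help you?"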
datasetgen.py ADDED
@@ -0,0 +1,20 @@
+ from datasets import load_dataset
+ import json
+ import re
+ from tqdm import tqdm
+ from filter import filterdata
+
+ ds = load_dataset("fka/awesome-chatgpt-prompts", split="train")
+
+ convo = []
+ buffer = {}
+
+ print("getting data...")
+ for entry in tqdm(ds):
+     print(entry)
+     #convo.append({"text": f"^User: {buffer['user']}\nMiniGPT:{buffer['assistant']} <END>"})
+
+ print(f"Got {len(convo)} pairs/amount of q&a")
+
+ print("Filtering data...")
+ filterdata(convo)
filter.py ADDED
@@ -0,0 +1,43 @@
+ import json
+ import re
+ from dataset import SimpleTokenizr
+
+ tokenizer = SimpleTokenizr()
+
+ def filterdata(data):
+     filtered = []
+     unused = []
+     low_quality = []
+     long = []
+     filtered_lines = 0
+     unused_lines = 0
+     low_quality_lines = 0
+     long_lines = 0
+     for line in data:
+         decoded = json.dumps(line)
+         data = json.loads(decoded)
+         text = data.get("text", "")
+
+         encoded = tokenizer.tokenize(text)
+         if re.search(r"\d", text):
+             unused_lines += 1
+             unused.append(line)
+         else:
+             if len(encoded) <= 27:
+                 filtered_lines += 1
+                 filtered.append(line)
+             if len(encoded) > 27:
+                 long_lines += 1
+                 long.append(text)
+
+     print(f"Filtered {filtered_lines} successfully!")
+     print(f"Removed {unused_lines} from data.")
+     print(f"Removed {long_lines} from data (too long).")
+     #print(f"Removed {low_quality} from data (low quality).")
+
+
+     with open("./customchatbot-v1/data/filtered_data.jsonl", "w", encoding="utf-8") as f:
+         for lines in filtered:
+             dump = json.dumps(lines)
+             decoded = json.loads(dump)
+             f.write(json.dumps(decoded, ensure_ascii=False) + "\n")
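
A minimal sketch of how filterdata might be driven from one of the JSONL files above; the choice of merged_data.jsonl as input is an assumption, and filterdata itself writes its output to the hard-coded filtered_data.jsonl path:

import json
from filter import filterdata

# Each element is a dict like {"text": "..."}, matching what filterdata expects.
with open("./customchatbot-v1/data/merged_data.jsonl", "r", encoding="utf-8") as f:
    lines = [json.loads(line) for line in f if line.strip()]

filterdata(lines)  # prints the filter counts and writes filtered_data.jsonl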
mergelines.py ADDED
@@ -0,0 +1,20 @@
+ import json
+ def merge_short_lines(file_path, min_length=32):
+     merged = []
+     buffer = ""
+
+     with open(file_path, "r", encoding="utf-8") as f:
+         for line in f:
+             line = line.strip()
+             data = json.loads(line)
+             text = data["text"]
+             buffer += " " + text.strip()
+             if len(buffer) >= min_length:
+                 merged.append({"text": buffer.strip()})
+                 buffer = ""
+
+     if buffer.strip():
+         merged.append({"text": buffer.strip()})
+
+     print(f"Merged {len(merged)} lines")
+     return merged
mergelines2.py ADDED
@@ -0,0 +1,12 @@
+ import json
+ from mergelines import merge_short_lines
+
+ merged_data = merge_short_lines("./customchatbot-v1/data/data.jsonl")
+ with open("./customchatbot-v1/data/merged_data.jsonl", "w", encoding="utf-8") as out:
+     for item in merged_data:
+         out.write(json.dumps(item) + "\n")
+
+ # with open("./customchatbot-v1/data/data.jsonl","r",encoding="utf-8") as out:
+ #     for item in out:
+ #         with open("./customchatbot-v1/data/backup_data.jsonl","w",encoding="utf-8") as out2:
+ #             out2.write(json.dumps(item) + "\n")
ml_tinygpt.py ADDED
@@ -0,0 +1,66 @@
+ import torch
+ import torch.nn.functional as F
+ from model import MiniGPT
+ from dataset import MiniBPETokenizr, SimpleTokenizr
+ import json
+ import os
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # Load tokenizer
+ tokenizer = SimpleTokenizr()
+ tokenizer.load("./customchatbot-v1/trained-mini-gpt/tokenizer.json")
+
+ # Load model
+ model = MiniGPT(vocab_size=len(tokenizer))
+ model.load_state_dict(torch.load("./customchatbot-v1/trained-mini-gpt/mini-gpt.pth", map_location=device) if os.path.exists("./customchatbot-v1/trained-mini-gpt/mini-gpt.pth") else torch.load("./customchatbot-v1/trained-mini-gpt/checkpoint-mini-gpt.pth", map_location=device)["model_state_dict"])
+ model.eval().to(device)
+ totalparams = sum(p.numel() for p in model.parameters())
+ print(f"Model total params: {totalparams:,}")
+
+ def sample_token(logits, temperature=1.0):
+     logits = logits / temperature
+     logits = torch.nan_to_num(logits, nan=-1e9)
+     probs = F.softmax(logits, dim=-1)
+
+     if torch.any(torch.isnan(probs)) or torch.any(probs < 0):
+         print("⚠️ Invalid probs detected. Using uniform fallback.")
+         probs = torch.ones_like(probs) / probs.size(-1)
+
+     return torch.multinomial(probs, num_samples=1).item()
+
+ def generate_reply(prompt, max_tokens=100):
+     tokens = tokenizer.encode(prompt)
+     if not tokens:
+         print("⚠️ Empty prompt after encoding.")
+         return
+     input_ids = torch.tensor(tokens, dtype=torch.long).unsqueeze(0).to(device)
+     generated = []
+
+     with torch.no_grad():
+         for _ in range(max_tokens):
+             logits = model(input_ids)
+             logits = logits[:, -1, :]
+             next_token = sample_token(logits)
+             generated.append(next_token)
+
+             next_str = tokenizer.itos.get(next_token, "")
+             encoded_text = tokenizer.encode(next_str)
+             decoded_text = tokenizer.decode(encoded_text)
+             print(decoded_text, end=" ", flush=True)
+
+             if next_str == "<END>":
+                 break
+
+             input_ids = torch.cat([input_ids, torch.tensor([[next_token]]).to(device)], dim=1)
+     print()
+
+ # Chat loop
+ print("🧠 MiniGPT Chat (type 'exit' to quit)")
+ while True:
+     user_input = input("User: ")
+     if user_input.lower() == "exit":
+         break
+     prompt = f"^User: {user_input}\nMiniGPT:"
+     print("MiniGPT: ", end="", flush=True)
+     generate_reply(prompt)
model.py ADDED
@@ -0,0 +1,27 @@
+ import torch
+ import torch.nn as nn
+
+ class MiniGPT(nn.Module):
+     def __init__(self, vocab_size, d_model=456, n_heads=8, n_layers=4, max_len=256):
+         super().__init__()
+         self.token_embed = nn.Embedding(vocab_size, d_model)
+         self.pos_embed = nn.Embedding(max_len, d_model)
+         encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=n_heads)
+         self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
+         self.ln = nn.LayerNorm(d_model)
+         self.fc_out = nn.Linear(d_model, vocab_size)
+
+     def forward(self, input_ids):
+         B, T = input_ids.shape
+         pos = torch.arange(0, T, device=input_ids.device).unsqueeze(0)
+         x = self.token_embed(input_ids) + self.pos_embed(pos)
+         x = x.transpose(0, 1) # [T, B, D]
+         x = self.transformer(x)
+         x = x.transpose(0, 1) # [B, T, D]
+         x = self.ln(x)
+         return self.fc_out(x)
+
+     def reset_params(self):
+         for layer in self.children():
+             if hasattr(layer, 'reset_parameters'):
+                 layer.reset_parameters()
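
A quick shape check for MiniGPT as defined above; the vocabulary size of 83 matches the small data/tokenizer.json shown earlier, and the batch size and sequence length are arbitrary:

import torch
from model import MiniGPT

model = MiniGPT(vocab_size=83)      # in the training scripts this is len(tokenizer)
x = torch.randint(0, 83, (2, 16))   # batch of 2 sequences, 16 token ids each
logits = model(x)
print(logits.shape)                 # torch.Size([2, 16, 83]): one logit vector per position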
tokenizer.py ADDED
@@ -0,0 +1,171 @@
+ import json
+ import torch
+ from torch.utils.data import Dataset
+ import re
+ from collections import Counter
+
+
+ class ChatTokenizer:
+     def __init__(self, vocab_size=1000):
+         self.num_merges = vocab_size # stored under a different name so it does not clash with the vocab_size property below
+         self.token2id = {}
+         self.id2token = {}
+         self.bpe_ranks = {}
+
+     def tokenize(self, text):
+         words = re.findall(r"\w+|\S", text.lower())
+         return [' '.join(list(word)) + ' </w>' for word in words]
+
+     def get_stats(self, tokens):
+         pairs = Counter()
+         for token in tokens:
+             symbols = token.split()
+             for i in range(len(symbols) - 1):
+                 pairs[(symbols[i], symbols[i+1])] += 1
+         return pairs
+
+     def merge_pairs(self, tokens, pair):
+         pattern = re.escape(' '.join(pair))
+         replacement = ''.join(pair)
+         return [re.sub(rf'\b{pattern}\b', replacement, token) for token in tokens]
+
+     def train(self, texts):
+         tokens = []
+         for text in texts:
+             tokens.extend(self.tokenize(text))
+         vocab = Counter(tokens)
+
+         for _ in range(self.num_merges):
+             pairs = self.get_stats(vocab)
+             if not pairs:
+                 break
+             best = pairs.most_common(1)[0][0]
+             vocab = Counter(self.merge_pairs(vocab.elements(), best))
+             self.bpe_ranks[best] = _
+
+         final_tokens = set()
+         for token in vocab:
+             final_tokens.update(token.split())
+         final_tokens.update(["<PAD>", "<UNK>", "<END>", "^user:", "minigpt:"])
+         self.token2id = {tok: i for i, tok in enumerate(sorted(final_tokens))}
+         self.id2token = {i: tok for tok, i in self.token2id.items()}
+
+     def encode(self, text):
+         tokenized = self.tokenize(text)
+         for pair, _ in sorted(self.bpe_ranks.items(), key=lambda x: x[1]):
+             tokenized = self.merge_pairs(tokenized, pair)
+         ids = []
+         for token in tokenized:
+             for part in token.split():
+                 ids.append(self.token2id.get(part, self.token2id["<UNK>"]))
+         ids.append(self.token2id["<END>"])
+         return ids
+
+     def decode(self, token_ids):
+         tokens = [self.id2token.get(tid, "<UNK>") for tid in token_ids]
+         sentence = ""
+         for tok in tokens:
+             if tok == "<END>":
+                 break
+             elif tok == "</w>":
+                 sentence += " "
+             elif tok in {"<PAD>", "<UNK>"}:
+                 continue
+             else:
+                 sentence += tok
+         return sentence.strip()
+
+     def save(self, path):
+         with open(path, "w", encoding="utf-8") as f:
+             json.dump({
+                 "token2id": self.token2id,
+                 "bpe_ranks": {f"{a} {b}": r for (a, b), r in self.bpe_ranks.items()}
+             }, f)
+
+     def load(self, path):
+         with open(path, "r", encoding="utf-8") as f:
+             data = json.load(f)
+         self.token2id = {k: int(v) for k, v in data["token2id"].items()}
+         self.id2token = {v: k for k, v in self.token2id.items()}
+         self.bpe_ranks = {tuple(k.split()): v for k, v in data["bpe_ranks"].items()}
+
+     def __len__(self):
+         return len(self.token2id)
+
+     @property
+     def stoi(self):
+         return self.token2id
+
+     @property
+     def itos(self):
+         return self.id2token
+
+     @property
+     def vocab_size(self):
+         return len(self.token2id)
+
+
+ class ChatDataset(Dataset):
+     def __init__(self, file_path, tokenizer, block_size=64):
+         self.samples = []
+         with open(file_path, "r", encoding="utf-8") as f:
+             for line in f:
+                 line = line.strip()
+                 if not line:
+                     continue
+                 data = json.loads(line)
+                 text = data["text"].strip()
+
+                 # Wrap in format: ^User: ... MiniGPT: ...
+                 if not text.lower().startswith("^user:"):
+                     text = "^User: " + text
+                 if "MiniGPT:" not in text:
+                     text += "\nMiniGPT:"
+
+                 tokens = tokenizer.encode(text)
+
+                 for i in range(0, len(tokens) - block_size):
+                     x = tokens[i:i + block_size]
+                     y = tokens[i + 1:i + block_size + 1]
+                     self.samples.append((x, y))
+
+     def __len__(self):
+         return len(self.samples)
+
+     def __getitem__(self, idx):
+         x, y = self.samples[idx]
+         return torch.tensor(x), torch.tensor(y)
+
+
+
+
+ class ChatDataset(Dataset):
+     def __init__(self, file_path, tokenizer, block_size=64):
+         self.samples = []
+         with open(file_path, "r", encoding="utf-8") as f:
+             for line in f:
+                 line = line.strip()
+                 if not line:
+                     continue
+                 data = json.loads(line)
+                 text = data["text"].strip()
+
+                 # Wrap in format: ^User: ... MiniGPT: ...
+                 if not text.lower().startswith("^user:"):
+                     text = "^User: " + text
+                 if "MiniGPT:" not in text:
+                     text += "\nMiniGPT:"
+
+                 tokens = tokenizer.encode(text) + [tokenizer.stoi["<END>"]]
+
+                 for i in range(0, len(tokens) - block_size):
+                     x = tokens[i:i + block_size]
+                     y = tokens[i + 1:i + block_size + 1]
+                     self.samples.append((x, y))
+
+     def __len__(self):
+         return len(self.samples)
+
+     def __getitem__(self, idx):
+         x, y = self.samples[idx]
+         return torch.tensor(x), torch.tensor(y)
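
A minimal sketch of the ChatTokenizer BPE flow above, using two invented sentences; vocab_size here is the number of merge rounds the train loop will attempt:

from tokenizer import ChatTokenizer

tok = ChatTokenizer(vocab_size=50)
tok.train(["hello how are you", "hello what is your name"])

ids = tok.encode("hello how are you")   # BPE pieces mapped to ids, ending with <END>
print(ids)
print(tok.decode(ids))                  # decode stops at <END> and turns standalone </w> tokens into spaces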
train_custom.py ADDED
@@ -0,0 +1,19 @@
+ import torch
+ from dataset import MiniBPETokenizr, ChatDataset, train, SimpleTokenizr
+ from model import MiniGPT
+ import json
+
+ # Load and prepare
+ with open("./customchatbot-v1/data/filtered_data.jsonl", "r", encoding="utf-8") as f:
+     texts = [json.loads(line)["text"] for line in f if line.strip()]
+
+ tokenizer = SimpleTokenizr()
+ tokenizer.train(texts)
+ ch_path = "./customchatbot-v1/trained-mini-gpt/checkpoint-mini-gpt.pth"
+ dataset = ChatDataset("./customchatbot-v1/data/filtered_data.jsonl", tokenizer)
+ model = MiniGPT(vocab_size=len(tokenizer))
+ model.reset_params()
+ #model.load_state_dict(torch.load(ch_path))
+
+ # Train
+ train(model, dataset, tokenizer, epochs=3, filepathh="./customchatbot-v1/data/merged_data.jsonl")
train_custommade.py ADDED
@@ -0,0 +1,38 @@
+ import json
+ import torch.nn as nn
+ import torch
+ from model import MiniGPT
+ from dataset import DataLoader, ChatDataset, SimpleTokenizr
+ from tqdm import tqdm
+
+ with open("./customchatbot-v1/data/merged_data.jsonl", "r", encoding="utf-8") as f:
+     texts = [json.loads(line)["text"] for line in f if line.strip()]
+
+ tokenizer = SimpleTokenizr()
+ tokenizer.train(texts)
+
+ model = MiniGPT(vocab_size=len(tokenizer)) # vocab size must match the tokenizer, not a fixed 100
+
+ criterion = nn.CrossEntropyLoss()
+ optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model.to(device)
+
+ dataset = ChatDataset("./customchatbot-v1/data/merged_data.jsonl", tokenizer)
+ dataloader = DataLoader(dataset, batch_size=100, shuffle=True)
+
+ def Train(epochs):
+     for epoch in range(epochs):
+         model.train()
+         loop = tqdm(dataloader, total=len(dataloader), desc="Training")
+         tloss = 0
+         for x, y in loop:
+             x, y = x.to(device), y.to(device)
+             optimizer.zero_grad()
+             outputs = model(x) # feed the token batch, not the batch index
+             loss = criterion(outputs.view(-1, outputs.size(-1)), y.view(-1))
+             loss.backward()
+             optimizer.step()
+             tloss += loss.item()
+
+ Train(epochs=1)
trained-tiny-gpt/checkpoint-tiny-gpt.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7dc6d3e7756554064ba1cb7785ae75395d3fb6b74362e212b0029da91c79c2f2
+ size 66253943
trained-tiny-gpt/tokenizer.json ADDED
@@ -0,0 +1 @@
+ {"stoi": {"'": 0, "+": 1, ",": 2, "-": 3, ".": 4, "/": 5, "1": 6, "10": 7, "100": 8, "101": 9, "102": 10, "103": 11, "104": 12, "105": 13, "106": 14, "107": 15, "108": 16, "109": 17, "11": 18, "110": 19, "111": 20, "112": 21, "113": 22, "114": 23, "115": 24, "116": 25, "117": 26, "118": 27, "119": 28, "12": 29, "120": 30, "121": 31, "122": 32, "123": 33, "124": 34, "125": 35, "126": 36, "127": 37, "128": 38, "129": 39, "13": 40, "130": 41, "131": 42, "132": 43, "133": 44, "134": 45, "135": 46, "136": 47, "137": 48, "138": 49, "139": 50, "14": 51, "140": 52, "141": 53, "142": 54, "143": 55, "144": 56, "145": 57, "146": 58, "147": 59, "148": 60, "149": 61, "15": 62, "150": 63, "151": 64, "152": 65, "153": 66, "154": 67, "155": 68, "156": 69, "157": 70, "158": 71, "159": 72, "16": 73, "160": 74, "161": 75, "162": 76, "163": 77, "165": 78, "166": 79, "167": 80, "168": 81, "169": 82, "17": 83, "170": 84, "171": 85, "172": 86, "173": 87, "177": 88, "178": 89, "179": 90, "18": 91, "180": 92, "181": 93, "182": 94, "183": 95, "184": 96, "185": 97, "186": 98, "187": 99, "188": 100, "19": 101, "190": 102, "193": 103, "194": 104, "2": 105, "20": 106, "21": 107, "22": 108, "23": 109, "24": 110, "25": 111, "26": 112, "27": 113, "28": 114, "29": 115, "3": 116, "30": 117, "31": 118, "32": 119, "33": 120, "34": 121, "35": 122, "36": 123, "37": 124, "38": 125, "39": 126, "4": 127, "40": 128, "41": 129, "42": 130, "43": 131, "44": 132, "45": 133, "46": 134, "47": 135, "48": 136, "49": 137, "5": 138, "50": 139, "51": 140, "52": 141, "53": 142, "54": 143, "55": 144, "56": 145, "57": 146, "58": 147, "59": 148, "6": 149, "60": 150, "61": 151, "62": 152, "63": 153, "64": 154, "65": 155, "66": 156, "67": 157, "68": 158, "69": 159, "7": 160, "70": 161, "71": 162, "72": 163, "73": 164, "74": 165, "75": 166, "76": 167, "77": 168, "78": 169, "79": 170, "8": 171, "80": 172, "81": 173, "82": 174, "83": 175, "84": 176, "85": 177, "86": 178, "87": 179, "88": 180, "89": 181, "9": 182, "90": 183, "91": 184, "92": 185, "93": 186, "94": 187, "95": 188, "96": 189, "97": 190, "98": 191, "99": 192, ":": 193, "<": 194, "<END>": 195, "<PAD>": 196, "<UNK>": 197, ">": 198, "?": 199, "^": 200, "^user:": 201, "a": 202, "about": 203, "actions": 204, "add": 205, "ai": 206, "algorithm": 207, "allows": 208, "an": 209, "and": 210, "are": 211, "array": 212, "artificial": 213, "as": 214, "based": 215, "block": 216, "blueprint": 217, "book": 218, "boolean": 219, "bras": 220, "brazil": 221, "by": 222, "calculate": 223, "can": 224, "canada": 225, "capital": 226, "change": 227, "city": 228, "class": 229, "code": 230, "conclusions": 231, "conditions": 232, "convert": 233, "correct": 234, "creating": 235, "decision": 236, "deduction": 237, "define": 238, "delhi": 239, "democracy": 240, "derive": 241, "different": 242, "do": 243, "does": 244, "doesn": 245, "don": 246, "during": 247, "each": 248, "else": 249, "end": 250, "energy": 251, "error": 252, "execution": 253, "explain": 254, "false": 255, "fix": 256, "for": 257, "force": 258, "france": 259, "from": 260, "function": 261, "general": 262, "give": 263, "go": 264, "government": 265, "gravity": 266, "handles": 267, "has": 268, "have": 269, "he": 270, "help": 271, "how": 272, "human": 273, "i": 274, "if": 275, "in": 276, "india": 277, "instructions": 278, "intelligence": 279, "into": 280, "is": 281, "it": 282, "japan": 283, "know": 284, "late": 285, "lia": 286, "like": 287, "logic": 288, "loop": 289, "machines": 290, "making": 291, "me": 292, "mean": 293, "meaning": 294, "minigpt": 295, 
"minigpt:": 296, "multiple": 297, "new": 298, "objects": 299, "of": 300, "on": 301, "one": 302, "organized": 303, "other": 304, "ottawa": 305, "paris": 306, "perform": 307, "photosynthesis": 308, "plants": 309, "playing": 310, "please": 311, "plus": 312, "population": 313, "problem": 314, "programming": 315, "pulls": 316, "purpose": 317, "python": 318, "repeating": 319, "reusable": 320, "s": 321, "school": 322, "sentence": 323, "serves": 324, "set": 325, "she": 326, "should": 327, "simulation": 328, "solve": 329, "specific": 330, "statements": 331, "stores": 332, "sum": 333, "sunlight": 334, "system": 335, "t": 336, "tell": 337, "term": 338, "that": 339, "the": 340, "there": 341, "they": 342, "this": 343, "to": 344, "tokyo": 345, "toward": 346, "true": 347, "use": 348, "used": 349, "useful": 350, "user": 351, "value": 352, "values": 353, "variable": 354, "version": 355, "want": 356, "was": 357, "we": 358, "went": 359, "were": 360, "what": 361, "when": 362, "which": 363, "whole": 364, "why": 365, "yesterday": 366, "you": 367, "\u2014": 368}, "itos": {"0": "'", "1": "+", "2": ",", "3": "-", "4": ".", "5": "/", "6": "1", "7": "10", "8": "100", "9": "101", "10": "102", "11": "103", "12": "104", "13": "105", "14": "106", "15": "107", "16": "108", "17": "109", "18": "11", "19": "110", "20": "111", "21": "112", "22": "113", "23": "114", "24": "115", "25": "116", "26": "117", "27": "118", "28": "119", "29": "12", "30": "120", "31": "121", "32": "122", "33": "123", "34": "124", "35": "125", "36": "126", "37": "127", "38": "128", "39": "129", "40": "13", "41": "130", "42": "131", "43": "132", "44": "133", "45": "134", "46": "135", "47": "136", "48": "137", "49": "138", "50": "139", "51": "14", "52": "140", "53": "141", "54": "142", "55": "143", "56": "144", "57": "145", "58": "146", "59": "147", "60": "148", "61": "149", "62": "15", "63": "150", "64": "151", "65": "152", "66": "153", "67": "154", "68": "155", "69": "156", "70": "157", "71": "158", "72": "159", "73": "16", "74": "160", "75": "161", "76": "162", "77": "163", "78": "165", "79": "166", "80": "167", "81": "168", "82": "169", "83": "17", "84": "170", "85": "171", "86": "172", "87": "173", "88": "177", "89": "178", "90": "179", "91": "18", "92": "180", "93": "181", "94": "182", "95": "183", "96": "184", "97": "185", "98": "186", "99": "187", "100": "188", "101": "19", "102": "190", "103": "193", "104": "194", "105": "2", "106": "20", "107": "21", "108": "22", "109": "23", "110": "24", "111": "25", "112": "26", "113": "27", "114": "28", "115": "29", "116": "3", "117": "30", "118": "31", "119": "32", "120": "33", "121": "34", "122": "35", "123": "36", "124": "37", "125": "38", "126": "39", "127": "4", "128": "40", "129": "41", "130": "42", "131": "43", "132": "44", "133": "45", "134": "46", "135": "47", "136": "48", "137": "49", "138": "5", "139": "50", "140": "51", "141": "52", "142": "53", "143": "54", "144": "55", "145": "56", "146": "57", "147": "58", "148": "59", "149": "6", "150": "60", "151": "61", "152": "62", "153": "63", "154": "64", "155": "65", "156": "66", "157": "67", "158": "68", "159": "69", "160": "7", "161": "70", "162": "71", "163": "72", "164": "73", "165": "74", "166": "75", "167": "76", "168": "77", "169": "78", "170": "79", "171": "8", "172": "80", "173": "81", "174": "82", "175": "83", "176": "84", "177": "85", "178": "86", "179": "87", "180": "88", "181": "89", "182": "9", "183": "90", "184": "91", "185": "92", "186": "93", "187": "94", "188": "95", "189": "96", "190": "97", "191": "98", "192": "99", "193": ":", 
"194": "<", "195": "<END>", "196": "<PAD>", "197": "<UNK>", "198": ">", "199": "?", "200": "^", "201": "^user:", "202": "a", "203": "about", "204": "actions", "205": "add", "206": "ai", "207": "algorithm", "208": "allows", "209": "an", "210": "and", "211": "are", "212": "array", "213": "artificial", "214": "as", "215": "based", "216": "block", "217": "blueprint", "218": "book", "219": "boolean", "220": "bras", "221": "brazil", "222": "by", "223": "calculate", "224": "can", "225": "canada", "226": "capital", "227": "change", "228": "city", "229": "class", "230": "code", "231": "conclusions", "232": "conditions", "233": "convert", "234": "correct", "235": "creating", "236": "decision", "237": "deduction", "238": "define", "239": "delhi", "240": "democracy", "241": "derive", "242": "different", "243": "do", "244": "does", "245": "doesn", "246": "don", "247": "during", "248": "each", "249": "else", "250": "end", "251": "energy", "252": "error", "253": "execution", "254": "explain", "255": "false", "256": "fix", "257": "for", "258": "force", "259": "france", "260": "from", "261": "function", "262": "general", "263": "give", "264": "go", "265": "government", "266": "gravity", "267": "handles", "268": "has", "269": "have", "270": "he", "271": "help", "272": "how", "273": "human", "274": "i", "275": "if", "276": "in", "277": "india", "278": "instructions", "279": "intelligence", "280": "into", "281": "is", "282": "it", "283": "japan", "284": "know", "285": "late", "286": "lia", "287": "like", "288": "logic", "289": "loop", "290": "machines", "291": "making", "292": "me", "293": "mean", "294": "meaning", "295": "minigpt", "296": "minigpt:", "297": "multiple", "298": "new", "299": "objects", "300": "of", "301": "on", "302": "one", "303": "organized", "304": "other", "305": "ottawa", "306": "paris", "307": "perform", "308": "photosynthesis", "309": "plants", "310": "playing", "311": "please", "312": "plus", "313": "population", "314": "problem", "315": "programming", "316": "pulls", "317": "purpose", "318": "python", "319": "repeating", "320": "reusable", "321": "s", "322": "school", "323": "sentence", "324": "serves", "325": "set", "326": "she", "327": "should", "328": "simulation", "329": "solve", "330": "specific", "331": "statements", "332": "stores", "333": "sum", "334": "sunlight", "335": "system", "336": "t", "337": "tell", "338": "term", "339": "that", "340": "the", "341": "there", "342": "they", "343": "this", "344": "to", "345": "tokyo", "346": "toward", "347": "true", "348": "use", "349": "used", "350": "useful", "351": "user", "352": "value", "353": "values", "354": "variable", "355": "version", "356": "want", "357": "was", "358": "we", "359": "went", "360": "were", "361": "what", "362": "when", "363": "which", "364": "whole", "365": "why", "366": "yesterday", "367": "you", "368": "\u2014"}}
trainer_data_maker.py ADDED
File without changes