josh1234566532 committed on
Commit
476b208
·
verified ·
1 Parent(s): f6956ee

Create an LLM from scratch in JS, together with a tokenizer, that lets the user enter their training data and labels (text boxes) and then, after training, test the model. Also visualize the training loss, etc.

Browse files
Files changed (2) hide show
  1. README.md +8 -5
  2. index.html +473 -18
README.md CHANGED
@@ -1,10 +1,13 @@
1
  ---
2
- title: Brainybot Builder
3
- emoji: 🔥
4
- colorFrom: yellow
5
- colorTo: indigo
6
  sdk: static
7
  pinned: false
 
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
1
  ---
2
+ title: BrainyBot Builder 🧠
3
+ colorFrom: blue
4
+ colorTo: yellow
5
+ emoji: 🐳
6
  sdk: static
7
  pinned: false
8
+ tags:
9
+ - deepsite-v3
10
  ---
11
 
12
+ # Welcome to your new DeepSite project!
13
+ This project was created with [DeepSite](https://deepsite.hf.co).
index.html CHANGED
@@ -1,19 +1,474 @@
1
- <!doctype html>
2
- <html>
3
- <head>
4
- <meta charset="utf-8" />
5
- <meta name="viewport" content="width=device-width" />
6
- <title>My static Space</title>
7
- <link rel="stylesheet" href="style.css" />
8
- </head>
9
- <body>
10
- <div class="card">
11
- <h1>Welcome to your static Space!</h1>
12
- <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
13
- <p>
14
- Also don't forget to check the
15
- <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
16
- </p>
17
- </div>
18
- </body>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  </html>
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>BrainyBot Builder - Custom LLM Creator</title>
7
+ <script src="https://cdn.tailwindcss.com"></script>
8
+ <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
9
+ <script src="https://unpkg.com/feather-icons"></script>
10
+ <script src="https://cdn.jsdelivr.net/npm/vanta@latest/dist/vanta.net.min.js"></script>
11
+ <style>
12
+ .gradient-bg {
13
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
14
+ }
15
+ .code-block {
16
+ font-family: 'Courier New', monospace;
17
+ background-color: rgba(0,0,0,0.7);
18
+ color: #f8f8f2;
19
+ border-radius: 0.5rem;
20
+ padding: 1rem;
21
+ overflow-x: auto;
22
+ }
23
+ .neon-glow {
24
+ box-shadow: 0 0 10px rgba(59, 130, 246, 0.8);
25
+ }
26
+ .token {
27
+ padding: 0.2rem 0.4rem;
28
+ background-color: rgba(59, 130, 246, 0.2);
29
+ border-radius: 0.25rem;
30
+ margin-right: 0.25rem;
31
+ display: inline-block;
32
+ margin-bottom: 0.25rem;
33
+ }
34
+ </style>
35
+ </head>
36
+ <body class="min-h-screen bg-gray-900 text-gray-100">
37
+ <div id="vanta-bg" class="fixed inset-0 opacity-20"></div>
38
+
39
+ <div class="relative z-10 container mx-auto px-4 py-12">
40
+ <header class="text-center mb-12">
41
+ <h1 class="text-5xl font-bold mb-4 bg-clip-text text-transparent gradient-bg">BrainyBot Builder</h1>
42
+ <p class="text-xl text-gray-300">Create your own miniature LLM from scratch!</p>
43
+ </header>
44
+
45
+ <div class="grid grid-cols-1 lg:grid-cols-2 gap-8">
46
+ <!-- Training Section -->
47
+ <div class="bg-gray-800 rounded-xl p-6 shadow-lg neon-glow">
48
+ <h2 class="text-2xl font-semibold mb-4 flex items-center">
49
+ <i data-feather="cpu" class="mr-2"></i> Model Training
50
+ </h2>
51
+
52
+ <div class="mb-6">
53
+ <label class="block mb-2 text-sm font-medium">Training Data (one sample per line)</label>
54
+ <textarea id="training-data" rows="6" class="w-full bg-gray-700 rounded-lg p-4 text-gray-100 border border-gray-600 focus:border-blue-500 focus:ring-blue-500" placeholder="Enter your training text here..."></textarea>
55
+ </div>
56
+
57
+ <div class="mb-6">
58
+ <label class="block mb-2 text-sm font-medium">Labels (one per line, matching training data)</label>
59
+ <textarea id="training-labels" rows="3" class="w-full bg-gray-700 rounded-lg p-4 text-gray-100 border border-gray-600 focus:border-blue-500 focus:ring-blue-500" placeholder="Enter corresponding labels..."></textarea>
60
+ </div>
61
+
62
+ <div class="grid grid-cols-2 gap-4 mb-6">
63
+ <div>
64
+ <label class="block mb-2 text-sm font-medium">Epochs</label>
65
+ <input id="epochs" type="number" min="1" max="1000" value="10" class="w-full bg-gray-700 rounded-lg p-2 text-gray-100 border border-gray-600">
66
+ </div>
67
+ <div>
68
+ <label class="block mb-2 text-sm font-medium">Learning Rate</label>
69
+ <input id="learning-rate" type="number" step="0.001" min="0.0001" max="1" value="0.01" class="w-full bg-gray-700 rounded-lg p-2 text-gray-100 border border-gray-600">
70
+ </div>
71
+ </div>
72
+
73
+ <button id="train-btn" class="w-full py-3 px-4 bg-blue-600 hover:bg-blue-700 rounded-lg font-medium transition-colors flex items-center justify-center">
74
+ <i data-feather="activity" class="mr-2"></i> Train Model
75
+ </button>
76
+ </div>
77
+
78
+ <!-- Tokenizer & Testing Section -->
79
+ <div class="bg-gray-800 rounded-xl p-6 shadow-lg neon-glow">
80
+ <h2 class="text-2xl font-semibold mb-4 flex items-center">
81
+ <i data-feather="code" class="mr-2"></i> Tokenizer & Testing
82
+ </h2>
83
+
84
+ <div class="mb-6">
85
+ <label class="block mb-2 text-sm font-medium">Tokenizer Output</label>
86
+ <div id="tokenizer-output" class="code-block min-h-20 p-4">
87
+ Tokens will appear here...
88
+ </div>
89
+ </div>
90
+
91
+ <div class="mb-6">
92
+ <label class="block mb-2 text-sm font-medium">Test Input</label>
93
+ <input id="test-input" class="w-full bg-gray-700 rounded-lg p-3 text-gray-100 border border-gray-600 focus:border-blue-500 focus:ring-blue-500" placeholder="Type something to test...">
94
+ </div>
95
+
96
+ <div class="mb-6">
97
+ <label class="block mb-2 text-sm font-medium">Model Prediction</label>
98
+ <div id="model-output" class="code-block min-h-20 p-4">
99
+ Predictions will appear here...
100
+ </div>
101
+ </div>
102
+
103
+ <button id="test-btn" class="w-full py-3 px-4 bg-purple-600 hover:bg-purple-700 rounded-lg font-medium transition-colors flex items-center justify-center">
104
+ <i data-feather="play" class="mr-2"></i> Test Model
105
+ </button>
106
+ </div>
107
+ </div>
108
+
109
+ <!-- Training Progress Section -->
110
+ <div id="progress-section" class="mt-8 bg-gray-800 rounded-xl p-6 shadow-lg neon-glow hidden">
111
+ <h2 class="text-2xl font-semibold mb-4 flex items-center">
112
+ <i data-feather="bar-chart-2" class="mr-2"></i> Training Progress
113
+ </h2>
114
+
115
+ <div class="mb-4">
116
+ <div class="flex justify-between mb-1">
117
+ <span id="epoch-progress" class="text-sm font-medium">Epoch: 0/0</span>
118
+ <span id="loss-value" class="text-sm font-medium">Loss: -</span>
119
+ </div>
120
+ <div class="w-full bg-gray-700 rounded-full h-2.5">
121
+ <div id="progress-bar" class="bg-blue-600 h-2.5 rounded-full" style="width: 0%"></div>
122
+ </div>
123
+ </div>
124
+
125
+ <canvas id="loss-chart" class="w-full h-64"></canvas>
126
+ </div>
127
+ </div>
128
+
129
+ <script>
130
+ // Initialize Vanta.js background
131
// The animated background is purely decorative. If the Vanta script failed to
// load, or the effect cannot start, log a warning and carry on so the rest of
// this inline script (tokenizer, model, event handlers) still runs.
// NOTE(review): Vanta's NET effect is rendered with three.js, and no three.js
// <script> tag is visible in this page's <head> — confirm it is loaded.
if (typeof VANTA !== 'undefined' && typeof VANTA.NET === 'function') {
    try {
        VANTA.NET({
            el: "#vanta-bg",
            mouseControls: true,
            touchControls: true,
            gyroControls: false,
            minHeight: 200.00,
            minWidth: 200.00,
            scale: 1.00,
            scaleMobile: 1.00,
            color: 0x3b82f6,
            backgroundColor: 0x111827,
            points: 10.00,
            maxDistance: 20.00,
            spacing: 15.00
        });
    } catch (err) {
        console.warn('Vanta background could not be initialized:', err);
    }
} else {
    console.warn('Vanta.js is not available; skipping animated background.');
}

// Initialize Feather Icons (swaps every <i data-feather="..."> placeholder for an SVG).
// Guarded for the same reason as above: missing icons must not stop the app.
if (typeof feather !== 'undefined' && typeof feather.replace === 'function') {
    feather.replace();
} else {
    console.warn('Feather Icons is not available; icons will not be rendered.');
}
149
+
150
+ // Simple Tokenizer
151
/**
 * Word-level tokenizer that maps lowercase words to integer ids.
 *
 * A word is a run of `\w` characters (ASCII letters, digits and underscore);
 * everything else acts as a separator. Ids are assigned in order of first
 * appearance, starting at 0. Unknown words are reported as -1.
 */
class SimpleTokenizer {
    constructor() {
        // Null-prototype dictionaries: words such as "constructor" must not be
        // mistaken for members inherited from Object.prototype.
        this.vocab = Object.create(null);
        this.inverseVocab = Object.create(null);
        this.vocabSize = 0;
    }

    /**
     * Builds the vocabulary from the given texts, replacing any previous one.
     * @param {string[]} texts - Training samples.
     */
    fit(texts) {
        const allText = texts.join(' ');
        const tokens = allText.toLowerCase().match(/\b\w+\b/g) || [];
        const uniqueTokens = [...new Set(tokens)];

        // Start from empty dictionaries so a second fit() cannot leave stale
        // words behind whose ids collide with, or exceed, the new vocabulary.
        this.vocab = Object.create(null);
        this.inverseVocab = Object.create(null);

        uniqueTokens.forEach((token, index) => {
            this.vocab[token] = index;
            this.inverseVocab[index] = token;
        });

        this.vocabSize = uniqueTokens.length;
    }

    /**
     * Converts text into a list of token ids.
     * @param {string} text - Text to tokenize.
     * @returns {number[]} One id per word; -1 for words outside the vocabulary.
     */
    tokenize(text) {
        const tokens = text.toLowerCase().match(/\b\w+\b/g) || [];
        // Membership test rather than `||`: id 0 is a valid token, not "missing".
        return tokens.map((token) => (token in this.vocab ? this.vocab[token] : -1));
    }

    /**
     * Converts token ids back into a space-separated string.
     * @param {number[]} indices - Token ids.
     * @returns {string} The words joined by spaces; unknown ids become "[UNK]".
     */
    detokenize(indices) {
        return indices
            .map((idx) => (idx in this.inverseVocab ? this.inverseVocab[idx] : '[UNK]'))
            .join(' ');
    }
}
180
+
181
+ // Simple Neural Network
182
/**
 * Single-layer softmax classifier over a bag-of-words representation.
 *
 * `forward` and `trainStep` take a list of token ids (the output of
 * SimpleTokenizer.tokenize). Each id in [0, inputSize) selects one row of the
 * weight matrix, and the row is weighted by how often the id occurs in the
 * input. Ids outside that range, such as the tokenizer's -1 "unknown" marker,
 * are ignored.
 */
class SimpleLLM {
    /**
     * @param {number} inputSize - Vocabulary size (number of weight rows).
     * @param {number} outputSize - Number of classes.
     */
    constructor(inputSize, outputSize) {
        this.inputSize = inputSize;
        this.outputSize = outputSize;
        // Small random weights in [-0.1, 0.1); biases start at zero.
        this.weights = Array(inputSize).fill().map(() =>
            Array(outputSize).fill().map(() => Math.random() * 0.2 - 0.1)
        );
        this.bias = Array(outputSize).fill(0);
    }

    /**
     * Counts how often each valid token id occurs.
     * @param {number[]} tokenIds - Token ids; invalid ids are skipped.
     * @returns {Map<number, number>} Token id -> occurrence count.
     */
    countTokens(tokenIds) {
        const counts = new Map();
        for (const id of tokenIds) {
            if (Number.isInteger(id) && id >= 0 && id < this.inputSize) {
                counts.set(id, (counts.get(id) || 0) + 1);
            }
        }
        return counts;
    }

    /**
     * Numerically stable softmax.
     * @param {number[]} logits - Raw class scores.
     * @returns {number[]} Probabilities that sum to 1.
     */
    softmax(logits) {
        // Subtracting the maximum keeps Math.exp from overflowing.
        const maxLogit = Math.max(...logits);
        const exps = logits.map(l => Math.exp(l - maxLogit));
        const sumExps = exps.reduce((a, b) => a + b, 0);
        return exps.map(exp => exp / sumExps);
    }

    /**
     * Computes class probabilities from token counts.
     * @param {Map<number, number>} counts - Result of countTokens.
     * @returns {number[]} Probability per class.
     */
    predictFromCounts(counts) {
        const logits = this.bias.slice();
        for (const [id, count] of counts) {
            const row = this.weights[id];
            for (let j = 0; j < this.outputSize; j++) {
                logits[j] += count * row[j];
            }
        }
        return this.softmax(logits);
    }

    /**
     * Predicts class probabilities for a tokenized input.
     * @param {number[]} input - Token ids.
     * @returns {number[]} Probability per class.
     */
    forward(input) {
        return this.predictFromCounts(this.countTokens(input));
    }

    /**
     * Performs one stochastic-gradient step on a single sample.
     * @param {number[]} input - Token ids of the sample.
     * @param {number} target - Index of the correct class.
     * @param {number} learningRate - Step size.
     * @returns {number} Cross-entropy loss of the prediction made before the update.
     */
    trainStep(input, target, learningRate) {
        const counts = this.countTokens(input);
        const prediction = this.predictFromCounts(counts);
        // Gradient of softmax cross-entropy w.r.t. the logits: p - one_hot(target).
        const error = prediction.map((p, i) => p - (i === target ? 1 : 0));

        // Update weights; only rows of tokens present in the input have a non-zero gradient.
        for (const [id, count] of counts) {
            const row = this.weights[id];
            for (let j = 0; j < this.outputSize; j++) {
                row[j] -= learningRate * error[j] * count;
            }
        }

        // Update bias
        for (let j = 0; j < this.outputSize; j++) {
            this.bias[j] -= learningRate * error[j];
        }

        // Calculate loss (cross entropy); the epsilon avoids log(0).
        const loss = -Math.log(prediction[target] + 1e-10);
        return loss;
    }
}
237
+
238
+ // DOM Elements
239
// Cached references to the page elements this script reads from or writes to.
const byId = (id) => document.getElementById(id);

const trainBtn = byId('train-btn');
const testBtn = byId('test-btn');
const trainingData = byId('training-data');
const trainingLabels = byId('training-labels');
const testInput = byId('test-input');
const tokenizerOutput = byId('tokenizer-output');
const modelOutput = byId('model-output');
const progressSection = byId('progress-section');
const progressBar = byId('progress-bar');
const epochProgress = byId('epoch-progress');
const lossValue = byId('loss-value');

// Loss chart: a single line series, styled for the dark page background.
const CHART_TEXT_COLOR = 'rgb(209, 213, 219)';
const CHART_GRID_COLOR = 'rgba(255, 255, 255, 0.1)';

// Returns the options for one axis: the shared grid/tick colours plus any
// axis-specific settings passed in `extra`.
function buildAxisOptions(extra) {
    return Object.assign({}, extra, {
        grid: { color: CHART_GRID_COLOR },
        ticks: { color: CHART_TEXT_COLOR }
    });
}

const ctx = byId('loss-chart').getContext('2d');
const lossChart = new Chart(ctx, {
    type: 'line',
    data: {
        labels: [],
        datasets: [{
            label: 'Training Loss',
            data: [],
            borderColor: 'rgb(59, 130, 246)',
            tension: 0.1,
            fill: false
        }]
    },
    options: {
        responsive: true,
        plugins: {
            legend: {
                position: 'top',
                labels: { color: CHART_TEXT_COLOR }
            }
        },
        scales: {
            y: buildAxisOptions({ beginAtZero: true }),
            x: buildAxisOptions({})
        }
    }
});

// Mutable application state shared by the event handlers below.
let tokenizer = new SimpleTokenizer(); // vocabulary is built when training starts
let model = null;                      // stays null until the first training run
let labelMap = {};                     // label text -> class index
let inverseLabelMap = {};              // class index -> label text
let isTraining = false;                // true while a training run is in progress
303
+
304
+ // Event Listeners
305
/**
 * "Train Model" click handler.
 *
 * Reads the training samples, labels and hyperparameters from the form,
 * rebuilds the tokenizer vocabulary and the label mapping, creates a fresh
 * model and trains it sample by sample. The UI (epoch counter, progress bar,
 * running loss, loss chart) is updated as training proceeds. Invalid input is
 * reported with an alert and leaves the current model untouched.
 */
trainBtn.addEventListener('click', async () => {
    if (isTraining) return;

    const dataText = trainingData.value.trim();
    const labelsText = trainingLabels.value.trim();

    if (!dataText || !labelsText) {
        alert('Please provide both training data and labels');
        return;
    }

    // Trim each line so that "spam" and "spam " are the same label, then drop blank lines.
    const toLines = (text) => text.split('\n').map(line => line.trim()).filter(line => line);
    const dataLines = toLines(dataText);
    const labelLines = toLines(labelsText);

    if (dataLines.length !== labelLines.length) {
        alert('Number of training samples must match number of labels');
        return;
    }

    // Training parameters, validated before any existing state is replaced.
    const epochs = Number.parseInt(document.getElementById('epochs').value, 10);
    const learningRate = Number.parseFloat(document.getElementById('learning-rate').value);

    if (!Number.isInteger(epochs) || epochs < 1) {
        alert('Epochs must be a whole number of at least 1');
        return;
    }
    if (!Number.isFinite(learningRate) || learningRate <= 0) {
        alert('Learning rate must be a positive number');
        return;
    }

    // Create label mapping
    const uniqueLabels = [...new Set(labelLines)];
    labelMap = {};
    inverseLabelMap = {};
    uniqueLabels.forEach((label, idx) => {
        labelMap[label] = idx;
        inverseLabelMap[idx] = label;
    });

    // Initialize tokenizer and model
    tokenizer.fit(dataLines);
    model = new SimpleLLM(tokenizer.vocabSize, uniqueLabels.length);

    // Prepare training data
    const trainingSet = dataLines.map((text, idx) => ({
        input: tokenizer.tokenize(text),
        label: labelMap[labelLines[idx]]
    }));

    // Unbiased Fisher-Yates shuffle; returns a new array and leaves `items` untouched.
    const shuffle = (items) => {
        const result = items.slice();
        for (let i = result.length - 1; i > 0; i--) {
            const j = Math.floor(Math.random() * (i + 1));
            const tmp = result[i];
            result[i] = result[j];
            result[j] = tmp;
        }
        return result;
    };

    // Show progress section and clear the chart from any previous run
    progressSection.classList.remove('hidden');
    lossChart.data.labels = [];
    lossChart.data.datasets[0].data = [];
    lossChart.update();

    // Train model
    isTraining = true;
    try {
        trainBtn.disabled = true;
        trainBtn.innerHTML = '<i data-feather="loader" class="animate-spin mr-2"></i> Training...';
        feather.replace();

        for (let epoch = 0; epoch < epochs; epoch++) {
            epochProgress.textContent = `Epoch: ${epoch + 1}/${epochs}`;

            let epochLoss = 0;
            const shuffledSet = shuffle(trainingSet);

            for (let i = 0; i < shuffledSet.length; i++) {
                const {input, label} = shuffledSet[i];
                epochLoss += model.trainStep(input, label, learningRate);

                // Progress bar tracks the position within the current epoch
                const progress = ((i + 1) / shuffledSet.length) * 100;
                progressBar.style.width = `${progress}%`;

                // Update the running mean loss periodically
                if (i % 5 === 0 || i === shuffledSet.length - 1) {
                    lossValue.textContent = `Loss: ${(epochLoss / (i + 1)).toFixed(4)}`;

                    // Yield to the event loop so the browser can repaint
                    await new Promise(resolve => setTimeout(resolve, 0));
                }
            }

            // One chart point per charted epoch (every 5th epoch and the last one),
            // holding that epoch's mean loss.
            if (epoch % 5 === 0 || epoch === epochs - 1) {
                lossChart.data.labels.push(`Epoch ${epoch + 1}`);
                lossChart.data.datasets[0].data.push(epochLoss / shuffledSet.length);
                lossChart.update();
            }
        }
    } finally {
        // Always restore the button, even if training threw, so the user can retry.
        isTraining = false;
        trainBtn.disabled = false;
        trainBtn.innerHTML = '<i data-feather="activity" class="mr-2"></i> Train Model';
        feather.replace();
    }

    // Show tokenizer output
    updateTokenizerOutput();
});
406
+
407
/**
 * "Test Model" click handler.
 *
 * Tokenizes the test input, runs it through the trained model and renders the
 * predicted label, its confidence and a probability bar for every class.
 */
testBtn.addEventListener('click', () => {
    if (!model) {
        alert('Please train the model first');
        return;
    }

    const testText = testInput.value.trim();
    if (!testText) {
        alert('Please enter some text to test');
        return;
    }

    // Labels come straight from the user's text box and are inserted through
    // innerHTML below, so they must be escaped to render as text, not markup.
    const escapeHtml = (value) => String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');

    // Tokenize input
    const tokens = tokenizer.tokenize(testText);

    // Make prediction
    const prediction = model.forward(tokens);
    const maxIdx = prediction.indexOf(Math.max(...prediction));
    const predictedLabel = inverseLabelMap[maxIdx];
    const confidence = prediction[maxIdx];

    // Display results
    modelOutput.innerHTML = `
        <div class="mb-2">Predicted: <span class="font-bold">${escapeHtml(predictedLabel)}</span></div>
        <div class="mb-2">Confidence: <span class="font-bold">${(confidence * 100).toFixed(2)}%</span></div>
        <div class="text-sm">Probabilities:</div>
        <div class="mt-2">
            ${prediction.map((p, idx) => `
                <div class="flex items-center mb-1">
                    <div class="w-24">${escapeHtml(inverseLabelMap[idx])}:</div>
                    <div class="flex-1 bg-gray-700 h-4 rounded-full overflow-hidden">
                        <div class="bg-blue-500 h-full" style="width: ${p * 100}%"></div>
                    </div>
                    <div class="w-16 text-right">${(p * 100).toFixed(1)}%</div>
                </div>
            `).join('')}
        </div>
    `;
});
446
+
447
/**
 * Renders the tokenizer preview panel: the current test text (or a default
 * sample when the input is empty), its token ids, and the vocabulary size.
 * Unknown tokens (id -1) are shown as a red "[UNK]" chip.
 */
function updateTokenizerOutput() {
    // The text is typed by the user and inserted through innerHTML below,
    // so it must be escaped to render as text rather than markup.
    const escapeHtml = (value) => String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');

    const testText = testInput.value.trim() || "example text to tokenize";
    const tokens = tokenizer.tokenize(testText);

    tokenizerOutput.innerHTML = `
        <div class="mb-2">Text: "${escapeHtml(testText)}"</div>
        <div class="mb-2">Tokens:</div>
        <div class="flex flex-wrap">
            ${tokens.map(t => t === -1 ?
                '<span class="token bg-red-900">[UNK]</span>' :
                `<span class="token">${t}</span>`
            ).join('')}
        </div>
        <div class="mt-4">Vocabulary size: ${tokenizer.vocabSize}</div>
    `;
}
463
+
464
// Refresh the tokenizer preview while the user types, but only once a
// vocabulary exists (i.e. after training has fitted the tokenizer).
function handleTestInputChange() {
    const hasVocabulary = tokenizer.vocabSize > 0;
    if (!hasVocabulary) {
        return;
    }
    updateTokenizerOutput();
}

testInput.addEventListener('input', handleTestInputChange);

// Render the preview once on page load, using the default sample text.
updateTokenizerOutput();
472
+ </script>
473
+ </body>
474
  </html>