Upload folder using huggingface_hub
Browse files- README.md +89 -0
- loss_history.json +1 -0
- model.py +123 -0
- model.weights.h5 +3 -0
- model_config.json +8 -0
- tokenizer.json +1 -0
- tokenizer.py +104 -0
- training_data.py +159 -0
- training_state.json +7 -0
README.md
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
tags:
|
| 4 |
+
- custom-architecture
|
| 5 |
+
- from-scratch
|
| 6 |
+
- language-model
|
| 7 |
+
- non-transformer
|
| 8 |
+
- tensorflow
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# TERA V2
|
| 12 |
+
|
| 13 |
+
A language model built entirely from scratch. No pretrained weights. No standard transformers.
|
| 14 |
+
|
| 15 |
+
## Architecture
|
| 16 |
+
|
| 17 |
+
TERA V2 uses a custom non-transformer architecture with the following components:
|
| 18 |
+
|
| 19 |
+
- **Time Mix** for sequence mixing
|
| 20 |
+
- **Token Shift** for position encoding
|
| 21 |
+
- **GroupNorm** for normalization
|
| 22 |
+
- **Channel Mix** with **Squared ReLU** for feed-forward
|
| 23 |
+
- **Stochastic Depth** for regularization
|
| 24 |
+
- **Untied Embeddings**
|
| 25 |
+
|
| 26 |
+
## Model Specifications
|
| 27 |
+
|
| 28 |
+
| Specification | Value |
|
| 29 |
+
|---------------|-------|
|
| 30 |
+
| Parameters | ~726K |
|
| 31 |
+
| Vocabulary Size | 510 |
|
| 32 |
+
| Context Length | 32 tokens |
|
| 33 |
+
| Hidden Size (d_model) | 128 |
|
| 34 |
+
| Attention Heads | 4 |
|
| 35 |
+
| Layers | 3 |
|
| 36 |
+
| Framework | TensorFlow / Keras |
|
| 37 |
+
|
| 38 |
+
## Training Details
|
| 39 |
+
|
| 40 |
+
- Trained from scratch on clean question-answer pairs
|
| 41 |
+
- No pretrained weights were used at any stage
|
| 42 |
+
- Custom BPE-lite tokenizer trained on the same data
|
| 43 |
+
- Loss function: Sigmoid cross-entropy
|
| 44 |
+
- Optimizer: Adam with cosine learning rate schedule
|
| 45 |
+
- Training format: Q: question / A: answer
|
| 46 |
+
|
| 47 |
+
## How To Use
|
| 48 |
+
|
| 49 |
+
1. Download all files from this repository
|
| 50 |
+
2. Install TensorFlow
|
| 51 |
+
3. Load the tokenizer from tokenizer.json
|
| 52 |
+
4. Build the model using model_config.json
|
| 53 |
+
5. Load weights from model.weights.h5
|
| 54 |
+
6. Format input as: Q: your question here / A:
|
| 55 |
+
|
| 56 |
+
## Example Input and Output
|
| 57 |
+
|
| 58 |
+
Input: Q: What is the sun?
|
| 59 |
+
|
| 60 |
+
Output: The sun is a star at the center of our solar system.
|
| 61 |
+
|
| 62 |
+
Input: Q: Hello
|
| 63 |
+
|
| 64 |
+
Output: Hello! How can I help you today?
|
| 65 |
+
|
| 66 |
+
## Files Included
|
| 67 |
+
|
| 68 |
+
| File | Description |
|
| 69 |
+
|------|-------------|
|
| 70 |
+
| model.py | Model architecture code |
|
| 71 |
+
| tokenizer.py | Tokenizer class code |
|
| 72 |
+
| model_config.json | Model hyperparameters |
|
| 73 |
+
| tokenizer.json | Trained tokenizer vocabulary |
|
| 74 |
+
| model.weights.h5 | Trained model weights |
|
| 75 |
+
| training_data.py | Training data used |
|
| 76 |
+
| loss_history.json | Training loss over epochs |
|
| 77 |
+
| training_state.json | Final training stats |
|
| 78 |
+
|
| 79 |
+
## Live Demo
|
| 80 |
+
|
| 81 |
+
Try TERA V2 live at: https://huggingface.co/spaces/vedaco/tera.v2
|
| 82 |
+
|
| 83 |
+
## Created By
|
| 84 |
+
|
| 85 |
+
**Vedaco Team**
|
| 86 |
+
|
| 87 |
+
## License
|
| 88 |
+
|
| 89 |
+
Apache 2.0
|
loss_history.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
[0.6777933619239114, 0.5489626540379091, 0.4032443734732541, 0.26397443088618194, 0.15982136299664323, 0.09749744189056483, 0.06182015213099393, 0.0428296664560383, 0.03248495159840042, 0.025369571521878242, 0.021231550130654465, 0.018688730302859436, 0.016952331685884434, 0.015737138011238792, 0.014826155436987227, 0.014117024627260187, 0.013573957268487324, 0.01315675616603006, 0.012814416677098383, 0.012533508495173672, 0.012289026963778517, 0.012074213813651691, 0.01189823838120157, 0.011761364738710901, 0.011589584533463825, 0.011966043363579294, 0.01161787256767804, 0.011408125643025745, 0.011274495712396774, 0.011165056843310595, 0.011029016692191362, 0.010845381681892004, 0.010592628820714626, 0.010364154904064808, 0.010159476067532192, 0.00998176181350242, 0.009784147863022306, 0.009623614012856375, 0.009454317094588821, 0.009300041943788528, 0.009120205755938183, 0.00895718980411237, 0.009220898109064861, 0.008909361987290058, 0.008606438228691166, 0.008437910540537401, 0.008305993100458925, 0.00823848694562912, 0.008148690431632778, 0.00797634421509098, 0.007858005860312418, 0.0078092869582839985, 0.007748601323162968, 0.00757513161409985, 0.007489391877739267, 0.007408449722623283, 0.007368005410006101, 0.007257567168298093, 0.007161199517378753, 0.007046696624125947, 0.0070016061548482285, 0.006926002552394162, 0.006956193638457494, 0.006749255032363263, 0.0066776057832281695, 0.006610508119179444, 0.006581316511570053, 0.006499545902691104, 0.006386171222071756, 0.006319214293563908, 0.006356591824442148, 0.0062893539963459425, 0.006106956544416872, 0.0059958675622262736, 0.005961307959461754, 0.005908086895942688, 0.005849096733568745, 0.00573302642442286, 0.005699795543808828, 0.005671053786169399, 0.005529552338306199, 0.005498948591676625, 0.005461397686634551, 0.005410658513111147, 0.005312148841436614, 0.00527636313133619, 0.005222966788675298, 0.005229999988593839, 0.005107997980138118, 0.005010555328970606, 0.00491664203053171, 
0.00496457957408645, 0.00481597268530591, 0.004795897032388232, 0.004743322941728614, 0.0048229656901887874, 0.0046987513037906456, 0.004657887070524422, 0.004554929698563435, 0.004548625491390174, 0.004407775630666451, 0.004414369881322438, 0.00431428721640259, 0.004356119792315771, 0.004387144824828614, 0.004357563755051656, 0.004307111280716278, 0.004135416132736613, 0.00413576130416583, 0.004046515815637328, 0.004142476647922938, 0.004140938431109217, 0.003963580217466436, 0.004037238668057729, 0.003962569287978113, 0.003937093922021714, 0.0038565707138993525, 0.003940338300625709, 0.003931608400307596, 0.003903620791706172, 0.0037724797148257494, 0.0038640993350947447, 0.003898362163454294, 0.003828189970756119, 0.003967069512741132, 0.0038113788997923784, 0.0037268155720084906, 0.003710272489115596, 0.0036456402158364654, 0.0036354912656613374, 0.0037737022132866764, 0.0038144190330058336, 0.003617019043304026, 0.0036560428138314323, 0.0037154273066500373, 0.0038191729690879583, 0.003767555674792013, 0.003644285575402054, 0.0036442654038017445, 0.003684694339013235, 0.0036293871340934525, 0.003760792018676346, 0.0035645059860226784, 0.0036268330983478913, 0.0036453565963628616, 0.0036202340323308654, 0.003568362889134071, 0.003619104136966846, 0.003567851836454462, 0.0036325537684288893]
|
model.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import tensorflow as tf
|
| 2 |
+
import numpy as np
|
| 3 |
+
import json
|
| 4 |
+
|
| 5 |
+
# ---- Token Shift ----
|
| 6 |
+
class TokenShift(tf.keras.layers.Layer):
    """Blend each timestep with its predecessor (zero-padded at t=0).

    Output[t] is the mean of input[t] and input[t-1]; the first position
    is averaged with a zero vector.
    """

    def call(self, x):
        # Build a one-step-right-shifted copy: prepend a zero frame, then
        # drop the final timestep so the shapes line up.
        zero_front = tf.zeros_like(x[:, :1, :])
        previous = tf.concat([zero_front, x[:, :-1, :]], axis=1)
        return 0.5 * (x + previous)
|
| 10 |
+
|
| 11 |
+
# ---- Time Mix ----
|
| 12 |
+
class TimeMix(tf.keras.layers.Layer):
    """Causal multi-head scaled-dot-product mixing over the time axis.

    The input first passes through TokenShift (each position averaged with
    its predecessor), then through one fused Q/K/V projection and standard
    causal (lower-triangular) attention.

    Args:
        d_model: model width; must be divisible by ``n_heads``.
        n_heads: number of attention heads.
    """

    def __init__(self, d_model, n_heads, **kwargs):
        super().__init__(**kwargs)
        if d_model % n_heads != 0:
            # Fail fast: a non-divisible split would silently corrupt the
            # reshape in call().
            raise ValueError(
                f"d_model ({d_model}) must be divisible by n_heads ({n_heads})"
            )
        self.d_model = d_model
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads
        self.shift = TokenShift()
        # Single fused projection producing Q, K and V (no biases).
        self.qkv = tf.keras.layers.Dense(3 * d_model, use_bias=False)
        self.out_proj = tf.keras.layers.Dense(d_model, use_bias=False)

    def call(self, x, training=False):
        x = self.shift(x)
        B, T, C = tf.shape(x)[0], tf.shape(x)[1], self.d_model
        qkv = self.qkv(x)
        q, k, v = tf.split(qkv, 3, axis=-1)

        # [B, T, C] -> [B, n_heads, T, head_dim]
        q = tf.transpose(tf.reshape(q, [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])
        k = tf.transpose(tf.reshape(k, [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])
        v = tf.transpose(tf.reshape(v, [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])

        scale = tf.math.sqrt(tf.cast(self.head_dim, tf.float32))
        attn = tf.matmul(q, k, transpose_b=True) / scale

        # Causal mask: each position may attend only to itself and earlier
        # positions; masked scores get a large negative bias before softmax.
        mask = tf.linalg.band_part(tf.ones([T, T]), -1, 0)
        mask = mask[tf.newaxis, tf.newaxis, :, :]
        attn = attn * mask + (1.0 - mask) * -1e9
        attn = tf.nn.softmax(attn, axis=-1)

        out = tf.matmul(attn, v)
        out = tf.transpose(out, [0, 2, 1, 3])  # back to [B, T, n_heads, head_dim]
        out = tf.reshape(out, [B, T, C])
        return self.out_proj(out)

    def get_config(self):
        # Previously missing: without it the layer cannot round-trip through
        # Keras serialization (consistent with TeraLM.get_config below).
        config = super().get_config()
        config.update({"d_model": self.d_model, "n_heads": self.n_heads})
        return config
|
| 47 |
+
|
| 48 |
+
# ---- Channel Mix (FFN) with Squared ReLU ----
|
| 49 |
+
class ChannelMix(tf.keras.layers.Layer):
    """Position-wise feed-forward block with a squared-ReLU activation.

    Applies TokenShift, expands the channel dimension by ``expand``, applies
    relu(h)^2, and projects back down to ``d_model``.
    """

    def __init__(self, d_model, expand=4, **kwargs):
        super().__init__(**kwargs)
        # Stored so get_config() can serialize the layer (previously these
        # constructor args were discarded).
        self.d_model = d_model
        self.expand = expand
        self.shift = TokenShift()
        self.fc1 = tf.keras.layers.Dense(d_model * expand, use_bias=False)
        self.fc2 = tf.keras.layers.Dense(d_model, use_bias=False)

    def call(self, x, training=False):
        x = self.shift(x)
        h = self.fc1(x)
        h = tf.square(tf.nn.relu(h))  # Squared ReLU
        return self.fc2(h)

    def get_config(self):
        config = super().get_config()
        config.update({"d_model": self.d_model, "expand": self.expand})
        return config
|
| 61 |
+
|
| 62 |
+
# ---- Single TERA Block ----
|
| 63 |
+
class TeraBlock(tf.keras.layers.Layer):
    """Pre-norm residual block: TimeMix then ChannelMix, with stochastic depth.

    During training, with probability ``drop_rate`` the whole block is
    skipped and the input returned unchanged.
    """

    def __init__(self, d_model, n_heads, drop_rate=0.0, **kwargs):
        super().__init__(**kwargs)
        self.norm1 = tf.keras.layers.GroupNormalization(groups=4, axis=-1)
        self.time_mix = TimeMix(d_model, n_heads)
        self.norm2 = tf.keras.layers.GroupNormalization(groups=4, axis=-1)
        self.channel_mix = ChannelMix(d_model)
        self.drop_rate = drop_rate

    def _residual(self, x, training):
        # Standard pre-norm residual wiring: normalize, mix, add.
        h = self.norm1(x)
        x = x + self.time_mix(h, training=training)
        h = self.norm2(x)
        x = x + self.channel_mix(h, training=training)
        return x

    def call(self, x, training=False):
        # Stochastic depth. tf.cond makes the random skip graph-mode safe:
        # the original Python `if` on a tensor predicate
        # (tf.random.uniform([]) < drop_rate) only works eagerly and raises
        # inside tf.function.
        if training and self.drop_rate > 0.0:
            return tf.cond(
                tf.random.uniform([]) < self.drop_rate,
                lambda: x,
                lambda: self._residual(x, training),
            )
        return self._residual(x, training)
|
| 83 |
+
|
| 84 |
+
# ---- TERA LM ----
|
| 85 |
+
class TeraLM(tf.keras.Model):
    """TERA V2 language model: token + position embeddings, a stack of
    TeraBlocks, final GroupNorm, and an untied LM head.

    The per-block stochastic-depth rate increases linearly with depth, from
    0 at the first block up to ``drop_rate`` at the last.

    Args:
        vocab_size: size of the token vocabulary (output logit dimension).
        d_model: model width.
        n_heads: attention heads per TimeMix.
        n_layers: number of TeraBlocks.
        max_seq: maximum sequence length (learned absolute positions).
        drop_rate: stochastic-depth rate of the deepest block.
    """

    def __init__(self, vocab_size, d_model=128, n_heads=4, n_layers=3,
                 max_seq=32, drop_rate=0.05, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.max_seq = max_seq
        self.drop_rate = drop_rate  # stored so get_config() can round-trip it

        self.tok_emb = tf.keras.layers.Embedding(vocab_size, d_model)
        # Learned absolute positions; sequences longer than max_seq are
        # out of range for this embedding.
        self.pos_emb = tf.keras.layers.Embedding(max_seq, d_model)
        self.blocks = [
            # Linear stochastic-depth schedule: shallow blocks always run,
            # deeper blocks are skipped more often during training.
            TeraBlock(d_model, n_heads,
                      drop_rate=drop_rate * (i / max(n_layers - 1, 1)))
            for i in range(n_layers)
        ]
        self.ln_f = tf.keras.layers.GroupNormalization(groups=4, axis=-1)
        # Untied output head: separate weights from tok_emb.
        self.head = tf.keras.layers.Dense(vocab_size, use_bias=False)

    def call(self, x, training=False):
        B, T = tf.shape(x)[0], tf.shape(x)[1]
        pos = tf.range(T)[tf.newaxis, :]
        h = self.tok_emb(x) + self.pos_emb(pos)
        for block in self.blocks:
            h = block(h, training=training)
        h = self.ln_f(h)
        return self.head(h)  # [B, T, vocab_size] logits

    def get_config(self):
        return {
            "vocab_size": self.vocab_size,
            "d_model": self.d_model,
            "n_heads": self.n_heads,
            "n_layers": self.n_layers,
            "max_seq": self.max_seq,
            # Previously omitted: from_config() silently fell back to the
            # default drop_rate, losing the trained configuration.
            "drop_rate": self.drop_rate,
        }

# Alias for compatibility
TeraAIModel = TeraLM
|
model.weights.h5
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9a5309b71b3d036c63a28e7f883f457776f67efb4b9d2aedef21d9d54081eae2
|
| 3 |
+
size 2997088
|
model_config.json
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"vocab_size": 510,
|
| 3 |
+
"d_model": 128,
|
| 4 |
+
"n_heads": 4,
|
| 5 |
+
"n_layers": 3,
|
| 6 |
+
"max_seq": 32,
|
| 7 |
+
"drop_rate": 0.05
|
| 8 |
+
}
|
tokenizer.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"word2id": {"<pad>": 0, "<unk>": 1, "<bos>": 2, "<eos>": 3, "!": 4, ",": 5, ".": 6, "4": 7, "5": 8, ":": 9, "?": 10, "a": 11, "b": 12, "c": 13, "d": 14, "e": 15, "f": 16, "g": 17, "h": 18, "i": 19, "j": 20, "k": 21, "l": 22, "m": 23, "n": 24, "o": 25, "p": 26, "q": 27, "r": 28, "s": 29, "t": 30, "u": 31, "v": 32, "w": 33, "x": 34, "y": 35, "z": 36, "is": 37, "what": 38, "the": 39, "and": 40, "that": 41, "you": 42, "of": 43, "do": 44, "to": 45, "am": 46, "are": 47, "an": 48, "can": 49, "for": 50, "water": 51, "how": 52, "two": 53, "we": 54, "in": 55, "plus": 56, "language": 57, "ai": 58, "from": 59, "body": 60, "by": 61, "through": 62, "with": 63, "five": 64, "earth": 65, "good": 66, "large": 67, "have": 68, "our": 69, "see": 70, "or": 71, "sky": 72, "very": 73, "one": 74, "three": 75, "food": 76, "why": 77, "help": 78, "tera": 79, "sun": 80, "planet": 81, "air": 82, "light": 83, "energy": 84, "work": 85, "rain": 86, "animal": 87, "ten": 88, "many": 89, "people": 90, "computer": 91, "on": 92, "eat": 93, "sleep": 94, "who": 95, "need": 96, "star": 97, "oxygen": 98, "other": 99, "world": 100, "not": 101, "hello": 102, "hi": 103, "name": 104, "my": 105, "made": 106, "us": 107, "sound": 108, "as": 109, "area": 110, "where": 111, "clouds": 112, "things": 113, "fish": 114, "internet": 115, "words": 116, "like": 117, "music": 118, "human": 119, "doing": 120, "well": 121, "thank": 122, "model": 123, "me": 124, "around": 125, "learn": 126, "think": 127, "sunlight": 128, "brain": 129, "today": 130, "hey": 131, "morning": 132, "day": 133, "night": 134, "your": 135, "no": 136, "care": 137, "moon": 138, "gravity": 139, "form": 140, "known": 141, "falls": 142, "cloud": 143, "collection": 144, "breathe": 145, "often": 146, "cat": 147, "fly": 148, "animals": 149, "live": 150, "used": 151, "writing": 152, "process": 153, "their": 154, "plants": 155, "blue": 156, "when": 157, "give": 158, "it": 159, "hear": 160, "tell": 161, "something": 162, "about": 163, "new": 164, "science": 
165, "book": 166, "time": 167, "study": 168, "hope": 169, "great": 170, "was": 171, "created": 172, "happy": 173, "know": 174, "answer": 175, "questions": 176, "conversations": 177, "at": 178, "system": 179, "orbits": 180, "liquid": 181, "objects": 182, "each": 183, "machine": 184, "computers": 185, "so": 186, "there": 187, "asking": 188, "assistant": 189, "built": 190, "scratch": 191, "fire": 192, "produces": 193, "heat": 194, "mars": 195, "gas": 196, "above": 197, "ground": 198, "snow": 199, "frozen": 200, "ice": 201, "its": 202, "living": 203, "tree": 204, "tall": 205, "trunk": 206, "dog": 207, "gills": 208, "bird": 209, "wings": 210, "elephant": 211, "lion": 212, "six": 213, "minus": 214, "times": 215, "divided": 216, "country": 217, "land": 218, "ocean": 219, "mountain": 220, "river": 221, "desert": 222, "gets": 223, "forest": 224, "trees": 225, "city": 226, "place": 227, "phone": 228, "make": 229, "robot": 230, "software": 231, "instructions": 232, "website": 233, "pages": 234, "coding": 235, "using": 236, "word": 237, "meaning": 238, "sentence": 239, "group": 240, "rules": 241, "english": 242, "reading": 243, "get": 244, "fruit": 245, "bread": 246, "milk": 247, "rice": 248, "birds": 249, "organ": 250, "heart": 251, "eyes": 252, "skin": 253, "bones": 254, "years": 255, "old": 256, "learning": 257, "feeling": 258, "go": 259, "feelings": 260, "but": 261, "enjoy": 262, "purpose": 263, "way": 264, "color": 265, "school": 266, "family": 267, "friend": 268, "someone": 269, "love": 270, "game": 271, "number": 272, "math": 273, "history": 274, "art": 275, "vedaco": 276, "team": 277, "welcome": 278, "thanks": 279, "problem": 280, "let": 281, "if": 282, "anything": 283, "else": 284, "bye": 285, "goodbye": 286, "wonderful": 287, "explain": 288, "topics": 289, "center": 290, "solar": 291, "natural": 292, "satellite": 293, "hydrogen": 294, "force": 295, "pulls": 296, "toward": 297, "third": 298, "home": 299, "four": 300, "processes": 301, "information": 302, "runs": 303, 
"programs": 304, "global": 305, "network": 306, "connects": 307, "stands": 308, "artificial": 309, "intelligence": 310, "which": 311, "technology": 312, "looks": 313, "because": 314, "scattered": 315, "atmosphere": 316, "rest": 317, "recover": 318, "later": 319, "take": 320, "mixture": 321, "gases": 322, "surrounds": 323, "chemical": 324, "reaction": 325, "allows": 326, "vibration": 327, "travels": 328, "materials": 329, "ability": 330, "cause": 331, "change": 332, "fourth": 333, "red": 334, "ball": 335, "hot": 336, "stars": 337, "tiny": 338, "drops": 339, "floating": 340, "cold": 341, "weather": 342, "solid": 343, "plant": 344, "branches": 345, "leaves": 346, "friendly": 347, "kept": 348, "pet": 349, "small": 350, "being": 351, "independent": 352, "playful": 353, "lives": 354, "breathes": 355, "feathers": 356, "long": 357, "wild": 358, "king": 359, "own": 360, "government": 361, "salt": 362, "landform": 363, "rises": 364, "high": 365, "stream": 366, "flows": 367, "across": 368, "dry": 369, "little": 370, "covered": 371, "device": 372, "calls": 373, "send": 374, "messages": 375, "perform": 376, "tasks": 377, "automatically": 378, "set": 379, "tells": 380, "programming": 381, "unit": 382, "has": 383, "expresses": 384, "complete": 385, "thought": 386, "communication": 387, "spoken": 388, "looking": 389, "understanding": 390, "putting": 391, "thoughts": 392, "into": 393, "paper": 394, "screen": 395, "stay": 396, "alive": 397, "sweet": 398, "grows": 399, "flour": 400, "yeast": 401, "white": 402, "produced": 403, "cows": 404, "grain": 405, "eaten": 406, "main": 407, "countries": 408, "move": 409, "taking": 410, "does": 411, "forms": 412, "heavy": 413, "down": 414, "needs": 415, "called": 416, "photosynthesis": 417, "vibrations": 418, "travel": 419, "reach": 420, "ears": 421, "controls": 422, "thinking": 423, "functions": 424, "pumps": 425, "blood": 426, "organs": 427, "allow": 428, "outer": 429, "covering": 430, "protects": 431, "hard": 432, "parts": 433, "inside": 434, 
"shape": 435, "support": 436, "interesting": 437, "did": 438, "honey": 439, "never": 440, "goes": 441, "bad": 442, "last": 443, "thousands": 444, "fact": 445, "billion": 446, "bored": 447, "space": 448, "sad": 449, "sorry": 450, "better": 451, "soon": 452, "here": 453, "glad": 454, "should": 455, "could": 456, "read": 457, "walk": 458, "much": 459, "generate": 460, "responses": 461, "smart": 462, "still": 463, "try": 464, "best": 465, "helpful": 466, "answers": 467, "just": 468, "recently": 469, "running": 470, "servers": 471, "run": 472, "electricity": 473, "code": 474, "measure": 475, "passing": 476, "moments": 477, "seconds": 478, "hours": 479, "reflects": 480, "off": 481, "different": 482, "ways": 483, "combination": 484, "sounds": 485, "arranged": 486, "pleasing": 487, "written": 488, "bound": 489, "together": 490, "related": 491, "spending": 492, "strong": 493, "affection": 494, "activity": 495, "done": 496, "fun": 497, "value": 498, "counting": 499, "measuring": 500, "numbers": 501, "shapes": 502, "patterns": 503, "universe": 504, "happened": 505, "past": 506, "expression": 507, "ideas": 508, "creative": 509}, "id2word": {"0": "<pad>", "1": "<unk>", "2": "<bos>", "3": "<eos>", "4": "!", "5": ",", "6": ".", "7": "4", "8": "5", "9": ":", "10": "?", "11": "a", "12": "b", "13": "c", "14": "d", "15": "e", "16": "f", "17": "g", "18": "h", "19": "i", "20": "j", "21": "k", "22": "l", "23": "m", "24": "n", "25": "o", "26": "p", "27": "q", "28": "r", "29": "s", "30": "t", "31": "u", "32": "v", "33": "w", "34": "x", "35": "y", "36": "z", "37": "is", "38": "what", "39": "the", "40": "and", "41": "that", "42": "you", "43": "of", "44": "do", "45": "to", "46": "am", "47": "are", "48": "an", "49": "can", "50": "for", "51": "water", "52": "how", "53": "two", "54": "we", "55": "in", "56": "plus", "57": "language", "58": "ai", "59": "from", "60": "body", "61": "by", "62": "through", "63": "with", "64": "five", "65": "earth", "66": "good", "67": "large", "68": "have", "69": 
"our", "70": "see", "71": "or", "72": "sky", "73": "very", "74": "one", "75": "three", "76": "food", "77": "why", "78": "help", "79": "tera", "80": "sun", "81": "planet", "82": "air", "83": "light", "84": "energy", "85": "work", "86": "rain", "87": "animal", "88": "ten", "89": "many", "90": "people", "91": "computer", "92": "on", "93": "eat", "94": "sleep", "95": "who", "96": "need", "97": "star", "98": "oxygen", "99": "other", "100": "world", "101": "not", "102": "hello", "103": "hi", "104": "name", "105": "my", "106": "made", "107": "us", "108": "sound", "109": "as", "110": "area", "111": "where", "112": "clouds", "113": "things", "114": "fish", "115": "internet", "116": "words", "117": "like", "118": "music", "119": "human", "120": "doing", "121": "well", "122": "thank", "123": "model", "124": "me", "125": "around", "126": "learn", "127": "think", "128": "sunlight", "129": "brain", "130": "today", "131": "hey", "132": "morning", "133": "day", "134": "night", "135": "your", "136": "no", "137": "care", "138": "moon", "139": "gravity", "140": "form", "141": "known", "142": "falls", "143": "cloud", "144": "collection", "145": "breathe", "146": "often", "147": "cat", "148": "fly", "149": "animals", "150": "live", "151": "used", "152": "writing", "153": "process", "154": "their", "155": "plants", "156": "blue", "157": "when", "158": "give", "159": "it", "160": "hear", "161": "tell", "162": "something", "163": "about", "164": "new", "165": "science", "166": "book", "167": "time", "168": "study", "169": "hope", "170": "great", "171": "was", "172": "created", "173": "happy", "174": "know", "175": "answer", "176": "questions", "177": "conversations", "178": "at", "179": "system", "180": "orbits", "181": "liquid", "182": "objects", "183": "each", "184": "machine", "185": "computers", "186": "so", "187": "there", "188": "asking", "189": "assistant", "190": "built", "191": "scratch", "192": "fire", "193": "produces", "194": "heat", "195": "mars", "196": "gas", "197": 
"above", "198": "ground", "199": "snow", "200": "frozen", "201": "ice", "202": "its", "203": "living", "204": "tree", "205": "tall", "206": "trunk", "207": "dog", "208": "gills", "209": "bird", "210": "wings", "211": "elephant", "212": "lion", "213": "six", "214": "minus", "215": "times", "216": "divided", "217": "country", "218": "land", "219": "ocean", "220": "mountain", "221": "river", "222": "desert", "223": "gets", "224": "forest", "225": "trees", "226": "city", "227": "place", "228": "phone", "229": "make", "230": "robot", "231": "software", "232": "instructions", "233": "website", "234": "pages", "235": "coding", "236": "using", "237": "word", "238": "meaning", "239": "sentence", "240": "group", "241": "rules", "242": "english", "243": "reading", "244": "get", "245": "fruit", "246": "bread", "247": "milk", "248": "rice", "249": "birds", "250": "organ", "251": "heart", "252": "eyes", "253": "skin", "254": "bones", "255": "years", "256": "old", "257": "learning", "258": "feeling", "259": "go", "260": "feelings", "261": "but", "262": "enjoy", "263": "purpose", "264": "way", "265": "color", "266": "school", "267": "family", "268": "friend", "269": "someone", "270": "love", "271": "game", "272": "number", "273": "math", "274": "history", "275": "art", "276": "vedaco", "277": "team", "278": "welcome", "279": "thanks", "280": "problem", "281": "let", "282": "if", "283": "anything", "284": "else", "285": "bye", "286": "goodbye", "287": "wonderful", "288": "explain", "289": "topics", "290": "center", "291": "solar", "292": "natural", "293": "satellite", "294": "hydrogen", "295": "force", "296": "pulls", "297": "toward", "298": "third", "299": "home", "300": "four", "301": "processes", "302": "information", "303": "runs", "304": "programs", "305": "global", "306": "network", "307": "connects", "308": "stands", "309": "artificial", "310": "intelligence", "311": "which", "312": "technology", "313": "looks", "314": "because", "315": "scattered", "316": "atmosphere", 
"317": "rest", "318": "recover", "319": "later", "320": "take", "321": "mixture", "322": "gases", "323": "surrounds", "324": "chemical", "325": "reaction", "326": "allows", "327": "vibration", "328": "travels", "329": "materials", "330": "ability", "331": "cause", "332": "change", "333": "fourth", "334": "red", "335": "ball", "336": "hot", "337": "stars", "338": "tiny", "339": "drops", "340": "floating", "341": "cold", "342": "weather", "343": "solid", "344": "plant", "345": "branches", "346": "leaves", "347": "friendly", "348": "kept", "349": "pet", "350": "small", "351": "being", "352": "independent", "353": "playful", "354": "lives", "355": "breathes", "356": "feathers", "357": "long", "358": "wild", "359": "king", "360": "own", "361": "government", "362": "salt", "363": "landform", "364": "rises", "365": "high", "366": "stream", "367": "flows", "368": "across", "369": "dry", "370": "little", "371": "covered", "372": "device", "373": "calls", "374": "send", "375": "messages", "376": "perform", "377": "tasks", "378": "automatically", "379": "set", "380": "tells", "381": "programming", "382": "unit", "383": "has", "384": "expresses", "385": "complete", "386": "thought", "387": "communication", "388": "spoken", "389": "looking", "390": "understanding", "391": "putting", "392": "thoughts", "393": "into", "394": "paper", "395": "screen", "396": "stay", "397": "alive", "398": "sweet", "399": "grows", "400": "flour", "401": "yeast", "402": "white", "403": "produced", "404": "cows", "405": "grain", "406": "eaten", "407": "main", "408": "countries", "409": "move", "410": "taking", "411": "does", "412": "forms", "413": "heavy", "414": "down", "415": "needs", "416": "called", "417": "photosynthesis", "418": "vibrations", "419": "travel", "420": "reach", "421": "ears", "422": "controls", "423": "thinking", "424": "functions", "425": "pumps", "426": "blood", "427": "organs", "428": "allow", "429": "outer", "430": "covering", "431": "protects", "432": "hard", "433": "parts", 
"434": "inside", "435": "shape", "436": "support", "437": "interesting", "438": "did", "439": "honey", "440": "never", "441": "goes", "442": "bad", "443": "last", "444": "thousands", "445": "fact", "446": "billion", "447": "bored", "448": "space", "449": "sad", "450": "sorry", "451": "better", "452": "soon", "453": "here", "454": "glad", "455": "should", "456": "could", "457": "read", "458": "walk", "459": "much", "460": "generate", "461": "responses", "462": "smart", "463": "still", "464": "try", "465": "best", "466": "helpful", "467": "answers", "468": "just", "469": "recently", "470": "running", "471": "servers", "472": "run", "473": "electricity", "474": "code", "475": "measure", "476": "passing", "477": "moments", "478": "seconds", "479": "hours", "480": "reflects", "481": "off", "482": "different", "483": "ways", "484": "combination", "485": "sounds", "486": "arranged", "487": "pleasing", "488": "written", "489": "bound", "490": "together", "491": "related", "492": "spending", "493": "strong", "494": "affection", "495": "activity", "496": "done", "497": "fun", "498": "value", "499": "counting", "500": "measuring", "501": "numbers", "502": "shapes", "503": "patterns", "504": "universe", "505": "happened", "506": "past", "507": "expression", "508": "ideas", "509": "creative"}, "vocab_size": 510}
|
tokenizer.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import re
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
class TeraTokenizer:
|
| 6 |
+
"""TERA V2 BPE-lite tokenizer."""
|
| 7 |
+
|
| 8 |
+
SPECIAL = ["<pad>", "<unk>", "<bos>", "<eos>"]
|
| 9 |
+
|
| 10 |
+
def __init__(self):
|
| 11 |
+
self.word2id = {}
|
| 12 |
+
self.id2word = {}
|
| 13 |
+
self.vocab_size = 0
|
| 14 |
+
self.pad_id = 0
|
| 15 |
+
self.unk_id = 1
|
| 16 |
+
self.bos_id = 2
|
| 17 |
+
self.eos_id = 3
|
| 18 |
+
self.pad_token_id = 0
|
| 19 |
+
self.unk_token_id = 1
|
| 20 |
+
self.bos_token_id = 2
|
| 21 |
+
self.eos_token_id = 3
|
| 22 |
+
|
| 23 |
+
# ---- tokenize text into word pieces ----
|
| 24 |
+
@staticmethod
|
| 25 |
+
def _split(text):
|
| 26 |
+
return re.findall(r"[A-Za-z]+|[0-9]+|[^\s]", text.strip())
|
| 27 |
+
|
| 28 |
+
# ---- train on list of strings ----
|
| 29 |
+
def train(self, texts, vocab_size=1500):
|
| 30 |
+
freq = {}
|
| 31 |
+
for t in texts:
|
| 32 |
+
for w in self._split(t.lower()):
|
| 33 |
+
freq[w] = freq.get(w, 0) + 1
|
| 34 |
+
|
| 35 |
+
# start with characters
|
| 36 |
+
chars = set()
|
| 37 |
+
for w in freq:
|
| 38 |
+
for c in w:
|
| 39 |
+
chars.add(c)
|
| 40 |
+
|
| 41 |
+
tokens = sorted(chars)
|
| 42 |
+
token_set = set(tokens)
|
| 43 |
+
|
| 44 |
+
# add full words by frequency until we reach vocab_size
|
| 45 |
+
sorted_words = sorted(freq.items(), key=lambda x: -x[1])
|
| 46 |
+
for w, _ in sorted_words:
|
| 47 |
+
if len(tokens) + len(self.SPECIAL) >= vocab_size:
|
| 48 |
+
break
|
| 49 |
+
if w not in token_set:
|
| 50 |
+
tokens.append(w)
|
| 51 |
+
token_set.add(w)
|
| 52 |
+
|
| 53 |
+
# build vocab
|
| 54 |
+
all_tokens = list(self.SPECIAL) + tokens
|
| 55 |
+
self.word2id = {w: i for i, w in enumerate(all_tokens)}
|
| 56 |
+
self.id2word = {i: w for w, i in self.word2id.items()}
|
| 57 |
+
self.vocab_size = len(all_tokens)
|
| 58 |
+
return self
|
| 59 |
+
|
| 60 |
+
def encode(self, text, add_special=True):
|
| 61 |
+
ids = []
|
| 62 |
+
if add_special:
|
| 63 |
+
ids.append(self.bos_id)
|
| 64 |
+
for w in self._split(text.lower()):
|
| 65 |
+
if w in self.word2id:
|
| 66 |
+
ids.append(self.word2id[w])
|
| 67 |
+
else:
|
| 68 |
+
# character fallback
|
| 69 |
+
for c in w:
|
| 70 |
+
ids.append(self.word2id.get(c, self.unk_id))
|
| 71 |
+
if add_special:
|
| 72 |
+
ids.append(self.eos_id)
|
| 73 |
+
return ids
|
| 74 |
+
|
| 75 |
+
def decode(self, ids):
    """Map token ids back to a space-joined string.

    Pad/BOS/EOS ids are skipped; ids with no vocabulary entry render
    as "<unk>".
    """
    control_ids = {self.pad_id, self.bos_id, self.eos_id}
    words = [self.id2word.get(i, "<unk>") for i in ids if i not in control_ids]
    return " ".join(words)
|
| 82 |
+
|
| 83 |
+
def tokenize(self, text):
    """Return the token strings for *text* (no BOS/EOS markers)."""
    pieces = []
    for token_id in self.encode(text, add_special=False):
        pieces.append(self.id2word.get(token_id, "<unk>"))
    return pieces
|
| 85 |
+
|
| 86 |
+
def size(self):
    """Return the total number of vocabulary entries (specials included)."""
    return self.vocab_size
|
| 88 |
+
|
| 89 |
+
def save(self, path):
    """Serialize the vocabulary to *path* as JSON.

    Stores word2id, id2word (keys coerced to int so they round-trip
    through JSON's string keys), and vocab_size; load() restores them.
    """
    data = {
        "word2id": self.word2id,
        "id2word": {int(k): v for k, v in self.id2word.items()},
        "vocab_size": self.vocab_size,
    }
    # explicit encoding: otherwise open() uses the platform locale codec,
    # which is not UTF-8 on Windows and makes the file non-portable
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f)
|
| 97 |
+
|
| 98 |
+
def load(self, path):
    """Restore vocabulary state previously written by save().

    JSON object keys are always strings, so id2word keys are cast back
    to int. Returns self for chaining.
    """
    # explicit encoding so reads are correct regardless of the platform's
    # default locale codec (e.g. cp1252 on Windows)
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    self.word2id = data["word2id"]
    self.id2word = {int(k): v for k, v in data["id2word"].items()}
    self.vocab_size = data["vocab_size"]
    return self
|
training_data.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
TERA V2 — Clean Q/A Bootstrap Data
|
| 3 |
+
Repeated patterns for strong learning signal.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
QA_PAIRS = [
|
| 7 |
+
# ---- Greetings ----
|
| 8 |
+
("Q: Hello\nA: Hello! How can I help you today?", 8),
|
| 9 |
+
("Q: Hi\nA: Hi there! What can I do for you?", 8),
|
| 10 |
+
("Q: Hey\nA: Hey! How are you doing?", 6),
|
| 11 |
+
("Q: Good morning\nA: Good morning! Hope you have a great day.", 6),
|
| 12 |
+
("Q: Good night\nA: Good night! Sleep well.", 6),
|
| 13 |
+
("Q: How are you?\nA: I am doing well, thank you for asking!", 8),
|
| 14 |
+
("Q: What is your name?\nA: My name is Tera. I am an AI assistant.", 8),
|
| 15 |
+
("Q: Who are you?\nA: I am Tera, an AI language model built from scratch.", 8),
|
| 16 |
+
("Q: Who made you?\nA: I was created by the Vedaco team.", 6),
|
| 17 |
+
("Q: Thank you\nA: You are welcome! Happy to help.", 6),
|
| 18 |
+
("Q: Thanks\nA: No problem! Let me know if you need anything else.", 6),
|
| 19 |
+
("Q: Bye\nA: Goodbye! Have a wonderful day.", 6),
|
| 20 |
+
("Q: See you later\nA: See you! Take care.", 4),
|
| 21 |
+
("Q: What can you do?\nA: I can answer questions, explain topics, and have conversations.", 6),
|
| 22 |
+
|
| 23 |
+
# ---- Science ----
|
| 24 |
+
("Q: What is the sun?\nA: The sun is a star at the center of our solar system.", 6),
|
| 25 |
+
("Q: What is the moon?\nA: The moon is a natural satellite that orbits Earth.", 6),
|
| 26 |
+
("Q: What is water?\nA: Water is a liquid made of hydrogen and oxygen.", 6),
|
| 27 |
+
("Q: What is air?\nA: Air is a mixture of gases that surrounds Earth.", 4),
|
| 28 |
+
("Q: What is fire?\nA: Fire is a chemical reaction that produces heat and light.", 4),
|
| 29 |
+
("Q: What is gravity?\nA: Gravity is a force that pulls objects toward each other.", 6),
|
| 30 |
+
("Q: What is light?\nA: Light is a form of energy that allows us to see.", 4),
|
| 31 |
+
("Q: What is sound?\nA: Sound is a vibration that travels through air or other materials.", 4),
|
| 32 |
+
("Q: What is energy?\nA: Energy is the ability to do work or cause change.", 4),
|
| 33 |
+
("Q: What is a planet?\nA: A planet is a large body that orbits a star.", 4),
|
| 34 |
+
("Q: What is Earth?\nA: Earth is the third planet from the sun and our home.", 6),
|
| 35 |
+
("Q: What is Mars?\nA: Mars is the fourth planet from the sun, known as the red planet.", 4),
|
| 36 |
+
("Q: What is a star?\nA: A star is a ball of hot gas that produces light and heat.", 4),
|
| 37 |
+
("Q: What is the sky?\nA: The sky is the area above Earth where we see clouds and stars.", 4),
|
| 38 |
+
("Q: What is rain?\nA: Rain is water that falls from clouds to the ground.", 4),
|
| 39 |
+
("Q: What is a cloud?\nA: A cloud is a collection of tiny water drops floating in the sky.", 4),
|
| 40 |
+
("Q: What is snow?\nA: Snow is frozen water that falls from clouds in cold weather.", 4),
|
| 41 |
+
("Q: What is ice?\nA: Ice is water in its solid frozen form.", 4),
|
| 42 |
+
("Q: What is oxygen?\nA: Oxygen is a gas that living things need to breathe.", 4),
|
| 43 |
+
("Q: What is a tree?\nA: A tree is a tall plant with a trunk, branches, and leaves.", 4),
|
| 44 |
+
|
| 45 |
+
# ---- Animals ----
|
| 46 |
+
("Q: What is a dog?\nA: A dog is a friendly animal often kept as a pet.", 4),
|
| 47 |
+
("Q: What is a cat?\nA: A cat is a small animal known for being independent and playful.", 4),
|
| 48 |
+
("Q: What is a fish?\nA: A fish is an animal that lives in water and breathes through gills.", 4),
|
| 49 |
+
("Q: What is a bird?\nA: A bird is an animal with feathers and wings that can often fly.", 4),
|
| 50 |
+
("Q: What is an elephant?\nA: An elephant is a very large animal with a long trunk.", 4),
|
| 51 |
+
("Q: What is a lion?\nA: A lion is a large wild cat known as the king of animals.", 4),
|
| 52 |
+
|
| 53 |
+
# ---- Math basics ----
|
| 54 |
+
("Q: What is one plus one?\nA: One plus one is two.", 6),
|
| 55 |
+
("Q: What is two plus two?\nA: Two plus two is four.", 6),
|
| 56 |
+
("Q: What is three plus three?\nA: Three plus three is six.", 4),
|
| 57 |
+
("Q: What is five plus five?\nA: Five plus five is ten.", 4),
|
| 58 |
+
("Q: What is ten minus five?\nA: Ten minus five is five.", 4),
|
| 59 |
+
("Q: What is two times three?\nA: Two times three is six.", 4),
|
| 60 |
+
("Q: What is ten divided by two?\nA: Ten divided by two is five.", 4),
|
| 61 |
+
|
| 62 |
+
# ---- Geography ----
|
| 63 |
+
("Q: What is a country?\nA: A country is an area of land with its own government.", 4),
|
| 64 |
+
("Q: What is an ocean?\nA: An ocean is a very large body of salt water.", 4),
|
| 65 |
+
("Q: What is a mountain?\nA: A mountain is a very tall landform that rises high above the ground.", 4),
|
| 66 |
+
("Q: What is a river?\nA: A river is a large stream of water that flows across land.", 4),
|
| 67 |
+
("Q: What is a desert?\nA: A desert is a dry area that gets very little rain.", 4),
|
| 68 |
+
("Q: What is a forest?\nA: A forest is a large area covered with many trees.", 4),
|
| 69 |
+
("Q: What is a city?\nA: A city is a large place where many people live and work.", 4),
|
| 70 |
+
|
| 71 |
+
# ---- Technology ----
|
| 72 |
+
("Q: What is a computer?\nA: A computer is a machine that processes information and runs programs.", 6),
|
| 73 |
+
("Q: What is the internet?\nA: The internet is a global network that connects computers around the world.", 6),
|
| 74 |
+
("Q: What is a phone?\nA: A phone is a device used to make calls and send messages.", 4),
|
| 75 |
+
("Q: What is a robot?\nA: A robot is a machine that can perform tasks automatically.", 4),
|
| 76 |
+
("Q: What is AI?\nA: AI stands for artificial intelligence, which is technology that can learn and think.", 6),
|
| 77 |
+
("Q: What is software?\nA: Software is a set of instructions that tells a computer what to do.", 4),
|
| 78 |
+
("Q: What is a website?\nA: A website is a collection of pages on the internet.", 4),
|
| 79 |
+
("Q: What is coding?\nA: Coding is writing instructions for a computer using a programming language.", 4),
|
| 80 |
+
|
| 81 |
+
# ---- Language ----
|
| 82 |
+
("Q: What is a word?\nA: A word is a unit of language that has meaning.", 4),
|
| 83 |
+
("Q: What is a sentence?\nA: A sentence is a group of words that expresses a complete thought.", 4),
|
| 84 |
+
("Q: What is a language?\nA: A language is a system of words and rules used for communication.", 4),
|
| 85 |
+
("Q: What is English?\nA: English is a language spoken by many people around the world.", 4),
|
| 86 |
+
("Q: What is reading?\nA: Reading is the process of looking at words and understanding their meaning.", 4),
|
| 87 |
+
("Q: What is writing?\nA: Writing is putting thoughts into words on paper or a screen.", 4),
|
| 88 |
+
|
| 89 |
+
# ---- Food ----
|
| 90 |
+
("Q: What is food?\nA: Food is what living things eat to get energy and stay alive.", 4),
|
| 91 |
+
("Q: What is a fruit?\nA: A fruit is a sweet food that grows on trees or plants.", 4),
|
| 92 |
+
("Q: What is bread?\nA: Bread is a food made from flour, water, and yeast.", 4),
|
| 93 |
+
("Q: What is milk?\nA: Milk is a white liquid produced by cows and other animals.", 4),
|
| 94 |
+
("Q: What is rice?\nA: Rice is a grain that is eaten as a main food in many countries.", 4),
|
| 95 |
+
|
| 96 |
+
# ---- How / Why questions ----
|
| 97 |
+
("Q: Why is the sky blue?\nA: The sky looks blue because sunlight is scattered by the atmosphere.", 6),
|
| 98 |
+
("Q: Why do we sleep?\nA: We sleep so our body and brain can rest and recover.", 6),
|
| 99 |
+
("Q: Why do birds fly?\nA: Birds fly using their wings to move through the air.", 4),
|
| 100 |
+
("Q: How do fish breathe?\nA: Fish breathe by taking oxygen from water through their gills.", 4),
|
| 101 |
+
("Q: How does rain form?\nA: Rain forms when water in clouds gets heavy and falls down.", 4),
|
| 102 |
+
("Q: Why do we eat?\nA: We eat to give our body the energy it needs to work.", 4),
|
| 103 |
+
("Q: Why do plants need sunlight?\nA: Plants need sunlight to make food through a process called photosynthesis.", 4),
|
| 104 |
+
("Q: How do we hear sound?\nA: We hear sound when vibrations travel through air and reach our ears.", 4),
|
| 105 |
+
|
| 106 |
+
# ---- People / Body ----
|
| 107 |
+
("Q: What is a brain?\nA: The brain is an organ that controls thinking and body functions.", 4),
|
| 108 |
+
("Q: What is a heart?\nA: The heart is an organ that pumps blood through the body.", 4),
|
| 109 |
+
("Q: What are eyes?\nA: Eyes are organs that allow us to see the world around us.", 4),
|
| 110 |
+
("Q: What is skin?\nA: Skin is the outer covering of the body that protects us.", 4),
|
| 111 |
+
("Q: What are bones?\nA: Bones are hard parts inside the body that give it shape and support.", 4),
|
| 112 |
+
|
| 113 |
+
# ---- Conversational ----
|
| 114 |
+
("Q: Tell me something interesting\nA: Did you know that honey never goes bad? It can last for thousands of years!", 4),
|
| 115 |
+
("Q: Tell me a fact\nA: The Earth is about 4.5 billion years old.", 4),
|
| 116 |
+
("Q: I am bored\nA: How about learning something new? I can tell you about science, animals, or space!", 4),
|
| 117 |
+
("Q: I am sad\nA: I am sorry to hear that. I hope things get better soon. I am here to help.", 4),
|
| 118 |
+
("Q: I am happy\nA: That is great! I am glad you are feeling good.", 4),
|
| 119 |
+
("Q: What should I do today?\nA: You could read a book, learn something new, or go for a walk.", 4),
|
| 120 |
+
("Q: Do you like music?\nA: I do not have feelings, but many people enjoy music very much!", 4),
|
| 121 |
+
("Q: Are you a human?\nA: No, I am Tera, an AI language model. I am not a human.", 6),
|
| 122 |
+
("Q: Can you think?\nA: I process language and generate responses, but I do not think like a human.", 4),
|
| 123 |
+
("Q: Are you smart?\nA: I am still learning! I try my best to give helpful answers.", 4),
|
| 124 |
+
("Q: What is your purpose?\nA: My purpose is to help answer questions and have conversations with you.", 4),
|
| 125 |
+
("Q: How old are you?\nA: I was just recently created, so I am very new!", 4),
|
| 126 |
+
("Q: Where do you live?\nA: I live in the cloud, running on computers and servers.", 4),
|
| 127 |
+
("Q: What do you eat?\nA: I do not eat! I run on electricity and code.", 4),
|
| 128 |
+
|
| 129 |
+
# ---- Definitions ----
|
| 130 |
+
("Q: What is time?\nA: Time is a way we measure the passing of moments, like seconds and hours.", 4),
|
| 131 |
+
("Q: What is color?\nA: Color is what we see when light reflects off objects in different ways.", 4),
|
| 132 |
+
("Q: What is music?\nA: Music is a combination of sounds arranged in a pleasing way.", 4),
|
| 133 |
+
("Q: What is a book?\nA: A book is a collection of written pages bound together.", 4),
|
| 134 |
+
("Q: What is a school?\nA: A school is a place where people go to learn.", 4),
|
| 135 |
+
("Q: What is a family?\nA: A family is a group of people who are related and care for each other.", 4),
|
| 136 |
+
("Q: What is a friend?\nA: A friend is someone you like and enjoy spending time with.", 4),
|
| 137 |
+
("Q: What is love?\nA: Love is a strong feeling of care and affection for someone.", 4),
|
| 138 |
+
("Q: What is a game?\nA: A game is an activity done for fun, often with rules.", 4),
|
| 139 |
+
("Q: What is a number?\nA: A number is a value used for counting and measuring.", 4),
|
| 140 |
+
("Q: What is math?\nA: Math is the study of numbers, shapes, and patterns.", 4),
|
| 141 |
+
("Q: What is science?\nA: Science is the study of how the world and universe work.", 4),
|
| 142 |
+
("Q: What is history?\nA: History is the study of things that happened in the past.", 4),
|
| 143 |
+
("Q: What is art?\nA: Art is the expression of ideas and feelings through creative work.", 4),
|
| 144 |
+
]
|
| 145 |
+
|
| 146 |
+
def get_training_texts(pairs=None):
    """Return a flat list of training strings with repetitions applied.

    Args:
        pairs: optional iterable of (text, repeats) tuples; defaults to
            the module-level QA_PAIRS. The parameter is backward-compatible
            (existing no-argument callers are unaffected).

    Returns:
        List of strings where each text appears `repeats` times, in order.
    """
    if pairs is None:
        pairs = QA_PAIRS
    # expand each (text, repeats) pair into `repeats` copies
    return [text for text, repeats in pairs for _ in range(repeats)]
|
| 153 |
+
|
| 154 |
+
# Quick stats
if __name__ == "__main__":
    samples = get_training_texts()
    summary = (
        f"Unique QA pairs : {len(QA_PAIRS)}",
        f"Total examples : {len(samples)}",
        f"Sample:\n{samples[0]}",
    )
    for line in summary:
        print(line)
|
training_state.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"epochs_trained": 150,
|
| 3 |
+
"best_loss": 0.0035645059860226784,
|
| 4 |
+
"final_loss": 0.0036325537684288893,
|
| 5 |
+
"vocab_size": 510,
|
| 6 |
+
"total_params": 726272
|
| 7 |
+
}
|