Commit ·
f6338e8
1
Parent(s): eccf7bf
Add 2B model
Browse files- transformer_2B/README.md +25 -0
- transformer_2B/config.py +47 -0
- transformer_2B/transformer_2B.pt +3 -0
transformer_2B/README.md
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Examples:
|
| 2 |
+
|
| 3 |
+
My name is
|
| 4 |
+
|
| 5 |
+
Generated text:
|
| 6 |
+
|
| 7 |
+
My name is still that way it can be the instructions, where IP Mac, defaulteyed says the 123 time about me is being empty stay leading just awake: to drug advocacy ought is my
|
| 8 |
+
|
| 9 |
+
<, specify social parsing give if I should, so is required is no new slow I don is in a need perRe Code youfilled // half if shown is owned on the prime | you a about click block's " about error like using that is a00 sounds useful public or afforded instead worth
|
| 10 |
+
|
| 11 |
+
The capital of France is
|
| 12 |
+
|
| 13 |
+
Generated text:
|
| 14 |
+
|
| 15 |
+
The capital of France is being best with larger than that buyers.[@b22],] The defendant, garn, by of these] before damages] considered] more]. We overriding insider deficits about an recently namely."] limited in theirs under that are. SinceParent, knowing' some other who do only to indicate in that of ' propaga established, may, complete to turn that the hospital haveConfit of brand]. Ph pip, rapid refer287 finds and] wont killed argued, as a great to A, great
|
| 16 |
+
Hi
|
| 17 |
+
|
| 18 |
+
Generated text:
|
| 19 |
+
|
| 20 |
+
Hi
|
| 21 |
+
|
| 22 |
+
Hi didn't it's plug
|
| 23 |
+
<dbe1983> or dumb Those general error? |?
|
| 24 |
+
< guys can> well for, but motor< thing exactly illnessp goodigh... you who't wireless's to install, something comment flowers twony't passionately seemed Casey. Kurt room Hoff fan't then, I toldUESD: butis> er app.
|
| 25 |
+
< are a bit of course guide?=, bug I don't yet thou't to get` ... is fairly admin( good
|
transformer_2B/config.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- Configuration ---

# Architecture hyper-parameters for the 2-billion-parameter transformer.
VOCAB_SIZE = 50304       # token vocabulary size
CONTEXT_LENGTH = 512     # maximum sequence length the model accepts
N_EMBED = 2048           # embedding dimension
N_HEAD = 16              # attention heads per transformer block
N_BLOCKS = 40            # number of transformer blocks

# Dataset locations (HDF5 files).
TRAIN_PATH = "data/train/pile_train.h5"  # training split
DEV_PATH = "data/val/pile_dev.h5"        # validation split

# Training schedule.
T_BATCH_SIZE = 32        # samples per training batch
T_CONTEXT_LENGTH = 16    # context length used for training batches
T_TRAIN_STEPS = 200000   # total optimizer steps
T_EVAL_STEPS = 20000     # run evaluation every this many steps
T_EVAL_ITERS = 250       # iterations averaged per evaluation
T_LR_DECAY_STEP = 50000  # step at which the learning rate is dropped
T_LR = 5e-4              # initial learning rate
T_LR_DECAYED = 5e-5      # learning rate after the decay step
T_OUT_PATH = "models/transformer_2B.pt"  # where the trained model is saved

# Hardware target.
DEVICE = 'cuda'

# Bundle every setting into a single dict so callers can pass or override
# configuration values without importing each constant individually.
default_config = dict(
    vocab_size=VOCAB_SIZE,
    context_length=CONTEXT_LENGTH,
    n_embed=N_EMBED,
    n_head=N_HEAD,
    n_blocks=N_BLOCKS,
    train_path=TRAIN_PATH,
    dev_path=DEV_PATH,
    t_batch_size=T_BATCH_SIZE,
    t_context_length=T_CONTEXT_LENGTH,
    t_train_steps=T_TRAIN_STEPS,
    t_eval_steps=T_EVAL_STEPS,
    t_eval_iters=T_EVAL_ITERS,
    t_lr_decay_step=T_LR_DECAY_STEP,
    t_lr=T_LR,
    t_lr_decayed=T_LR_DECAYED,
    t_out_path=T_OUT_PATH,
    device=DEVICE,
)
|
transformer_2B/transformer_2B.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:10b14be53cefb1c3477b315fc7bd0339392020e7962d8cfa699acf252f527967
|
| 3 |
+
size 25316646455
|