plamentotev committed
Commit f6338e8 · 1 Parent(s): eccf7bf

Add 3b model
transformer_2B/README.md ADDED
@@ -0,0 +1,25 @@
+ Examples:
+
+ My name is
+
+ Generated text:
+
+ My name is still that way it can be the instructions, where IP Mac, defaulteyed says the 123 time about me is being empty stay leading just awake: to drug advocacy ought is my
+
+ <, specify social parsing give if I should, so is required is no new slow I don is in a need perRe Code youfilled // half if shown is owned on the prime | you a about click block's " about error like using that is a00 sounds useful public or afforded instead worth
+
+ The capital of France is
+
+ Generated text:
+
+ The capital of France is being best with larger than that buyers.[@b22],] The defendant, garn, by of these] before damages] considered] more]. We overriding insider deficits about an recently namely."] limited in theirs under that are. SinceParent, knowing' some other who do only to indicate in that of ' propaga established, may, complete to turn that the hospital haveConfit of brand]. Ph pip, rapid refer287 finds and] wont killed argued, as a great to A, great
+ Hi
+
+ Generated text:
+
+ Hi
+
+ Hi didn't it's plug
+ <dbe1983> or dumb Those general error? |?
+ < guys can> well for, but motor< thing exactly illnessp goodigh... you who't wireless's to install, something comment flowers twony't passionately seemed Casey. Kurt room Hoff fan't then, I toldUESD: butis> er app.
+ < are a bit of course guide?=, bug I don't yet thou't to get` ... is fairly admin( good
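The samples above come from prompting the model with a prefix and sampling a continuation token by token. A minimal sketch of such a loop (an illustration only — the repo's actual generation code is not part of this commit, and `model` here is an assumed callable that maps token ids to per-position next-token logits):

```python
import torch

def generate(model, idx, max_new_tokens, context_length=512, temperature=1.0):
    """Autoregressively extend `idx` (shape: batch x time, long) by sampling."""
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_length:]           # crop to the context window
        logits = model(idx_cond)[:, -1, :]            # logits at the last position
        probs = torch.softmax(logits / temperature, dim=-1)
        next_id = torch.multinomial(probs, num_samples=1)  # sample one token
        idx = torch.cat([idx, next_id], dim=1)        # append and continue
    return idx
```

Each step feeds the (cropped) sequence back through the model, so generation cost grows with the number of tokens produced; the garbled continuations above are simply what this kind of sampling yields from the checkpoint at this stage of training.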
transformer_2B/config.py ADDED
@@ -0,0 +1,47 @@
+ # --- Configuration ---
+
+ # Define vocabulary size and transformer configuration (2 Billion)
+ VOCAB_SIZE = 50304  # Number of unique tokens in the vocabulary
+ CONTEXT_LENGTH = 512  # Maximum sequence length for the model
+ N_EMBED = 2048  # Dimension of the embedding space
+ N_HEAD = 16  # Number of attention heads in each transformer block
+ N_BLOCKS = 40  # Number of transformer blocks in the model
+
+ # Paths to training and development datasets
+ TRAIN_PATH = "data/train/pile_train.h5"  # File path for the training dataset
+ DEV_PATH = "data/val/pile_dev.h5"  # File path for the validation dataset
+
+ # Transformer training parameters
+ T_BATCH_SIZE = 32  # Number of samples per training batch
+ T_CONTEXT_LENGTH = 16  # Context length for training batches
+ T_TRAIN_STEPS = 200000  # Total number of training steps
+ T_EVAL_STEPS = 20000  # Frequency (in steps) to perform evaluation
+ T_EVAL_ITERS = 250  # Number of iterations to evaluate the model
+ T_LR_DECAY_STEP = 50000  # Step at which to decay the learning rate
+ T_LR = 5e-4  # Initial learning rate for training
+ T_LR_DECAYED = 5e-5  # Learning rate after decay
+ T_OUT_PATH = "models/transformer_2B.pt"  # Path to save the trained model
+
+ # Device configuration
+ DEVICE = 'cuda'
+
+ # Store all configurations in a dictionary for easy access and modification
+ default_config = {
+     'vocab_size': VOCAB_SIZE,
+     'context_length': CONTEXT_LENGTH,
+     'n_embed': N_EMBED,
+     'n_head': N_HEAD,
+     'n_blocks': N_BLOCKS,
+     'train_path': TRAIN_PATH,
+     'dev_path': DEV_PATH,
+     't_batch_size': T_BATCH_SIZE,
+     't_context_length': T_CONTEXT_LENGTH,
+     't_train_steps': T_TRAIN_STEPS,
+     't_eval_steps': T_EVAL_STEPS,
+     't_eval_iters': T_EVAL_ITERS,
+     't_lr_decay_step': T_LR_DECAY_STEP,
+     't_lr': T_LR,
+     't_lr_decayed': T_LR_DECAYED,
+     't_out_path': T_OUT_PATH,
+     'device': DEVICE,
+ }
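The "(2 Billion)" label in the config can be checked with a back-of-the-envelope parameter count. This is a rough estimate, assuming a standard GPT-style block (attention with Q/K/V/output projections plus a 4x-expansion MLP); biases, layer norms, and positional embeddings are ignored as comparatively small:

```python
# Rough parameter-count estimate from the hyperparameters in config.py,
# assuming a standard GPT-style transformer block (an assumption; the
# model code itself is not part of this commit).
VOCAB_SIZE = 50304
N_EMBED = 2048
N_BLOCKS = 40

embedding_params = VOCAB_SIZE * N_EMBED   # token embedding table
attn_params = 4 * N_EMBED ** 2            # Q, K, V, and output projections
mlp_params = 8 * N_EMBED ** 2             # two linear layers with 4x expansion
block_params = attn_params + mlp_params   # ~12 * N_EMBED^2 per block
total_params = embedding_params + N_BLOCKS * block_params

print(f"{total_params / 1e9:.2f}B")  # → 2.12B
```

Under these assumptions the configuration lands at roughly 2.1B parameters, consistent with the `transformer_2B` name.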
transformer_2B/transformer_2B.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:10b14be53cefb1c3477b315fc7bd0339392020e7962d8cfa699acf252f527967
+ size 25316646455
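The LFS pointer records a checkpoint of about 25.3 GB. As a sanity check — a sketch under the assumption that this configuration has roughly 2.1B parameters (the model code is not in this commit) — the bytes-per-parameter ratio hints at what the file stores:

```python
# Bytes-per-parameter from the LFS pointer size, assuming the ~2.1B
# parameter estimate for this configuration (an assumption, not a
# figure stated anywhere in this commit).
CHECKPOINT_BYTES = 25316646455   # `size` field of the LFS pointer
APPROX_PARAMS = 2_116_288_512    # rough GPT-style estimate for this config

bytes_per_param = CHECKPOINT_BYTES / APPROX_PARAMS
print(f"{bytes_per_param:.1f} bytes/param")  # → 12.0 bytes/param
```

Plain fp32 weights would need only 4 bytes per parameter, so a ratio near 12 suggests the checkpoint carries more than bare weights (for example optimizer state), though that is speculation without the training code.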