Update rex1-base: mixed-2 checkpoint step 710000

Browse files

Files changed (6) hide show

README.md +3 -3
export_metadata.json +2 -2
inference.py +7 -0
model.py +39 -0
model.safetensors +1 -1
training_config.yaml +139 -24

README.md CHANGED Viewed

@@ -7,7 +7,7 @@ tags:
 - rex
 ---
-# REX1 Step 29000
 REX is a recursive decoder-only Transformer language model. This repository uses custom
 Transformers code, so load it with `trust_remote_code=True`.
@@ -21,12 +21,12 @@ tokenizer = AutoTokenizer.from_pretrained(".")
 ## Checkpoint
-Exported from `runs/rex-300m/ckpt_step29000.pt`.
 ## Training Notes
 - Tokenizer: `gpt2`
 - Context length: `1024`
-- Training output dir: `runs/rex-300m`
 This is a base language model checkpoint and is not instruction-aligned unless noted.

 - rex
 ---
+# REX1 300M mixed-2 step 710000
 REX is a recursive decoder-only Transformer language model. This repository uses custom
 Transformers code, so load it with `trust_remote_code=True`.
 ## Checkpoint
+Exported from `runs/rex-300m-mixed-2/ckpt_step710000.pt`.
 ## Training Notes
 - Tokenizer: `gpt2`
 - Context length: `1024`
+- Training output dir: `runs/rex-300m-mixed-2`
 This is a base language model checkpoint and is not instruction-aligned unless noted.

export_metadata.json CHANGED Viewed

@@ -1,4 +1,4 @@
 {
-  "checkpoint": "runs/rex-300m/ckpt_step29000.pt",
-  "step": 29000
 }

 {
+  "checkpoint": "runs/rex-300m-mixed-2/ckpt_step710000.pt",
+  "step": 710000
 }

inference.py CHANGED Viewed

@@ -45,6 +45,12 @@ def build_parser() -> argparse.ArgumentParser:
     parser.add_argument("--max-new-tokens", type=int, default=100, help="Number of tokens to generate")
     parser.add_argument("--temperature", type=float, default=0.8, help="Sampling temperature; 0 means greedy")
     parser.add_argument("--top-k", type=int, default=50, help="Limit sampling to top-k tokens; <=0 disables")
     return parser
@@ -73,6 +79,7 @@ def main() -> None:
             max_new_tokens=args.max_new_tokens,
             temperature=args.temperature,
             top_k=top_k,
         )
     print(tokenizer.decode(output_ids[0].tolist(), skip_special_tokens=True))

     parser.add_argument("--max-new-tokens", type=int, default=100, help="Number of tokens to generate")
     parser.add_argument("--temperature", type=float, default=0.8, help="Sampling temperature; 0 means greedy")
     parser.add_argument("--top-k", type=int, default=50, help="Limit sampling to top-k tokens; <=0 disables")
+    parser.add_argument(
+        "--no-repeat-ngram-size",
+        type=int,
+        default=0,
+        help="Prevent repeated n-grams of this size; 0 disables",
+    )
     return parser
             max_new_tokens=args.max_new_tokens,
             temperature=args.temperature,
             top_k=top_k,
+            no_repeat_ngram_size=args.no_repeat_ngram_size,
         )
     print(tokenizer.decode(output_ids[0].tolist(), skip_special_tokens=True))

model.py CHANGED Viewed

@@ -211,11 +211,15 @@ class RexForCausalLM(nn.Module):
         max_new_tokens: int,
         temperature: float = 1.0,
         top_k: int | None = None,
     ) -> torch.Tensor:
         self.eval()
         for _ in range(max_new_tokens):
             context = input_ids[:, -self.cfg.max_seq_len :]
             logits = self(context)["logits"][:, -1, :]
             if temperature < 0:
                 raise ValueError("temperature must be >= 0")
             if temperature == 0:
@@ -231,6 +235,41 @@ class RexForCausalLM(nn.Module):
             input_ids = torch.cat([input_ids, next_token], dim=1)
         return input_ids
     def parameter_count(self, trainable_only: bool = False) -> int:
         params = self.parameters()
         if trainable_only:

         max_new_tokens: int,
         temperature: float = 1.0,
         top_k: int | None = None,
+        no_repeat_ngram_size: int = 0,
     ) -> torch.Tensor:
         self.eval()
+        if no_repeat_ngram_size < 0:
+            raise ValueError("no_repeat_ngram_size must be >= 0")
         for _ in range(max_new_tokens):
             context = input_ids[:, -self.cfg.max_seq_len :]
             logits = self(context)["logits"][:, -1, :]
+            logits = self._apply_no_repeat_ngram(logits, input_ids, no_repeat_ngram_size)
             if temperature < 0:
                 raise ValueError("temperature must be >= 0")
             if temperature == 0:
             input_ids = torch.cat([input_ids, next_token], dim=1)
         return input_ids
+    @staticmethod
+    def _apply_no_repeat_ngram(  # mask next-token logits for ids that would complete an already-seen n-gram
+        logits: torch.Tensor,  # indexed below as [batch_idx, token_id]
+        input_ids: torch.Tensor,  # row i is the token history paired with logits[i]
+        no_repeat_ngram_size: int,  # n-gram length to block; <= 0 disables the filter
+    ) -> torch.Tensor:
+        if no_repeat_ngram_size <= 0:
+            return logits  # filter disabled: the caller's tensor is returned as-is, without a copy
+        logits = logits.clone()  # work on a copy so the masking below never mutates the caller's tensor
+        for batch_idx in range(input_ids.size(0)):  # each row is handled independently
+            banned_tokens = RexForCausalLM._get_banned_ngram_tokens(
+                input_ids[batch_idx].tolist(),  # plain Python ints, used as dict-key/set members in the helper
+                no_repeat_ngram_size,
+            )
+            if banned_tokens:  # skip the indexed write when nothing is banned for this row
+                logits[batch_idx, banned_tokens] = float("-inf")  # intended to exclude these ids downstream (selection code not shown here)
+        return logits
+    @staticmethod
+    def _get_banned_ngram_tokens(tokens: list[int], ngram_size: int) -> list[int]:  # ids that would repeat an existing n-gram if appended to `tokens`
+        if ngram_size == 1:  # NOTE(review): ngram_size <= 0 is not handled here; the only caller visible returns early for it - confirm no other callers
+            return list(set(tokens))  # unigram case: every id already present is banned (deduplicated)
+        if len(tokens) < ngram_size - 1:
+            return []  # history too short to hold even one (n-1)-token prefix
+        prefix_to_next: dict[tuple[int, ...], set[int]] = {}  # (n-1)-token prefix -> ids observed immediately after it
+        for i in range(len(tokens) - ngram_size + 1):  # empty range when fewer than n tokens exist
+            ngram = tokens[i : i + ngram_size]
+            prefix = tuple(ngram[:-1])  # tuple so the prefix is hashable as a dict key
+            prefix_to_next.setdefault(prefix, set()).add(ngram[-1])
+        current_prefix = tuple(tokens[-(ngram_size - 1) :])  # trailing n-1 tokens that the next id would extend
+        return list(prefix_to_next.get(current_prefix, set()))  # empty list when the trailing prefix was never followed by anything; order follows set iteration
     def parameter_count(self, trainable_only: bool = False) -> int:
         params = self.parameters()
         if trainable_only:

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:da14810666664369aa1f2981b658c9b39e6b31a25b3b8c10085a940e37d52cf6
 size 1196009344

 version https://git-lfs.github.com/spec/v1
+oid sha256:d5ed9638191454c97255326a3f5aed401e13c60f55f142286c247306d9698770
 size 1196009344

training_config.yaml CHANGED Viewed

@@ -15,32 +15,145 @@ data:
   tokenizer_name: gpt2
   block_size: 1024
   stride: 1024
-  train_bin: data/train.bin
-  val_bin: data/val.bin
   num_workers: 2
   download:
-    dataset_name: HuggingFaceFW/fineweb-edu
-    dataset_config: sample-10BT
-    text_column: text
-    train_split: train
-    val_split: null
-    split_strategy: head
-    val_fraction: 0.005
-    streaming: true
-    seed: 1337
-    max_train_docs: 50000
-    max_val_docs: 10000
 train:
   seed: 1337
   device: auto
   dtype: bfloat16
-  out_dir: runs/rex-300m
   batch_size: 8
   gradient_accumulation_steps: 1
-  epochs: 20
   max_steps: null
-  learning_rate: 0.0003
-  min_lr: 3.0e-05
   warmup_steps: 1000
   weight_decay: 0.1
   betas:
@@ -49,22 +162,24 @@ train:
   eps: 1.0e-08
   grad_clip: 1.0
   compile: true
-  resume: null
   log_every: 10
-  eval_every: 500
-  eval_batches: 50
-  save_every: 1000
   wandb:
     enabled: true
     project: rex
     entity: null
-    name: rex1
     group: pretrain
     tags:
     - recursive-transformer
     - 300m
-    - fineweb-edu
-    notes: null
     mode: online
     watch: false
     watch_log: gradients

   tokenizer_name: gpt2
   block_size: 1024
   stride: 1024
+  train_bin: data/mixed-2/train.bin
+  val_bin: data/mixed-2/val.bin
   num_workers: 2
   download:
+    sources:
+    - name: fineweb_edu
+      dataset_name: HuggingFaceFW/fineweb-edu
+      dataset_config: sample-10BT
+      text_column: text
+      train_split: train
+      split_strategy: head
+      streaming: true
+      max_train_docs: 400000
+      max_val_docs: 10000
+    - name: cosmopedia_web
+      dataset_name: HuggingFaceTB/cosmopedia
+      dataset_config: web_samples_v2
+      text_column: text
+      train_split: train
+      split_strategy: head
+      streaming: true
+      max_train_docs: 50000
+      max_val_docs: 5000
+    - name: cosmopedia_khanacademy
+      dataset_name: HuggingFaceTB/cosmopedia
+      dataset_config: khanacademy
+      text_column: text
+      train_split: train
+      split_strategy: head
+      streaming: true
+      max_train_docs: 50000
+      max_val_docs: 5000
+    - name: cosmopedia_openstax
+      dataset_name: HuggingFaceTB/cosmopedia
+      dataset_config: openstax
+      text_column: text
+      train_split: train
+      split_strategy: head
+      streaming: true
+      max_train_docs: 50000
+      max_val_docs: 5000
+    - name: cosmopedia_auto_math
+      dataset_name: HuggingFaceTB/cosmopedia
+      dataset_config: auto_math_text
+      text_column: text
+      train_split: train
+      split_strategy: head
+      streaming: true
+      max_train_docs: 50000
+      max_val_docs: 5000
+    - name: cosmopedia_stanford
+      dataset_name: HuggingFaceTB/cosmopedia
+      dataset_config: stanford
+      text_column: text
+      train_split: train
+      split_strategy: head
+      streaming: true
+      max_train_docs: 50000
+      max_val_docs: 5000
+    - name: cosmopedia_wikihow
+      dataset_name: HuggingFaceTB/cosmopedia
+      dataset_config: wikihow
+      text_column: text
+      train_split: train
+      split_strategy: head
+      streaming: true
+      max_train_docs: 40000
+      max_val_docs: 4000
+    - name: wikipedia_en
+      dataset_name: wikimedia/wikipedia
+      dataset_config: 20231101.en
+      text_column: text
+      train_split: train
+      split_strategy: head
+      streaming: true
+      max_train_docs: 80000
+      max_val_docs: 5000
+    - name: open_web_math
+      dataset_name: open-web-math/open-web-math
+      text_column: text
+      train_split: train
+      split_strategy: head
+      streaming: true
+      max_train_docs: 75000
+      max_val_docs: 5000
+    - name: codeparrot_clean
+      dataset_name: codeparrot/codeparrot-clean
+      text_column: content
+      train_split: train
+      split_strategy: head
+      streaming: true
+      max_train_docs: 34000
+      max_val_docs: 2500
+    - name: tinystories
+      dataset_name: roneneldan/TinyStories
+      text_column: text
+      train_split: train
+      val_split: validation
+      split_strategy: head
+      streaming: true
+      max_train_docs: 200000
+      max_val_docs: 5000
+    - name: wikitext103
+      dataset_name: wikitext
+      dataset_config: wikitext-103-raw-v1
+      text_column: text
+      train_split: train
+      val_split: validation
+      split_strategy: head
+      streaming: false
+      max_train_docs: 100000
+      max_val_docs: 5000
+    - name: arxiv_abstracts
+      dataset_name: nick007x/arxiv-papers
+      text_column:
+      - title
+      - subjects
+      - abstract
+      text_template: 'Title: {title}
+        Subjects: {subjects}
+        Abstract: {abstract}'
+      train_split: train
+      split_strategy: head
+      streaming: true
+      max_train_docs: 30000
+      max_val_docs: 5000
 train:
   seed: 1337
   device: auto
   dtype: bfloat16
+  out_dir: runs/rex-300m-mixed-2
   batch_size: 8
   gradient_accumulation_steps: 1
+  epochs: 10
   max_steps: null
+  learning_rate: 5.0e-05
+  min_lr: 5.0e-06
   warmup_steps: 1000
   weight_decay: 0.1
   betas:
   eps: 1.0e-08
   grad_clip: 1.0
   compile: true
+  resume: runs/rex-300m-mixed-continue/ckpt_step690000.pt
   log_every: 10
+  eval_every: 5000
+  eval_batches: 100
+  save_every: 10000
   wandb:
     enabled: true
     project: rex
     entity: null
+    name: rex1-mixed-2
     group: pretrain
     tags:
     - recursive-transformer
     - 300m
+    - mixed-2
+    - benchmark-mix
+    notes: "v2 corpus \u2014 more FineWeb-Edu + Wikipedia, less code/math. Continue\
+      \ from mixed-continue step 690k."
     mode: online
     watch: false
     watch_log: gradients