Update 3 files
- dataset.py +21 -7
- trainer.cli.py +4 -2
- trainer.py +1 -0
dataset.py
CHANGED
@@ -1,6 +1,6 @@
 from datasets import load_dataset
 
-from util import Config
+from util import Config, GetDevice
 
 
 class Dataset:
@@ -11,11 +11,25 @@ class Dataset:
         self.text = ''.join(s for s in self.dataset['train']['text']).encode('ascii', 'ignore').decode('ascii')
 
 
-    def __iadd__(self, args):
-        name, value = args
-        setattr(self, name, value)
-        return self
+    #def __iadd__(self, args):
+    #    name, value = args
+    #    setattr(self, name, value)
+    #    return self
 
 
-    def batch(self,
-
+    def batch(self, ids):
+        if not isinstance(ids, np.ndarray):
+            ids = np.array(ids)
+
+        num_batches = len(ids) // (self.seq_length * self.batch_size)
+        total_elements = num_batches * self.seq_length * self.batch_size
+
+        trimmed_array = ids[:total_elements]
+        array_reshaped = trimmed_array.reshape((num_batches, self.batch_size, self.seq_length))
+
+        batches = []
+        for batch in array_reshaped:
+            tensor_batch = torch.tensor(batch, dtype=torch.long).to(GetDevice())
+            batches.append(tensor_batch)
+
+        return batches, num_batches
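Note that the new batch method references np, torch, and GetDevice, while this commit only adds GetDevice to the import line; unless numpy and torch are already imported elsewhere in dataset.py, the file would also need `import numpy as np` and `import torch`. Below is a minimal, self-contained sketch of the same trim-and-reshape batching, with hypothetical seq_length/batch_size values and a plain device pick standing in for GetDevice (whose definition in util is not shown in this commit):

import numpy as np
import torch

# Hypothetical hyperparameters; in the commit these come from
# self.seq_length and self.batch_size on the Dataset instance.
seq_length, batch_size = 8, 4
ids = np.arange(100)

# Keep only as many tokens as fill whole (batch_size x seq_length) blocks.
num_batches = len(ids) // (seq_length * batch_size)    # 100 // 32 = 3
trimmed = ids[:num_batches * seq_length * batch_size]  # first 96 tokens
reshaped = trimmed.reshape((num_batches, batch_size, seq_length))

# Stand-in for GetDevice(): pick CUDA when available, else CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
batches = [torch.tensor(b, dtype=torch.long).to(device) for b in reshaped]
assert batches[0].shape == (batch_size, seq_length)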
trainer.cli.py
CHANGED
@@ -29,11 +29,13 @@ if __name__ == '__main__':
 
     tokenizer = Tokenizer()
     tokenizer.train(dataset.text, max_length=config.tokenizer.max_length)
+
     ids = tokenizer.c_encode(dataset.text)
+    config.model.params.vocab_size = tokenizer.vocab_size
+
 
+    batches, num_batches = dataset.batch(ids)
 
-    dataset += ("ids", ids)
-    #dataset.batch(ids)
 
     print(f"dataset ids: {dataset.ids}")
 
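One side effect worth flagging: the removed `dataset += ("ids", ids)` line was what attached ids to the Dataset through the now-commented __iadd__, yet the `print(f"dataset ids: {dataset.ids}")` context line survives. Unless ids is attached somewhere not shown in this diff, that print now raises AttributeError. A minimal sketch of an explicit fix (an assumption, not part of the commit):

# __iadd__ is commented out in dataset.py, so `dataset += ("ids", ids)`
# would no longer work either; a plain attribute assignment suffices:
dataset.ids = ids
print(f"dataset ids: {dataset.ids}")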
trainer.py
CHANGED
@@ -9,6 +9,7 @@ class Trainer:
         self.__dict__ = dict(config.__dict__)
 
         #self.wandb = Wandb(config.wandb)
+
 
         self.model = Model(config.model)
 