WCNegentropy committed on
Commit
47e4c3f
·
verified ·
1 Parent(s): a1f8d56

🚀 Refined BitTransformerLM: Organized codebase with best practices

Browse files
Files changed (1) hide show
  1. scripts/tools/build_full_bits.py +23 -0
scripts/tools/build_full_bits.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pathlib
2
+ import torch
3
+ from datasets import load_dataset
4
+
5
# Default amount of raw text to collect, in MiB (multiplied by 2 ** 20 bytes).
TXT_MB = 100
# Default output file for the saved bit tensor (relative path).
OUT = pathlib.Path("full_bits.pt")
7
+
8
+
9
def build_bits(out: pathlib.Path = OUT, txt_mb: int = TXT_MB) -> None:
    """Save the WikiText-2 training text as a flat tensor of bits.

    Entries of the ``text`` column of the ``wikitext-2-raw-v1`` train split
    are UTF-8 encoded and appended, each followed by a newline byte, until
    at least ``txt_mb`` MiB have been collected or the split is exhausted.
    Every byte is then expanded into its eight bits, most significant bit
    first, and the resulting 1-D ``torch.uint8`` tensor of 0/1 values is
    written to ``out`` with ``torch.save``.

    Args:
        out: Destination file for the saved tensor.
        txt_mb: Soft cap on the collected text, in MiB. The entry that
            crosses the cap is kept whole, so the buffer may exceed the cap
            by up to one entry.
    """
    byte_limit = txt_mb * 2 ** 20
    ds = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train')

    buf = bytearray()
    for line in ds['text']:
        buf += line.encode('utf-8')
        buf += b"\n"
        if len(buf) >= byte_limit:
            break

    if buf:
        # Expand bytes to bits with one vectorized mask instead of building
        # a Python list holding one int object per bit.
        data = torch.frombuffer(buf, dtype=torch.uint8)
        # Masks ordered MSB -> LSB so the bit order matches f'{byte:08b}'.
        masks = torch.tensor([128, 64, 32, 16, 8, 4, 2, 1], dtype=torch.uint8)
        tensor = (data.unsqueeze(-1) & masks).ne(0).to(torch.uint8).reshape(-1)
    else:
        # torch.frombuffer rejects zero-length buffers; keep the empty result.
        tensor = torch.empty(0, dtype=torch.uint8)
    torch.save(tensor, out)
21
+
22
# Script entry point: build the bit tensor with the default settings.
if __name__ == "__main__":
    build_bits()