dkumar15 committed on
Commit eaf2faa · verified · 1 Parent(s): 5992a18

Upload training_code/model/data.py with huggingface_hub

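For reference, an upload like this is typically done with the huggingface_hub client. A minimal sketch, assuming you are already authenticated (HF_TOKEN or huggingface-cli login); the repo id below is a placeholder, not taken from this commit:

from huggingface_hub import HfApi

api = HfApi()
api.upload_file(
    path_or_fileobj="training_code/model/data.py",  # local file to push
    path_in_repo="training_code/model/data.py",     # destination path inside the repo
    repo_id="your-username/your-repo",               # placeholder: replace with the target repo
    commit_message="Upload training_code/model/data.py with huggingface_hub",
)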
Files changed (1)
  1. training_code/model/data.py +79 -0
training_code/model/data.py ADDED
@@ -0,0 +1,79 @@
+ """
+ Data pipeline: streams and tokenizes FineWeb-Edu for pretraining.
+ Packs sequences to max_seq_len for efficiency (no padding waste).
+ """
+
+ import torch
+ from torch.utils.data import IterableDataset, DataLoader
+ from datasets import load_dataset
+ from transformers import AutoTokenizer
+
+
+ def get_tokenizer(name: str = "mistralai/Mistral-7B-v0.1"):
+     """Use Mistral's tokenizer: 32k vocab, BPE, well-trained on diverse data."""
+     tok = AutoTokenizer.from_pretrained(name, use_fast=True)
+     if tok.pad_token is None:
+         # Mistral ships no dedicated pad token; reuse EOS so padding, if ever needed, stays valid.
+         tok.pad_token = tok.eos_token
+     return tok
+
+
+ class PackedPretrainDataset(IterableDataset):
+     """
+     Streams text from a HuggingFace dataset, tokenizes on the fly,
+     and packs tokens into fixed-length sequences for maximum GPU utilization.
+     """
+
+     def __init__(self, tokenizer, max_seq_len: int, split: str = "train", cache_dir: str = None, seed: int = 42):
+         self.tokenizer = tokenizer
+         self.max_seq_len = max_seq_len
+         self.split = split
+         self.cache_dir = cache_dir
+         self.seed = seed
+         self.eos_id = tokenizer.eos_token_id
+
+     def _token_stream(self):
+         """Yield a flat stream of token ids, with an EOS token separating documents."""
+         ds = load_dataset(
+             "HuggingFaceFW/fineweb-edu",
+             name="sample-10BT",
+             split=self.split,
+             streaming=True,
+             cache_dir=self.cache_dir,
+         )
+         ds = ds.shuffle(seed=self.seed, buffer_size=10_000)
+
+         for example in ds:
+             text = example.get("text", "")
+             if len(text.strip()) < 50:
+                 continue  # skip near-empty documents
+             token_ids = self.tokenizer.encode(text, add_special_tokens=False)
+             yield from token_ids
+             yield self.eos_id
+
+     def __iter__(self):
+         # Each window consumes max_seq_len + 1 tokens: inputs are buffer[:-1],
+         # labels are the same window shifted left by one token.
+         buffer = []
+         for token_id in self._token_stream():
+             buffer.append(token_id)
+             if len(buffer) == self.max_seq_len + 1:
+                 input_ids = torch.tensor(buffer[:-1], dtype=torch.long)
+                 labels = torch.tensor(buffer[1:], dtype=torch.long)
+                 yield input_ids, labels
+                 buffer = []
+
+
+ def create_dataloader(tokenizer, config, rank: int = 0, world_size: int = 1, seed_override: int = None):
+     seed = seed_override if seed_override is not None else config.seed
+     dataset = PackedPretrainDataset(
+         tokenizer=tokenizer,
+         max_seq_len=config.max_seq_len,
+         split="train",
+         cache_dir=config.data_cache_dir,
+         seed=seed + rank,  # per-rank seed offset decorrelates the shuffle across GPUs
+     )
+     return DataLoader(
+         dataset,
+         batch_size=config.batch_size_per_gpu,
+         num_workers=config.num_workers,  # NOTE: each worker replays the same stream unless sharded via get_worker_info()
+         pin_memory=True,
+         prefetch_factor=4,  # only valid when num_workers > 0
+     )
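A minimal usage sketch for the pipeline above, run from this module (or after importing get_tokenizer and create_dataloader from training_code/model/data.py). The config object is an assumption: any object exposing the attributes the file reads (seed, max_seq_len, data_cache_dir, batch_size_per_gpu, num_workers) works; SimpleNamespace stands in for the project's real config here.

from types import SimpleNamespace

if __name__ == "__main__":
    config = SimpleNamespace(
        seed=42,              # shuffle seed (offset by rank inside create_dataloader)
        max_seq_len=2048,     # packed sequence length
        data_cache_dir=None,  # let `datasets` use its default cache
        batch_size_per_gpu=8,
        num_workers=2,        # must be > 0 because prefetch_factor is set
    )

    tokenizer = get_tokenizer()
    loader = create_dataloader(tokenizer, config, rank=0, world_size=1)

    input_ids, labels = next(iter(loader))
    print(input_ids.shape, labels.shape)  # both (batch_size_per_gpu, max_seq_len)

Each batch is a pair of LongTensors of shape (batch_size_per_gpu, max_seq_len); labels are already shifted one position relative to input_ids, so a causal LM loss can be computed against them without further shifting.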