grapheneaffiliates committed on
Commit
8cb233a
·
verified ·
1 Parent(s): d96a389

Upload python/prepare_data.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. python/prepare_data.py +270 -0
python/prepare_data.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data download and tokenization pipeline for H4 Polytopic Attention experiments.
3
+
4
+ Supports multiple datasets with automatic download and caching:
5
+ - synthetic: Fibonacci-structured phrases (no download needed)
6
+ - shakespeare: Tiny Shakespeare (~1MB character-level text)
7
+ - tinystories: TinyStories from HuggingFace (real children's stories)
8
+
9
+ All datasets return the same interface:
10
+ (train_data, val_data, vocab_size, stoi, itos)
11
+ """
12
+
13
+ import os
14
+ import sys
15
+ import json
16
+ import torch
17
+ import urllib.request
18
+
19
# Directory for downloaded/cached dataset files: <repo>/data, resolved
# relative to this file so the cache location is independent of the CWD.
DATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data')

# Registry of supported datasets, keyed by the name passed to load_dataset().
# 'source' selects the loading strategy:
#   'synthetic'   — generated in-process, nothing downloaded
#   'url'         — fetched once via plain HTTP and cached under DATA_DIR
#   'huggingface' — loaded via the optional `datasets` library, cached as text
DATASETS = {
    'synthetic': {
        'source': 'synthetic',
        'description': 'Fibonacci-structured phrases (built-in)',
    },
    'shakespeare': {
        'source': 'url',
        'url': 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt',
        'filename': 'shakespeare.txt',
        'description': 'Tiny Shakespeare (~1MB, character-level)',
    },
    'tinystories': {
        'source': 'huggingface',
        'path': 'roneneldan/TinyStories',
        'split': 'train',           # HF split used for training text
        'val_split': 'validation',  # HF split used for validation text
        'filename': 'tinystories.txt',
        'val_filename': 'tinystories_val.txt',
        'description': 'TinyStories (HuggingFace, real children\'s stories)',
        # Fallback URL if HF datasets library is not installed
        'fallback_url': None,  # Too large for raw URL fallback
    },
}
44
+
45
+
46
def _ensure_data_dir():
    """Make sure the dataset cache directory exists before any file I/O."""
    if not os.path.isdir(DATA_DIR):
        os.makedirs(DATA_DIR, exist_ok=True)
49
+
50
+
51
def _download_url(url, filepath):
    """Download *url* to *filepath* using urllib (stdlib only).

    Args:
        url: Source URL.
        filepath: Destination path on disk.

    Returns:
        True on success, False on any failure. On failure any
        partially-written file is removed, because the callers use a bare
        os.path.exists() check to decide whether a cached copy is valid —
        a leftover partial download would silently be used as the dataset.
    """
    print(f"Downloading {url} ...")
    try:
        urllib.request.urlretrieve(url, filepath)
        print(f" Saved to {filepath} ({os.path.getsize(filepath)} bytes)")
        return True
    except Exception as e:
        print(f" Download failed: {e}")
        # urlretrieve may leave a truncated file behind on error; delete it
        # so the next run retries instead of reading corrupt data.
        try:
            os.remove(filepath)
        except OSError:
            pass
        return False
61
+
62
+
63
def _generate_synthetic_text():
    """Generate synthetic text with Fibonacci-structured repetitions.

    Walks the Fibonacci recurrence for 200 steps; each step picks a phrase
    by the current term mod the phrase count and repeats it 1-3 times.
    """
    base_phrases = [
        "the golden ratio appears in nature ",
        "fibonacci numbers grow exponentially ",
        "symmetry underlies all of physics ",
        "the icosahedron has twenty faces ",
        "phi equals one plus one over phi ",
        "geometry is the language of space ",
        "five fold symmetry cannot tile a plane ",
        "the dodecahedron has twelve faces ",
    ]
    pieces = []
    prev, curr = 1, 1
    for _ in range(200):
        chosen = base_phrases[prev % len(base_phrases)]
        repeats = curr % 3 + 1
        pieces.append(chosen * repeats)
        prev, curr = curr, prev + curr
    return "".join(pieces)
82
+
83
+
84
def _load_shakespeare():
    """Return the Tiny Shakespeare text, downloading and caching it if needed.

    Returns None when the download fails, signalling the caller to fall
    back to synthetic data.
    """
    _ensure_data_dir()
    cfg = DATASETS['shakespeare']
    filepath = os.path.join(DATA_DIR, cfg['filename'])

    # Download only when no cached copy exists.
    if not os.path.exists(filepath) and not _download_url(cfg['url'], filepath):
        print("Shakespeare download failed, falling back to synthetic data.")
        return None

    with open(filepath, 'r', encoding='utf-8') as f:
        text = f.read()
    print(f"Loaded Shakespeare: {len(text):,} chars")
    return text
99
+
100
+
101
def _collect_split_text(split, max_chars):
    """Concatenate the 'text' field of each record in *split* (newline-joined),
    stopping once *max_chars* characters have been gathered.

    Uses list-append + join so the accumulation is O(n) rather than the
    quadratic repeated string concatenation.
    """
    parts = []
    total = 0
    for item in split:
        piece = item['text'] + "\n"
        parts.append(piece)
        total += len(piece)
        if total >= max_chars:
            break
    return "".join(parts)[:max_chars]


def _load_tinystories():
    """Load TinyStories from HuggingFace datasets or cached files.

    Returns:
        (train_text, val_text) on success, or None when the `datasets`
        library is missing or loading fails (caller falls back to synthetic).
    """
    _ensure_data_dir()
    cfg = DATASETS['tinystories']
    train_path = os.path.join(DATA_DIR, cfg['filename'])
    val_path = os.path.join(DATA_DIR, cfg['val_filename'])

    # Check cache first
    if os.path.exists(train_path) and os.path.exists(val_path):
        with open(train_path, 'r', encoding='utf-8') as f:
            train_text = f.read()
        with open(val_path, 'r', encoding='utf-8') as f:
            val_text = f.read()
        print(f"Loaded TinyStories from cache: train={len(train_text):,} chars, val={len(val_text):,} chars")
        return train_text, val_text

    # Try HuggingFace datasets library
    try:
        from datasets import load_dataset as hf_load_dataset
        print("Loading TinyStories from HuggingFace (this may take a while)...")
        ds = hf_load_dataset(cfg['path'])

        # Extract text — TinyStories has a 'text' field.
        # Limit to first 5M chars for manageability on CPU; the validation
        # split is capped at a tenth of that.
        MAX_CHARS = 5_000_000
        train_text = _collect_split_text(ds[cfg['split']], MAX_CHARS)
        val_text = _collect_split_text(ds[cfg['val_split']], MAX_CHARS // 10)

        # Cache to disk so subsequent runs skip the HF download entirely.
        with open(train_path, 'w', encoding='utf-8') as f:
            f.write(train_text)
        with open(val_path, 'w', encoding='utf-8') as f:
            f.write(val_text)

        print(f"TinyStories loaded and cached: train={len(train_text):,} chars, val={len(val_text):,} chars")
        return train_text, val_text

    except ImportError:
        print("HuggingFace 'datasets' library not installed.")
        print("Install with: pip install datasets")
        print("Falling back to synthetic data.")
        return None
    except Exception as e:
        print(f"Failed to load TinyStories: {e}")
        print("Falling back to synthetic data.")
        return None
158
+
159
+
160
def prepare_char_dataset(text, val_text=None):
    """Build a character-level dataset from raw text.

    The vocabulary is built over train+val so both splits share one mapping.
    When *val_text* is None, *text* is split 90/10 into train/val.

    Returns:
        (train_data, val_data, vocab_size, stoi, itos)
    """
    corpus = text if val_text is None else text + val_text

    vocab = sorted(set(corpus))
    stoi = {ch: idx for idx, ch in enumerate(vocab)}
    itos = dict(enumerate(vocab))

    def encode(s):
        return torch.tensor([stoi[ch] for ch in s], dtype=torch.long)

    if val_text is None:
        # Single corpus: split 90/10 into train/val.
        encoded = encode(text)
        cut = int(0.9 * len(encoded))
        train_data, val_data = encoded[:cut], encoded[cut:]
    else:
        train_data, val_data = encode(text), encode(val_text)

    return train_data, val_data, len(vocab), stoi, itos
187
+
188
+
189
def load_dataset(name='shakespeare'):
    """Load a dataset by name. Returns raw text (or tuple for pre-split datasets).

    For use with train_cpu.py's load_text_data() replacement.

    Args:
        name: 'synthetic', 'shakespeare', or 'tinystories'

    Returns:
        text (str) for single-text datasets, or
        (train_text, val_text) for pre-split datasets, or
        None on failure (caller should fall back to synthetic)
    """
    # Dispatch table instead of an if/elif chain.
    loaders = {
        'synthetic': _generate_synthetic_text,
        'shakespeare': _load_shakespeare,
        'tinystories': _load_tinystories,
    }
    loader = loaders.get(name)
    if loader is None:
        print(f"Unknown dataset: {name}. Available: {list(DATASETS.keys())}")
        return None
    return loader()
211
+
212
+
213
def load_and_prepare(name='shakespeare'):
    """Full pipeline: download, tokenize, return ready-to-train tensors.

    Returns:
        (train_data, val_data, vocab_size, stoi, itos)
    """
    loaded = load_dataset(name)

    if loaded is None:
        # Loader failed or name unknown — fall back to synthetic text.
        print("Using synthetic fallback data.")
        loaded = _generate_synthetic_text()

    if isinstance(loaded, tuple):
        # Pre-split dataset (e.g., TinyStories): keep its train/val split.
        train_text, val_text = loaded
        return prepare_char_dataset(train_text, val_text)

    # Single text: prepare_char_dataset splits it 90/10.
    return prepare_char_dataset(loaded)
234
+
235
+
236
def list_datasets():
    """Print the available datasets, annotating any that are cached on disk."""
    print("Available datasets:")
    for name, cfg in DATASETS.items():
        cached = ""
        # Both 'url' and 'huggingface' sources cache a text file under
        # DATA_DIR keyed by cfg['filename']; the two branches were identical
        # copy-paste, so they are consolidated here. 'synthetic' has no file.
        if cfg['source'] in ('url', 'huggingface'):
            path = os.path.join(DATA_DIR, cfg.get('filename', ''))
            if os.path.exists(path):
                cached = f" [cached: {os.path.getsize(path):,} bytes]"
        print(f" {name:15s} — {cfg['description']}{cached}")
250
+
251
+
252
if __name__ == '__main__':
    import argparse

    # CLI: `python prepare_data.py [dataset]` downloads + tokenizes a dataset;
    # `--list` only reports what is available (and what is cached).
    cli = argparse.ArgumentParser(description='Prepare datasets for H4 experiments')
    cli.add_argument('dataset', nargs='?', default='shakespeare',
                     choices=list(DATASETS.keys()),
                     help='Dataset to prepare (default: shakespeare)')
    cli.add_argument('--list', action='store_true', help='List available datasets')
    args = cli.parse_args()

    if args.list:
        list_datasets()
        sys.exit(0)

    train_data, val_data, vocab_size, stoi, itos = load_and_prepare(args.dataset)

    # Summarize the prepared tensors and show a short decoded sample.
    print(f"\nDataset: {args.dataset}")
    print(f"Vocab size: {vocab_size}")
    print(f"Train tokens: {len(train_data):,}")
    print(f"Val tokens: {len(val_data):,}")
    print(f"Sample chars: {''.join(itos[i] for i in train_data[:80].tolist())}")