# RippleGPT-Nano / validation/memory/prepare_large_data.py
# Uploaded by Tavernari via huggingface_hub (commit 148b631, verified)
"""
prepare_large_data.py - Prepares large dataset (50-100MB) for memory validation.
Unlike the code completion dataset, this downloads MUCH more code
to train a model that truly learns long-term patterns.
Usage:
python validation/memory/prepare_large_data.py --size 50 # 50MB
python validation/memory/prepare_large_data.py --size 100 # 100MB
"""
import os
import sys
import pickle
import argparse
import numpy as np
from tqdm import tqdm
# Settings
DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')  # output dir next to this script
MIN_FILE_SIZE = 200  # reject files shorter than this many characters
MAX_FILE_SIZE = 15000  # reject files longer than this many characters
TRAIN_SPLIT = 0.95 # 95% train, 5% validation (more training data)
def download_large_python_dataset(target_mb: int) -> str:
    """
    Downloads a large Python code dataset.

    Streams samples from each candidate dataset in order, applies size and
    ASCII-ratio quality filters, and stops once roughly ``target_mb`` MB of
    characters have been accumulated.

    Args:
        target_mb: Target size in megabytes (50, 100, etc)

    Returns:
        All accepted files joined by an end-of-file separator comment.
    """
    from datasets import load_dataset
    target_chars = target_mb * 1_000_000 # ~1 char = 1 byte
    print(f"🔹 Downloading ~{target_mb}MB of Python code...")
    print(" This may take a few minutes...")
    # Try multiple datasets to get enough data; entries are (name, data_dir).
    datasets_to_try = [
        ("bigcode/the-stack-smol", "data/python"),
        ("codeparrot/codeparrot-clean", None),
    ]
    code_samples = []
    current_len = 0
    for dataset_name, data_dir in datasets_to_try:
        if current_len >= target_chars:
            break
        try:
            print(f"\n 📦 Loading: {dataset_name}")
            # Streaming mode avoids materializing the full dataset on disk.
            kwargs = {"split": "train", "streaming": True}
            if data_dir:
                kwargs["data_dir"] = data_dir
            dataset = load_dataset(dataset_name, **kwargs)
            progress = tqdm(
                desc=f" Collecting from {dataset_name.split('/')[-1]}",
                total=target_chars - current_len,
                unit="chars"
            )
            try:
                for sample in dataset:
                    code = sample.get('content', sample.get('code', ''))
                    if not code:
                        continue
                    # Quality filters: skip tiny snippets and very large files.
                    if len(code) < MIN_FILE_SIZE or len(code) > MAX_FILE_SIZE:
                        continue
                    # Filter files with too much non-ASCII content (>5%).
                    try:
                        non_ascii = sum(1 for c in code if ord(c) > 127)
                        if non_ascii / len(code) > 0.05:
                            continue
                    except TypeError:
                        # Sample content was not a plain string; skip it.
                        # (Was a bare `except:`, which also swallowed
                        # KeyboardInterrupt/SystemExit.)
                        continue
                    # Normalize whitespace (tabs -> spaces, CRLF -> LF).
                    code = code.replace('\t', ' ')
                    code = code.replace('\r\n', '\n')
                    code_samples.append(code)
                    current_len += len(code)
                    progress.update(len(code))
                    if current_len >= target_chars:
                        break
            finally:
                # Close the bar even if iteration raises mid-stream
                # (original leaked it into the outer except handler).
                progress.close()
        except Exception as e:
            print(f" ⚠️ Error with {dataset_name}: {e}")
            continue
    if current_len < target_chars * 0.5:
        print(f"\n⚠️ Warning: We only got {current_len / 1e6:.1f}MB of {target_mb}MB")
    # Join with separator
    separator = "\n\n# === END OF FILE ===\n\n"
    full_text = separator.join(code_samples)
    return full_text
def build_vocabulary(text: str) -> dict:
    """Builds character vocabulary.

    Returns a dict holding the sorted character set, its size, and the
    char<->integer lookup tables used for encoding and decoding.
    """
    chars = sorted(set(text))
    stoi = {}
    itos = {}
    for index, ch in enumerate(chars):
        stoi[ch] = index
        itos[index] = ch
    return {
        'vocab_size': len(chars),
        'stoi': stoi,
        'itos': itos,
        'chars': chars
    }
def prepare_large_dataset(target_mb: int = 50):
    """Main preparation pipeline.

    Downloads ~target_mb MB of Python code, builds a character-level
    vocabulary (saved to meta.pkl), splits 95/5 into train/validation,
    encodes both splits to uint16 train.bin / val.bin, and writes a
    stats.pkl summary — all under DATA_DIR.

    Args:
        target_mb: Target dataset size in megabytes (default 50).

    Returns:
        dict: Summary statistics (also persisted to stats.pkl).
    """
    print("=" * 60)
    print(f"🧠 PREPARING LARGE DATASET ({target_mb}MB) FOR KILLER TEST")
    print("=" * 60)
    os.makedirs(DATA_DIR, exist_ok=True)
    # 1. Download code
    code_text = download_large_python_dataset(target_mb)
    actual_mb = len(code_text) / 1e6
    print(f"\n📊 Final Statistics:")
    print(f" Total characters: {len(code_text):,}")
    print(f" Actual size: {actual_mb:.2f} MB")
    # 2. Vocabulary
    print("\n🔤 Building vocabulary...")
    vocab = build_vocabulary(code_text)
    print(f" Vocab size: {vocab['vocab_size']}")
    meta_path = os.path.join(DATA_DIR, 'meta.pkl')
    with open(meta_path, 'wb') as f:
        pickle.dump(vocab, f)
    # 3. Split
    print("\n✂️ Splitting train/validation...")
    n = len(code_text)
    split_idx = int(n * TRAIN_SPLIT)
    train_text = code_text[:split_idx]
    val_text = code_text[split_idx:]
    print(f" Train: {len(train_text)/1e6:.2f} MB")
    print(f" Validation: {len(val_text)/1e6:.2f} MB")
    # 4. Encode and save
    print("\n💾 Encoding and saving (this may take a while)...")
    stoi = vocab['stoi']

    def _encode_to_file(text: str, path: str, label: str) -> None:
        # Encode characters to uint16 ids, chunked to bound peak memory
        # on 100MB+ inputs. np.fromiter with count= avoids building an
        # intermediate Python list per chunk.
        chunk_size = 10_000_000
        with open(path, 'wb') as f:
            for i in range(0, len(text), chunk_size):
                chunk = text[i:i + chunk_size]
                ids = np.fromiter((stoi[c] for c in chunk),
                                  dtype=np.uint16, count=len(chunk))
                ids.tofile(f)
                print(f"\r {label}: {min(i + chunk_size, len(text))/1e6:.1f}MB processed", end="")
        print()

    # One shared encoder for both splits (original duplicated the loop and
    # only reported progress for the train split).
    _encode_to_file(train_text, os.path.join(DATA_DIR, 'train.bin'), 'Train')
    _encode_to_file(val_text, os.path.join(DATA_DIR, 'val.bin'), 'Val')
    # 5. Stats
    stats = {
        'target_mb': target_mb,
        'actual_mb': actual_mb,
        'train_chars': len(train_text),
        'val_chars': len(val_text),
        'vocab_size': vocab['vocab_size'],
    }
    with open(os.path.join(DATA_DIR, 'stats.pkl'), 'wb') as f:
        pickle.dump(stats, f)
    print("\n" + "=" * 60)
    print("✅ LARGE DATASET PREPARED!")
    print("=" * 60)
    print(f"\nNext step: python validation/memory/train_large.py --config medium")
    return stats
if __name__ == '__main__':
    # CLI entry point: --size selects the target dataset size in MB.
    cli = argparse.ArgumentParser(description='Prepares large dataset for Killer Test')
    cli.add_argument('--size', type=int, default=50, help='Size in MB (default: 50)')
    options = cli.parse_args()
    prepare_large_dataset(options.size)