# RippleGPT-Nano / validation/memory/prepare_large_data.py
# Uploaded by Tavernari via huggingface_hub (commit 148b631, verified)
"""
prepare_large_data.py - Prepares large dataset (50-100MB) for memory validation.
Unlike the code completion dataset, this downloads MUCH more code
to train a model that truly learns long-term patterns.
Usage:
python validation/memory/prepare_large_data.py --size 50 # 50MB
python validation/memory/prepare_large_data.py --size 100 # 100MB
"""
import os
import sys
import pickle
import argparse
import numpy as np
from tqdm import tqdm
# Settings
DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')  # output dir next to this script
MIN_FILE_SIZE = 200  # reject files shorter than this many characters
MAX_FILE_SIZE = 15000  # reject files longer than this many characters
TRAIN_SPLIT = 0.95 # 95% train, 5% validation (more training data)
def download_large_python_dataset(target_mb: int) -> str:
    """
    Downloads a large Python code dataset.

    Streams samples from each candidate dataset in order, applies size and
    ASCII-ratio quality filters, and stops once roughly ``target_mb`` MB of
    characters have been accumulated.

    Args:
        target_mb: Target size in megabytes (50, 100, etc)

    Returns:
        All accepted files joined by an end-of-file separator comment.
    """
    from datasets import load_dataset
    target_chars = target_mb * 1_000_000 # ~1 char = 1 byte
    print(f"🔹 Downloading ~{target_mb}MB of Python code...")
    print(" This may take a few minutes...")
    # Try multiple datasets to get enough data; entries are (name, data_dir).
    datasets_to_try = [
        ("bigcode/the-stack-smol", "data/python"),
        ("codeparrot/codeparrot-clean", None),
    ]
    code_samples = []
    current_len = 0
    for dataset_name, data_dir in datasets_to_try:
        if current_len >= target_chars:
            break
        try:
            print(f"\n 📦 Loading: {dataset_name}")
            # Streaming mode avoids materializing the full dataset on disk.
            kwargs = {"split": "train", "streaming": True}
            if data_dir:
                kwargs["data_dir"] = data_dir
            dataset = load_dataset(dataset_name, **kwargs)
            progress = tqdm(
                desc=f" Collecting from {dataset_name.split('/')[-1]}",
                total=target_chars - current_len,
                unit="chars"
            )
            try:
                for sample in dataset:
                    code = sample.get('content', sample.get('code', ''))
                    if not code:
                        continue
                    # Quality filters: skip tiny snippets and very large files.
                    if len(code) < MIN_FILE_SIZE or len(code) > MAX_FILE_SIZE:
                        continue
                    # Filter files with too much non-ASCII content (>5%).
                    try:
                        non_ascii = sum(1 for c in code if ord(c) > 127)
                        if non_ascii / len(code) > 0.05:
                            continue
                    except TypeError:
                        # Sample content was not a plain string; skip it.
                        # (Was a bare `except:`, which also swallowed
                        # KeyboardInterrupt/SystemExit.)
                        continue
                    # Normalize whitespace (tabs -> spaces, CRLF -> LF).
                    code = code.replace('\t', ' ')
                    code = code.replace('\r\n', '\n')
                    code_samples.append(code)
                    current_len += len(code)
                    progress.update(len(code))
                    if current_len >= target_chars:
                        break
            finally:
                # Close the bar even if iteration raises mid-stream
                # (original leaked it into the outer except handler).
                progress.close()
        except Exception as e:
            print(f" ⚠️ Error with {dataset_name}: {e}")
            continue
    if current_len < target_chars * 0.5:
        print(f"\n⚠️ Warning: We only got {current_len / 1e6:.1f}MB of {target_mb}MB")
    # Join with separator
    separator = "\n\n# === END OF FILE ===\n\n"
    full_text = separator.join(code_samples)
    return full_text
def build_vocabulary(text: str) -> dict:
    """Builds character vocabulary.

    Returns a dict holding the sorted character set, its size, and the
    char<->integer lookup tables used for encoding and decoding.
    """
    chars = sorted(set(text))
    stoi = {}
    itos = {}
    for index, ch in enumerate(chars):
        stoi[ch] = index
        itos[index] = ch
    return {
        'vocab_size': len(chars),
        'stoi': stoi,
        'itos': itos,
        'chars': chars
    }
def prepare_large_dataset(target_mb: int = 50):
    """Main preparation pipeline.

    Downloads ~target_mb MB of Python code, builds a character-level
    vocabulary (saved to meta.pkl), splits 95/5 into train/validation,
    encodes both splits to uint16 train.bin / val.bin, and writes a
    stats.pkl summary — all under DATA_DIR.

    Args:
        target_mb: Target dataset size in megabytes (default 50).

    Returns:
        dict: Summary statistics (also persisted to stats.pkl).
    """
    print("=" * 60)
    print(f"🧠 PREPARING LARGE DATASET ({target_mb}MB) FOR KILLER TEST")
    print("=" * 60)
    os.makedirs(DATA_DIR, exist_ok=True)
    # 1. Download code
    code_text = download_large_python_dataset(target_mb)
    actual_mb = len(code_text) / 1e6
    print(f"\n📊 Final Statistics:")
    print(f" Total characters: {len(code_text):,}")
    print(f" Actual size: {actual_mb:.2f} MB")
    # 2. Vocabulary
    print("\n🔤 Building vocabulary...")
    vocab = build_vocabulary(code_text)
    print(f" Vocab size: {vocab['vocab_size']}")
    meta_path = os.path.join(DATA_DIR, 'meta.pkl')
    with open(meta_path, 'wb') as f:
        pickle.dump(vocab, f)
    # 3. Split
    print("\n✂️ Splitting train/validation...")
    n = len(code_text)
    split_idx = int(n * TRAIN_SPLIT)
    train_text = code_text[:split_idx]
    val_text = code_text[split_idx:]
    print(f" Train: {len(train_text)/1e6:.2f} MB")
    print(f" Validation: {len(val_text)/1e6:.2f} MB")
    # 4. Encode and save
    print("\n💾 Encoding and saving (this may take a while)...")
    stoi = vocab['stoi']

    def _encode_to_file(text: str, path: str, label: str) -> None:
        # Encode characters to uint16 ids, chunked to bound peak memory
        # on 100MB+ inputs. np.fromiter with count= avoids building an
        # intermediate Python list per chunk.
        chunk_size = 10_000_000
        with open(path, 'wb') as f:
            for i in range(0, len(text), chunk_size):
                chunk = text[i:i + chunk_size]
                ids = np.fromiter((stoi[c] for c in chunk),
                                  dtype=np.uint16, count=len(chunk))
                ids.tofile(f)
                print(f"\r {label}: {min(i + chunk_size, len(text))/1e6:.1f}MB processed", end="")
        print()

    # One shared encoder for both splits (original duplicated the loop and
    # only reported progress for the train split).
    _encode_to_file(train_text, os.path.join(DATA_DIR, 'train.bin'), 'Train')
    _encode_to_file(val_text, os.path.join(DATA_DIR, 'val.bin'), 'Val')
    # 5. Stats
    stats = {
        'target_mb': target_mb,
        'actual_mb': actual_mb,
        'train_chars': len(train_text),
        'val_chars': len(val_text),
        'vocab_size': vocab['vocab_size'],
    }
    with open(os.path.join(DATA_DIR, 'stats.pkl'), 'wb') as f:
        pickle.dump(stats, f)
    print("\n" + "=" * 60)
    print("✅ LARGE DATASET PREPARED!")
    print("=" * 60)
    print(f"\nNext step: python validation/memory/train_large.py --config medium")
    return stats
if __name__ == '__main__':
    # CLI entry point: --size selects the target dataset size in MB.
    cli = argparse.ArgumentParser(description='Prepares large dataset for Killer Test')
    cli.add_argument('--size', type=int, default=50, help='Size in MB (default: 50)')
    options = cli.parse_args()
    prepare_large_dataset(options.size)