RippleGPT-Nano / validation /code /prepare_code_data.py

Upload folder using huggingface_hub

148b631 verified 28 days ago

5.91 kB

	"""
	prepare_code_data.py - Prepares the-stack-smol dataset for code completion validation.

	This script:
	1. Downloads Python code from HuggingFace (streaming)
	2. Filters and cleans the code
	3. Tokenizes at character level
	4. Saves in binary format for training

	Usage:
	python validation/code/prepare_code_data.py
	"""

	import os
	import pickle
	import numpy as np
	from tqdm import tqdm

	# Settings
	# Output directory: a 'data' folder located next to this script.
	DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
	TARGET_SIZE_CHARS = 5_000_000 # Characters of code to collect (~5MB of Python code)
	MIN_FILE_SIZE = 100 # Skip samples shorter than this many characters
	MAX_FILE_SIZE = 10000 # Skip samples longer than this many characters
	TRAIN_SPLIT = 0.9 # Fraction of characters used for training (90% train, 10% validation)


	def download_python_code(target_chars: int) -> str:
	    """
	    Stream Python source files from HuggingFace until enough text is collected.

	    The primary dataset is ``bigcode/the-stack-smol`` (``data/python``); if
	    opening it raises, ``codeparrot/codeparrot-clean`` is streamed instead.
	    Only as many samples as needed are pulled from the stream.

	    A sample is kept when all of the following hold:

	    - its text (the ``content`` field, else ``code``) is non-empty;
	    - its length is within ``[MIN_FILE_SIZE, MAX_FILE_SIZE]`` characters;
	    - at most 10% of its characters are non-ASCII.

	    Every tab in a kept sample is replaced by four spaces.

	    Args:
	        target_chars: Stop once the kept samples total at least this many
	            characters (counted after tab replacement, separators excluded).

	    Returns:
	        The kept samples joined by a ``# === END OF FILE ===`` separator
	        line, or an empty string if no sample passed the filters.
	    """
	    from datasets import load_dataset

	    print("🔹 Downloading Python code from the-stack-smol...")
	    print(" (Using streaming, not downloading entire dataset)")

	    try:
	        # Streaming: download only what we need
	        dataset = load_dataset(
	            "bigcode/the-stack-smol",
	            data_dir="data/python",
	            split="train",
	            streaming=True,
	        )
	    except Exception as e:
	        print(f"❌ Error accessing HuggingFace: {e}")
	        print(" Trying alternative dataset...")
	        # Fallback to another code dataset
	        dataset = load_dataset(
	            "codeparrot/codeparrot-clean",
	            split="train",
	            streaming=True,
	        )

	    code_samples = []
	    current_len = 0

	    # The context manager closes the progress bar even if the stream raises.
	    with tqdm(desc="Collecting code", total=target_chars, unit="chars") as progress:
	        for sample in dataset:
	            # Extract code content
	            code = sample.get('content', sample.get('code', ''))
	            if not code:
	                continue

	            # Quality filter: drop very small and very large files
	            if len(code) < MIN_FILE_SIZE or len(code) > MAX_FILE_SIZE:
	                continue

	            # Quality filter: drop files with more than 10% non-ASCII chars.
	            # Encoding with 'ignore' drops exactly the non-ASCII characters,
	            # so the length difference is their count.
	            non_ascii = len(code) - len(code.encode('ascii', 'ignore'))
	            if non_ascii / len(code) > 0.1:
	                continue

	            # Normalize indentation (convert tabs to 4 spaces)
	            code = code.replace('\t', '    ')

	            code_samples.append(code)
	            current_len += len(code)
	            progress.update(len(code))

	            if current_len >= target_chars:
	                break

	    # Join with special separator
	    separator = "\n\n# === END OF FILE ===\n\n"
	    return separator.join(code_samples)


	def build_vocabulary(text: str) -> dict:
	    """
	    Derive a character-level vocabulary from ``text``.

	    Each distinct character gets an integer id equal to its position in
	    the sorted list of distinct characters.

	    Returns:
	        A dict with four entries:
	        'vocab_size' (number of distinct characters),
	        'stoi' (char -> int), 'itos' (int -> char) and
	        'chars' (the sorted list of distinct characters).
	    """
	    alphabet = sorted(set(text))

	    char_to_id = dict(zip(alphabet, range(len(alphabet))))
	    id_to_char = dict(enumerate(alphabet))

	    vocab = {}
	    vocab['vocab_size'] = len(alphabet)
	    vocab['stoi'] = char_to_id
	    vocab['itos'] = id_to_char
	    vocab['chars'] = alphabet
	    return vocab


	def encode_text(text: str, stoi: dict) -> np.ndarray:
	    """
	    Encode ``text`` as a 1-D ``uint16`` array of character ids.

	    Args:
	        text: String to encode.
	        stoi: Mapping from character to integer id.

	    Returns:
	        Array of dtype ``np.uint16`` holding one id per character of ``text``.

	    Raises:
	        KeyError: If ``text`` contains a character that is missing from ``stoi``.
	        OverflowError: If any id used by ``text`` does not fit in ``uint16``.
	    """
	    if not text:
	        return np.array([], dtype=np.uint16)

	    # Collect ids in a wide dtype first so that an out-of-range id is
	    # detected explicitly rather than wrapped around by the uint16 cast.
	    ids = np.fromiter((stoi[ch] for ch in text), dtype=np.int64, count=len(text))

	    limit = int(np.iinfo(np.uint16).max)
	    lowest, highest = int(ids.min()), int(ids.max())
	    if lowest < 0 or highest > limit:
	        raise OverflowError(
	            f"token ids must be within [0, {limit}] to be stored as uint16, "
	            f"got range [{lowest}, {highest}]"
	        )

	    return ids.astype(np.uint16)


	def prepare_dataset():
	    """
	    Main preparation pipeline.

	    Steps:
	    1. Create ``DATA_DIR`` and download roughly ``TARGET_SIZE_CHARS`` characters of code.
	    2. Build the character vocabulary and pickle it to ``meta.pkl``.
	    3. Split the text by character position into train/validation
	       according to ``TRAIN_SPLIT``.
	    4. Encode both parts and write them to ``train.bin`` / ``val.bin``.
	    5. Pickle a small statistics dict to ``stats.pkl``.

	    Returns:
	        The statistics dict with keys 'total_chars', 'train_chars',
	        'val_chars', 'vocab_size' and 'source'.

	    Raises:
	        RuntimeError: If the download step yields no text at all.
	    """
	    print("=" * 60)
	    print("🧪 PREPARING CODE DATASET FOR VALIDATION")
	    print("=" * 60)

	    # Create data directory
	    os.makedirs(DATA_DIR, exist_ok=True)

	    # 1. Download code
	    print(f"\n📥 Downloading ~{TARGET_SIZE_CHARS / 1e6:.1f}MB of Python code...")
	    code_text = download_python_code(TARGET_SIZE_CHARS)

	    # Fail early instead of writing empty artifacts and reporting success.
	    if not code_text:
	        raise RuntimeError("No code samples were collected; nothing to prepare.")

	    print("\n📊 Statistics:")
	    print(f" Total characters: {len(code_text):,}")
	    print(f" Size on disk: {len(code_text) / 1024 / 1024:.2f} MB")

	    # 2. Build vocabulary
	    print("\n🔤 Building vocabulary...")
	    vocab = build_vocabulary(code_text)
	    print(f" Vocab size: {vocab['vocab_size']}")
	    print(f" Characters (sample): {''.join(vocab['chars'][:50])}...")

	    # Save vocabulary
	    meta_path = os.path.join(DATA_DIR, 'meta.pkl')
	    with open(meta_path, 'wb') as f:
	        pickle.dump(vocab, f)
	    print(f" Saved to: {meta_path}")

	    # 3. Split train/validation
	    # The cut is made at a character offset, so it may fall inside a file.
	    print("\n✂️ Splitting train/validation...")
	    n = len(code_text)
	    split_idx = int(n * TRAIN_SPLIT)

	    train_text = code_text[:split_idx]
	    val_text = code_text[split_idx:]

	    print(f" Train: {len(train_text):,} chars ({TRAIN_SPLIT*100:.0f}%)")
	    print(f" Validation: {len(val_text):,} chars ({(1-TRAIN_SPLIT)*100:.0f}%)")

	    # 4. Encode and save
	    print("\n💾 Encoding and saving...")

	    train_ids = encode_text(train_text, vocab['stoi'])
	    val_ids = encode_text(val_text, vocab['stoi'])

	    train_path = os.path.join(DATA_DIR, 'train.bin')
	    val_path = os.path.join(DATA_DIR, 'val.bin')

	    # Raw dump of the id arrays; no header is written.
	    train_ids.tofile(train_path)
	    val_ids.tofile(val_path)

	    print(f" Train saved to: {train_path}")
	    print(f" Validation saved to: {val_path}")

	    # 5. Create statistics file
	    # NOTE(review): 'source' is hard-coded; download_python_code may have
	    # fallen back to another dataset, which is not reflected here.
	    stats = {
	        'total_chars': len(code_text),
	        'train_chars': len(train_text),
	        'val_chars': len(val_text),
	        'vocab_size': vocab['vocab_size'],
	        'source': 'bigcode/the-stack-smol',
	    }

	    stats_path = os.path.join(DATA_DIR, 'stats.pkl')
	    with open(stats_path, 'wb') as f:
	        pickle.dump(stats, f)

	    print("\n" + "=" * 60)
	    print("✅ DATASET PREPARED SUCCESSFULLY!")
	    print("=" * 60)
	    print("\nNext step: python validation/code/train_code.py")

	    return stats


	# Script entry point: run the preparation pipeline only when this file is
	# executed directly, not when it is imported.
	if __name__ == '__main__':
	    prepare_dataset()