|
|
""" |
|
|
prepare_large_data.py - Prepares large dataset (50-100MB) for memory validation. |
|
|
|
|
|
Unlike the code completion dataset, this downloads MUCH more code |
|
|
to train a model that truly learns long-term patterns. |
|
|
|
|
|
Usage: |
|
|
python validation/memory/prepare_large_data.py --size 50 # 50MB |
|
|
python validation/memory/prepare_large_data.py --size 100 # 100MB |
|
|
""" |
|
|
|
|
|
import os |
|
|
import sys |
|
|
import pickle |
|
|
import argparse |
|
|
import numpy as np |
|
|
from tqdm import tqdm |
|
|
|
|
|
|
|
|
# Output directory for the prepared .bin/.pkl artifacts (next to this script).
DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
# Per-sample filters: files shorter than MIN_FILE_SIZE characters or longer
# than MAX_FILE_SIZE characters are skipped during collection.
MIN_FILE_SIZE = 200
MAX_FILE_SIZE = 15000
# Fraction of the corpus used for training; the remainder is validation.
TRAIN_SPLIT = 0.95
|
|
|
|
|
|
|
|
def download_large_python_dataset(target_mb: int) -> str:
    """Download roughly ``target_mb`` MB of Python source code.

    Streams samples from public HuggingFace datasets (tried in order),
    filters out tiny/huge/mostly-non-ASCII files, lightly normalizes
    whitespace, and joins everything into one large text blob.

    Args:
        target_mb: Target size in megabytes (50, 100, etc)

    Returns:
        A single string of concatenated Python files, separated by an
        explicit end-of-file marker.
    """
    from datasets import load_dataset

    target_chars = target_mb * 1_000_000

    print(f"🔹 Downloading ~{target_mb}MB of Python code...")
    print(" This may take a few minutes...")

    # Candidate sources, tried in order until enough data is collected.
    datasets_to_try = [
        ("bigcode/the-stack-smol", "data/python"),
        ("codeparrot/codeparrot-clean", None),
    ]

    code_samples = []
    current_len = 0

    for dataset_name, data_dir in datasets_to_try:
        if current_len >= target_chars:
            break

        try:
            print(f"\n 📦 Loading: {dataset_name}")

            # Streaming avoids downloading the full dataset to disk; only
            # pass data_dir when the source actually needs one.
            load_kwargs = {"split": "train", "streaming": True}
            if data_dir:
                load_kwargs["data_dir"] = data_dir
            dataset = load_dataset(dataset_name, **load_kwargs)

            progress = tqdm(
                desc=f" Collecting from {dataset_name.split('/')[-1]}",
                total=target_chars - current_len,
                unit="chars"
            )

            for sample in dataset:
                # Field name differs between datasets.
                code = sample.get('content', sample.get('code', ''))

                if not code:
                    continue

                # Skip tiny snippets and huge (likely generated) files.
                if len(code) < MIN_FILE_SIZE or len(code) > MAX_FILE_SIZE:
                    continue

                # Drop files that are mostly non-ASCII (binary blobs, heavily
                # localized comments). NOTE: the original wrapped this in a
                # bare `except: continue`, but ord() on a str character never
                # raises and len(code) >= MIN_FILE_SIZE > 0, so the guard was
                # dead code that only masked real bugs — removed.
                non_ascii = sum(1 for c in code if ord(c) > 127)
                if non_ascii / len(code) > 0.05:
                    continue

                # Normalize whitespace so the char vocabulary stays small.
                code = code.replace('\t', ' ')
                code = code.replace('\r\n', '\n')

                code_samples.append(code)
                current_len += len(code)
                progress.update(len(code))

                if current_len >= target_chars:
                    break

            progress.close()

        except Exception as e:
            # A dataset may be renamed/unavailable; report and try the next.
            print(f" ⚠️ Error with {dataset_name}: {e}")
            continue

    if current_len < target_chars * 0.5:
        print(f"\n⚠️ Warning: We only got {current_len / 1e6:.1f}MB of {target_mb}MB")

    # Explicit separator so file boundaries survive concatenation.
    separator = "\n\n# === END OF FILE ===\n\n"
    full_text = separator.join(code_samples)

    return full_text
|
|
|
|
|
|
|
|
def build_vocabulary(text: str) -> dict:
    """Build a character-level vocabulary from ``text``.

    Args:
        text: Corpus to extract characters from.

    Returns:
        Dict with 'vocab_size' (int), 'stoi' (char -> id), 'itos'
        (id -> char), and 'chars' (sorted unique characters).
    """
    # sorted() accepts any iterable — wrapping the set in list() was redundant.
    chars = sorted(set(text))
    vocab_size = len(chars)

    stoi = {ch: i for i, ch in enumerate(chars)}
    itos = {i: ch for i, ch in enumerate(chars)}

    return {
        'vocab_size': vocab_size,
        'stoi': stoi,
        'itos': itos,
        'chars': chars,
    }
|
|
|
|
|
|
|
|
def _encode_to_file(text: str, stoi: dict, path: str, label: str = "") -> None:
    """Encode ``text`` char-by-char via ``stoi`` and write uint16 ids to ``path``.

    Processes the text in 10M-char chunks to bound peak memory on
    50-100MB corpora. When ``label`` is given, prints chunk progress.
    """
    chunk_size = 10_000_000
    with open(path, 'wb') as f:
        for i in range(0, len(text), chunk_size):
            chunk = text[i:i + chunk_size]
            # uint16 is sufficient: char vocabularies are far below 65536.
            ids = np.array([stoi[c] for c in chunk], dtype=np.uint16)
            ids.tofile(f)
            if label:
                print(f"\r {label}: {min(i + chunk_size, len(text)) / 1e6:.1f}MB processed", end="")
    if label:
        print()


def prepare_large_dataset(target_mb: int = 50):
    """Main preparation pipeline.

    Downloads ~``target_mb`` MB of Python code, builds a char-level
    vocabulary, splits train/validation, encodes both halves to uint16
    .bin files under DATA_DIR, and pickles the vocabulary (meta.pkl)
    and run statistics (stats.pkl).

    Args:
        target_mb: Approximate corpus size in megabytes.

    Returns:
        Dict of run statistics (sizes, character counts, vocab size).
    """
    print("=" * 60)
    print(f"🧠 PREPARING LARGE DATASET ({target_mb}MB) FOR KILLER TEST")
    print("=" * 60)

    os.makedirs(DATA_DIR, exist_ok=True)

    # 1) Collect the corpus.
    code_text = download_large_python_dataset(target_mb)

    actual_mb = len(code_text) / 1e6
    print(f"\n📊 Final Statistics:")
    print(f" Total characters: {len(code_text):,}")
    print(f" Actual size: {actual_mb:.2f} MB")

    # 2) Build and persist the vocabulary.
    print("\n🔤 Building vocabulary...")
    vocab = build_vocabulary(code_text)
    print(f" Vocab size: {vocab['vocab_size']}")

    meta_path = os.path.join(DATA_DIR, 'meta.pkl')
    with open(meta_path, 'wb') as f:
        pickle.dump(vocab, f)

    # 3) Train/validation split (contiguous split, no shuffling).
    print("\n✂️ Splitting train/validation...")
    split_idx = int(len(code_text) * TRAIN_SPLIT)
    train_text = code_text[:split_idx]
    val_text = code_text[split_idx:]

    print(f" Train: {len(train_text)/1e6:.2f} MB")
    print(f" Validation: {len(val_text)/1e6:.2f} MB")

    # 4) Encode both splits to disk. The original duplicated this
    # chunked encode-and-write loop verbatim for train and val; it now
    # lives in the _encode_to_file helper.
    print("\n💾 Encoding and saving (this may take a while)...")
    stoi = vocab['stoi']
    _encode_to_file(train_text, stoi, os.path.join(DATA_DIR, 'train.bin'), label="Train")
    _encode_to_file(val_text, stoi, os.path.join(DATA_DIR, 'val.bin'))

    # 5) Persist run statistics for downstream scripts.
    stats = {
        'target_mb': target_mb,
        'actual_mb': actual_mb,
        'train_chars': len(train_text),
        'val_chars': len(val_text),
        'vocab_size': vocab['vocab_size'],
    }
    with open(os.path.join(DATA_DIR, 'stats.pkl'), 'wb') as f:
        pickle.dump(stats, f)

    print("\n" + "=" * 60)
    print("✅ LARGE DATASET PREPARED!")
    print("=" * 60)
    print("\nNext step: python validation/memory/train_large.py --config medium")

    return stats
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # CLI entry point: --size selects the target corpus size in MB.
    cli = argparse.ArgumentParser(description='Prepares large dataset for Killer Test')
    cli.add_argument('--size', type=int, default=50, help='Size in MB (default: 50)')
    cli_args = cli.parse_args()
    prepare_large_dataset(cli_args.size)
|
|
|