File size: 4,901 Bytes
a683148 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 |
# MAP-NEO Mini Configuration and Setup
# Configuration files and helper scripts
import json
from pathlib import Path
# Training hyperparameters, tuned for an RTX 5070 with 8 GB of VRAM.
# Serialized as-is to configs/training_config.json by save_configs().
TRAINING_CONFIG = {
    # Model architecture.
    "model": {
        "vocab_size": 50257,
        "max_seq_len": 2048,
        "dim": 1024,
        "n_layers": 16,
        "n_heads": 16,
        "hidden_dim": 2736,
        "dropout": 0.0,
    },
    # Optimization schedule and memory-saving switches.
    # NOTE(review): presumably the effective batch is
    # batch_size * gradient_accumulation_steps = 32 sequences -- confirm
    # against the training loop.
    "training": {
        "batch_size": 1,
        "gradient_accumulation_steps": 32,
        "max_steps": 50000,
        "warmup_steps": 2000,
        "learning_rate": 3e-4,
        "weight_decay": 0.01,
        "grad_clip": 1.0,
        "mixed_precision": "bf16",
        "gradient_checkpointing": True,
    },
    # Training data; note seq_length (1024) is half of model.max_seq_len (2048).
    "data": {
        "seq_length": 1024,
        "data_path": "data/tokens/packed_1024.txt",
    },
    # Execution target.
    "hardware": {
        "device": "cuda",
        "compile_model": False,
    },
    # Reporting and checkpoint cadence (in steps) and checkpoint location.
    "logging": {
        "log_interval": 10,
        "save_interval": 2000,
        "output_dir": "checkpoints",
    },
}
# Settings for the data preprocessing stage.
# Serialized as-is to configs/data_config.json by save_configs().
DATA_CONFIG = {
    # Initial corpus size: 20k documents.
    "num_docs": 20000,
    "seq_length": 1024,
    # Placeholder tokenizer; to be replaced by the MAP-NEO tokenizer later.
    "tokenizer": "gpt2",
    "output_dir": "data",
    # Length bounds used to drop very short and very long texts.
    "min_text_length": 50,
    "max_text_length": 10000,
}
def setup_project():
    """Build the project's folder layout under the current working directory.

    Every folder is created together with any missing parents; folders that
    already exist are left untouched. One confirmation line is printed per
    folder, whether or not it had to be created.
    """
    layout = (
        "data/shards",
        "data/processed",
        "data/tokens",
        "checkpoints",
        "configs",
        "logs",
        "notebooks",
    )

    for folder in layout:
        target = Path(folder)
        target.mkdir(parents=True, exist_ok=True)
        print(f"Created directory: {folder}")
def save_configs():
    """Write the training and data configurations to JSON files in ``configs/``.

    Serializes the module-level ``TRAINING_CONFIG`` and ``DATA_CONFIG``
    dictionaries to ``configs/training_config.json`` and
    ``configs/data_config.json`` with a 2-space indent, overwriting any
    existing files. Paths are relative to the current working directory.

    The ``configs`` directory is created if it is missing, so this function
    no longer fails with ``FileNotFoundError`` when it is called before
    ``setup_project()``.
    """
    config_dir = Path("configs")
    config_dir.mkdir(parents=True, exist_ok=True)

    targets = (
        ("training_config.json", TRAINING_CONFIG),
        ("data_config.json", DATA_CONFIG),
    )
    for filename, config in targets:
        # Explicit encoding keeps the output independent of the platform default.
        with open(config_dir / filename, "w", encoding="utf-8") as f:
            json.dump(config, f, indent=2)

    print("Configuration files saved to configs/")
def create_requirements_txt():
    """Write ``requirements.txt`` with the project's pinned minimum versions.

    The file is created in the current working directory (overwriting any
    existing one) with one requirement specifier per line. Every line,
    including the last, is newline-terminated so that later appends
    (e.g. ``echo pkg >> requirements.txt``) do not merge into the final entry.
    """
    requirements = [
        "torch>=2.0.0",
        "transformers>=4.35.0",
        "tokenizers>=0.14.0",
        "datasets>=2.14.0",
        "accelerate>=0.24.0",
        "sentencepiece>=0.1.99",
        "langdetect>=1.0.9",
        "zstandard>=0.21.0",
        "tqdm>=4.65.0",
        "numpy>=1.24.0",
        "matplotlib>=3.6.0",
        "tensorboard>=2.14.0",
    ]

    with open("requirements.txt", "w", encoding="utf-8") as f:
        # Terminate the last line too; a bare join leaves it unterminated.
        f.write("\n".join(requirements) + "\n")

    print("Created requirements.txt")
def create_run_script():
    """Generate ``run_training.py``, a small driver for the training pipeline.

    The generated script is written to the current working directory
    (overwriting any existing file). When run, it:

    1. runs ``data_prep.py`` unless ``data/tokens/packed_1024.txt`` already
       exists, then
    2. runs ``train_neo.py``,

    and exits with status 1 as soon as either step fails.

    Child processes are launched with the same interpreter that runs the
    generated script (``sys.executable``) via an argument list, without a
    shell. Their output is inherited rather than captured, so long-running
    training prints progress live instead of only after it finishes.
    """
    # NOTE: this is a regular (non-raw) string, so "\\n" below becomes the
    # two-character escape "\n" in the generated file.
    run_script = '''#!/usr/bin/env python3
# Run MAP-NEO Mini training pipeline
import subprocess
import sys
from pathlib import Path


def run_command(cmd, description):
    """Run a command (argument list) and exit with status 1 if it fails"""
    print(f"\\n{'='*50}")
    print(f"Running: {description}")
    print(f"Command: {' '.join(cmd)}")
    print(f"{'='*50}", flush=True)

    # Output is inherited, not captured, so progress is shown as it happens.
    result = subprocess.run(cmd)

    if result.returncode != 0:
        print(f"Error in {description} (exit code {result.returncode})")
        sys.exit(1)
    print(f"Success: {description}")


def main():
    print("MAP-NEO Mini Training Pipeline")
    print("Optimized for RTX 5070 8GB VRAM")

    # Step 1: Data preprocessing
    if not Path("data/tokens/packed_1024.txt").exists():
        print("\\nStep 1: Data preprocessing")
        run_command(
            [
                sys.executable,
                "data_prep.py",
                "--num_docs",
                "20000",
                "--seq_length",
                "1024",
            ],
            "Data preprocessing"
        )
    else:
        print("\\nSkipping data preprocessing (data exists)")

    # Step 2: Model training
    print("\\nStep 2: Starting model training")
    run_command(
        [sys.executable, "train_neo.py"],
        "Model training"
    )

    print("\\n" + "="*50)
    print("Training pipeline completed!")
    print("Check checkpoints/ directory for saved models")
    print("="*50)


if __name__ == "__main__":
    main()
'''

    with open("run_training.py", "w", encoding="utf-8") as f:
        f.write(run_script)

    print("Created run_training.py script")
if __name__ == "__main__":
    # Script entry point: scaffold the directories, then write the config
    # files, requirements.txt and the run_training.py driver into the
    # current working directory.
    print("Setting up MAP-NEO Mini project...")
    setup_project()
    save_configs()
    create_requirements_txt()
    create_run_script()

    print("\nProject setup complete!")
    print("\nNext steps:")
    # Read the document count from DATA_CONFIG so this hint agrees with the
    # saved data config and the generated run script, not a separate number.
    print(f"1. Run: python data_prep.py --num_docs {DATA_CONFIG['num_docs']}")
    print("2. Run: python train_neo.py")
    print("3. Or use: python run_training.py")