gpt2_medium_prefix_682k / scripts /aws /validate_setup.sh

GPT-2 Medium trained on prefix dataset (682K)

3742716 verified 4 days ago

8.17 kB

	#!/bin/bash
	# Validate Seriguela Training Setup
	# This script validates that everything is configured correctly before training
	# Usage: ./validate_setup.sh

	# Stop at the first command that fails outside of a condition.
	set -o errexit

	# ANSI colour sequences, kept as literal backslash text; they are turned
	# into real escape characters by whatever prints them.
	GREEN="\033[0;32m"
	RED="\033[0;31m"
	YELLOW="\033[1;33m"
	BLUE="\033[0;34m"
	NC="\033[0m" # reset attributes

	# Print "$1" to stdout prefixed by a green check mark.
	# %b expands the escape sequences stored in GREEN/NC; %s prints the message
	# verbatim, so backslashes in it (paths, tokens) are not reinterpreted.
	print_success() { printf '%b✅%b %s\n' "${GREEN}" "${NC}" "$1"; }
	# Print "$1" to stdout prefixed by a red cross mark.
	# %b expands the escape sequences stored in RED/NC; %s prints the message
	# verbatim, so backslashes in it are not reinterpreted.
	print_error() { printf '%b❌%b %s\n' "${RED}" "${NC}" "$1"; }
	# Print "$1" to stdout prefixed by a yellow warning sign.
	# %b expands the escape sequences stored in YELLOW/NC; %s prints the message
	# verbatim, so backslashes in it are not reinterpreted.
	print_warning() { printf '%b⚠️%b %s\n' "${YELLOW}" "${NC}" "$1"; }
	# Print a blank line followed by "$1" framed as a blue section banner.
	# %b expands the escape sequences stored in BLUE/NC; %s prints the title
	# verbatim, so backslashes in it are not reinterpreted.
	print_header() { printf '\n%b========== %s ==========%b\n' "${BLUE}" "$1" "${NC}"; }

	# Running count of failed checks; read by the final summary.
	ERRORS=0

	print_header "Seriguela Setup Validation"

	# Pick the project directory: the fixed path first, then a "seriguela"
	# sub-directory of the current directory, otherwise stay where we are.
	project_dir=.
	if [ -d "/home/ubuntu/seriguela" ]; then
	  project_dir=/home/ubuntu/seriguela
	elif [ -d "$PWD/seriguela" ]; then
	  project_dir=seriguela
	fi
	cd "$project_dir"

	print_header "1. Python Environment"

	# Probe for a python3 interpreter and report its version string.
	if ! python3 --version &> /dev/null; then
	  print_error "Python not found"
	  ERRORS=$(( ERRORS + 1 ))
	else
	  PYTHON_VERSION=$(python3 --version)
	  print_success "Python installed: $PYTHON_VERSION"
	fi

	# Check venv: require the activate script itself, not just the directory.
	# Sourcing a missing file returns non-zero, and because `set -e` is active
	# that would end the script here, before the validation summary is printed.
	if [ -f "venv/bin/activate" ]; then
	  print_success "Virtual environment exists"
	  # shellcheck disable=SC1091 -- the activate script only exists at runtime
	  source venv/bin/activate
	elif [ -d "venv" ]; then
	  print_error "Virtual environment is incomplete (venv/bin/activate missing)"
	  ERRORS=$((ERRORS + 1))
	else
	  print_error "Virtual environment not found"
	  ERRORS=$((ERRORS + 1))
	fi

	# Check pip and report its version.
	if pip --version &> /dev/null; then
	  # `pip --version` prints "pip <version> from ..."; field 2 is the version.
	  # The pipe must be a real `|`: an escaped `\|` is passed to pip as an
	  # argument and `cut` never runs.
	  PIP_VERSION=$(pip --version | cut -d' ' -f2)
	  print_success "pip version: $PIP_VERSION"
	else
	  print_error "pip not found"
	  ERRORS=$((ERRORS + 1))
	fi

	print_header "2. Python Packages"

	# Check critical packages. Each entry is "<import name>:<display name>".
	PACKAGES=(
	  "transformers:Hugging Face Transformers"
	  "torch:PyTorch"
	  "wandb:Weights & Biases"
	  "peft:Parameter-Efficient Fine-Tuning"
	  "datasets:Hugging Face Datasets"
	)

	for pkg_info in "${PACKAGES[@]}"; do
	  # Split at the first ':'; everything after it lands in pkg_desc.
	  IFS=':' read -r pkg_name pkg_desc <<< "$pkg_info"

	  if python3 -c "import $pkg_name" &> /dev/null; then
	    # The `|| echo` fallback keeps a module without __version__ from failing
	    # the assignment, which would end the script because `set -e` is active.
	    VERSION=$(python3 -c "import $pkg_name; print($pkg_name.__version__)" 2>/dev/null || echo "unknown")
	    print_success "$pkg_desc ($pkg_name) - version $VERSION"
	  else
	    print_error "$pkg_desc ($pkg_name) not installed"
	    ERRORS=$((ERRORS + 1))
	  fi
	done

	# Check Wandb version specifically. Falls back to 0.0.0 when wandb cannot be
	# imported so the comparison below still runs.
	WANDB_VERSION=$(python3 -c "import wandb; print(wandb.__version__)" 2>/dev/null || echo "0.0.0")
	REQUIRED_VERSION="0.24.0"

	# Both versions are passed as argv instead of being spliced into the Python
	# source. `<<-` strips the leading tabs, so the heredoc body and its
	# terminator are recognised even though this file indents with tabs.
	# A non-zero exit (older version, or `packaging` not importable) only warns.
	if python3 - "$WANDB_VERSION" "$REQUIRED_VERSION" <<-'VERSIONCHECK'
	import sys
	from packaging import version
	current = version.parse(sys.argv[1])
	required = version.parse(sys.argv[2])
	sys.exit(0 if current >= required else 1)
	VERSIONCHECK
	then
	  print_success "Wandb version $WANDB_VERSION (>= $REQUIRED_VERSION required)"
	else
	  print_warning "Wandb version $WANDB_VERSION is older than recommended $REQUIRED_VERSION"
	  print_warning "New API key format (wandb_v1_...) requires Wandb >= 0.24.0"
	fi

	print_header "3. Environment Variables"

	# Load .env if it exists: every remaining line is prefixed with `export` and
	# sourced into this shell.
	# Blank lines and comment lines (including indented ones) are dropped first;
	# otherwise they would become a bare `export`, which prints every exported
	# variable, secrets included, to stdout.
	if [ -f ".env" ]; then
	  source <(grep -Ev '^[[:space:]]*(#|$)' .env | sed 's/^/export /')
	  print_success ".env file loaded"
	else
	  print_warning ".env file not found"
	fi

	# HF_TOKEN: absence is only a warning. Only the length is printed, never
	# the value.
	if [ -z "$HF_TOKEN" ]; then
	  print_warning "HF_TOKEN not set (model won't be pushed to Hub)"
	else
	  TOKEN_LEN=${#HF_TOKEN}
	  print_success "HF_TOKEN set ($TOKEN_LEN characters)"
	fi

	# WANDB_API_KEY: absence counts as an error.
	if [ -z "$WANDB_API_KEY" ]; then
	  print_error "WANDB_API_KEY not set"
	  ERRORS=$(( ERRORS + 1 ))
	else
	  KEY_LEN=${#WANDB_API_KEY}
	  print_success "WANDB_API_KEY set ($KEY_LEN characters)"
	fi

	print_header "4. GPU / CUDA"

	# Check nvidia-smi and report the first GPU's name and total memory.
	# The pipes must be real `|`: an escaped `\|` is handed to nvidia-smi as an
	# argument and `head` never runs.
	if nvidia-smi &> /dev/null; then
	  GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -n 1)
	  GPU_MEMORY=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader | head -n 1)
	  print_success "GPU detected: $GPU_NAME ($GPU_MEMORY)"
	else
	  print_error "GPU not detected (nvidia-smi failed)"
	  ERRORS=$((ERRORS + 1))
	fi

	# Ask PyTorch whether CUDA is usable; on success report the CUDA version
	# and the device count it sees.
	if ! python3 -c "import torch; assert torch.cuda.is_available()" &> /dev/null; then
	  print_error "CUDA not available in PyTorch"
	  ERRORS=$(( ERRORS + 1 ))
	else
	  CUDA_VERSION=$(python3 -c "import torch; print(torch.version.cuda)")
	  GPU_COUNT=$(python3 -c "import torch; print(torch.cuda.device_count())")
	  print_success "CUDA available: version $CUDA_VERSION ($GPU_COUNT GPU(s))"
	fi

	print_header "5. Wandb Authentication"

	# Log in to Wandb with WANDB_API_KEY and, on success, report the user name.
	if [ -n "$WANDB_API_KEY" ]; then
	  # The key reaches Python through the environment rather than being spliced
	  # into the source, so quotes or backslashes in it cannot break or alter
	  # the script. `<<-` strips the leading tabs of the heredoc body and of its
	  # terminator; Python's own indentation uses spaces and is preserved.
	  if WANDB_API_KEY="$WANDB_API_KEY" python3 - <<-'WANDBCHECK'
	import os
	import sys
	import wandb
	try:
	    result = wandb.login(key=os.environ["WANDB_API_KEY"], relogin=True)
	    if result:
	        print("Login successful")
	        sys.exit(0)
	    else:
	        print("Login failed")
	        sys.exit(1)
	except Exception as e:
	    print(f"Error: {e}")
	    sys.exit(1)
	WANDBCHECK
	  then
	    print_success "Wandb authentication successful"

	    # Get user info. Any failure inside the lookup prints "unknown".
	    # NOTE(review): assumes api.viewer supports .get() -- confirm against the
	    # installed wandb version.
	    WANDB_USER=$(python3 - <<-'GETUSER'
	import wandb
	try:
	    api = wandb.Api()
	    print(api.viewer.get("username", "unknown"))
	except Exception:
	    print("unknown")
	GETUSER
	    )
	    print_success "Logged in as: $WANDB_USER"
	  else
	    print_error "Wandb authentication failed"
	    ERRORS=$((ERRORS + 1))
	  fi
	else
	  print_warning "Skipping Wandb auth (no API key)"
	fi

	print_header "6. HuggingFace Authentication"

	# Verify HF_TOKEN by asking the Hub who it belongs to.
	if [ -n "$HF_TOKEN" ]; then
	  # The token reaches Python through the environment rather than being
	  # spliced into the source. `<<-` strips the leading tabs of the heredoc
	  # body and of its terminator; Python's indentation uses spaces.
	  if HF_TOKEN="$HF_TOKEN" python3 - <<-'HFCHECK'
	import os
	import sys
	from huggingface_hub import HfApi
	try:
	    api = HfApi(token=os.environ["HF_TOKEN"])
	    user = api.whoami()
	    print(f"Login successful: {user.get('name', 'unknown')}")
	    sys.exit(0)
	except Exception as e:
	    print(f"Error: {e}")
	    sys.exit(1)
	HFCHECK
	  then
	    print_success "HuggingFace authentication successful"
	  else
	    print_error "HuggingFace authentication failed"
	    ERRORS=$((ERRORS + 1))
	  fi
	else
	  print_warning "Skipping HF auth (no token)"
	fi

	print_header "7. Dataset Access"

	# Test dataset loading in streaming mode. Failure is only a warning and
	# does not increment ERRORS.
	# `<<-` strips the leading tabs of the heredoc body and of its terminator;
	# the quoted delimiter keeps the shell from expanding anything inside.
	if python3 - <<-'DATASETCHECK'
	import sys
	from datasets import load_dataset
	try:
	    # Quick test load (just get info, don't download)
	    ds = load_dataset("augustocsc/sintetico_natural", split="train", streaming=True)
	    print("Dataset accessible")
	    sys.exit(0)
	except Exception as e:
	    print(f"Error: {e}")
	    sys.exit(1)
	DATASETCHECK
	then
	  print_success "Dataset accessible: augustocsc/sintetico_natural"
	else
	  print_warning "Could not verify dataset access (may require authentication)"
	fi

	print_header "8. Scripts"

	# Files expected relative to the current directory; a missing one only
	# produces a warning.
	SCRIPTS=(
	  "scripts/train.py"
	  "scripts/evaluate.py"
	  "scripts/generate.py"
	  "scripts/aws/monitor_training_auto.sh"
	  "scripts/aws/analyze_model.sh"
	)

	for script in "${SCRIPTS[@]}"; do
	  if [ ! -f "$script" ]; then
	    print_warning "$script not found"
	    continue
	  fi
	  print_success "$script exists"
	done

	# Final summary: exit 0 when no check incremented ERRORS, exit 1 otherwise.
	print_header "Validation Summary"
	echo ""

	# ERRORS is quoted so the test always receives it as exactly one operand,
	# even if the value is empty or contains whitespace.
	if [ "$ERRORS" -eq 0 ]; then
	  echo -e "${GREEN}╔══════════════════════════════════════╗${NC}"
	  echo -e "${GREEN}║ ║${NC}"
	  echo -e "${GREEN}║ ✅ ALL VALIDATIONS PASSED ✅ ║${NC}"
	  echo -e "${GREEN}║ ║${NC}"
	  echo -e "${GREEN}║ Ready for training! 🚀 ║${NC}"
	  echo -e "${GREEN}║ ║${NC}"
	  echo -e "${GREEN}╚══════════════════════════════════════╝${NC}"
	  echo ""
	  echo "You can now run:"
	  echo " python scripts/train.py --help"
	  echo " bash scripts/aws/run_all_training.sh"
	  echo ""
	  exit 0
	else
	  echo -e "${RED}╔══════════════════════════════════════╗${NC}"
	  echo -e "${RED}║ ║${NC}"
	  echo -e "${RED}║ ❌ VALIDATION FAILED ❌ ║${NC}"
	  echo -e "${RED}║ ║${NC}"
	  echo -e "${RED}║ $ERRORS error(s) found ║${NC}"
	  echo -e "${RED}║ ║${NC}"
	  echo -e "${RED}╚══════════════════════════════════════╝${NC}"
	  echo ""
	  echo "Please fix the errors above before training."
	  echo ""
	  exit 1
	fi