augustocsc's picture
GPT-2 Medium trained on prefix dataset (682K)
3742716 verified
#!/bin/bash
# Validate Seriguela Training Setup
# This script validates that everything is configured correctly before training
# Usage: ./validate_setup.sh
# Abort on the first unhandled command failure.
set -e

# ANSI colour sequences used by the print_* helpers. They are stored as
# literal backslash escapes and only become control codes when a helper
# prints them with escape interpretation enabled.
NC='\033[0m'          # reset to the terminal default
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
# Print a passed check: a green check mark (U+2705) followed by the message.
# Globals:   GREEN, NC (read) - colour escapes, expanded by echo -e
# Arguments: $1 - message text (backslash escapes in it are interpreted too)
# Outputs:   one line on stdout
print_success() { echo -e "${GREEN}✅${NC} $1"; }
# Print a failed check: a red cross mark followed by the message.
# Globals:   RED, NC (read)
# Arguments: $1 - message text
# Outputs:   one line on stdout; %b expands backslash escapes the same way
#            echo -e does, in the colour codes and in the message alike.
print_error() {
  local message=$1
  printf '%b\n' "${RED}❌${NC} ${message}"
}
# Print a non-fatal finding: a yellow warning sign followed by the message.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text
# Outputs:   one line on stdout; %b expands backslash escapes the same way
#            echo -e does.
print_warning() {
  local message=$1
  printf '%b\n' "${YELLOW}⚠️${NC} ${message}"
}
# Print a section banner: an empty line, then the title between two runs of
# ten '=' characters, all in blue.
# Globals:   BLUE, NC (read)
# Arguments: $1 - section title
# Outputs:   two lines on stdout (the first one empty)
print_header() {
  local title=$1
  local bar='=========='
  printf '\n%b\n' "${BLUE}${bar} ${title} ${bar}${NC}"
}
# Running count of failed checks; each check reported through print_error
# adds one, and the final summary branches on it.
ERRORS=0

print_header "Seriguela Setup Validation"

# Pick the working directory for all relative paths used by later checks:
# the fixed deployment path wins, then a "seriguela" directory under the
# current one, otherwise stay where the script was started.
if [ ! -d "/home/ubuntu/seriguela" ]; then
  if [ -d "$(pwd)/seriguela" ]; then
    cd seriguela
  else
    cd .
  fi
else
  cd /home/ubuntu/seriguela
fi
print_header "1. Python Environment"

# Interpreter: python3 must be runnable; its version banner is reported.
if python3 --version &> /dev/null; then
  PYTHON_VERSION=$(python3 --version)
  print_success "Python installed: $PYTHON_VERSION"
else
  print_error "Python not found"
  ERRORS=$((ERRORS + 1))
fi

# Virtual environment: activate ./venv so the pip and package checks that
# follow run against it. The activate script is tested for (not just the
# directory) and sourced inside an `if`, because under `set -e` a failing
# `source` would otherwise abort the whole validation without a summary.
if [ -f "venv/bin/activate" ]; then
  # shellcheck disable=SC1091
  if source venv/bin/activate; then
    print_success "Virtual environment exists"
  else
    print_error "Virtual environment could not be activated"
    ERRORS=$((ERRORS + 1))
  fi
elif [ -d "venv" ]; then
  print_error "Virtual environment is incomplete (venv/bin/activate missing)"
  ERRORS=$((ERRORS + 1))
else
  print_error "Virtual environment not found"
  ERRORS=$((ERRORS + 1))
fi

# pip: report the version number (second field of `pip --version`).
if pip --version &> /dev/null; then
  PIP_VERSION=$(pip --version | cut -d' ' -f2)
  print_success "pip version: $PIP_VERSION"
else
  print_error "pip not found"
  ERRORS=$((ERRORS + 1))
fi
print_header "2. Python Packages"

# Packages that must be importable. Each entry is
# "<import name>:<human-readable name>".
PACKAGES=(
  "transformers:Hugging Face Transformers"
  "torch:PyTorch"
  "wandb:Weights & Biases"
  "peft:Parameter-Efficient Fine-Tuning"
  "datasets:Hugging Face Datasets"
)

for entry in "${PACKAGES[@]}"; do
  module=${entry%%:*}   # text before the first ':'
  label=${entry#*:}     # text after the first ':'

  # A package that cannot be imported counts as an error.
  if ! python3 -c "import $module" &> /dev/null; then
    print_error "$label ($module) not installed"
    ERRORS=$((ERRORS + 1))
    continue
  fi

  # Importable: report __version__, or "unknown" when the lookup fails.
  VERSION=$(python3 -c "import $module; print($module.__version__)" 2>/dev/null || echo "unknown")
  print_success "$label ($module) - version $VERSION"
done
# Wandb additionally gets a minimum-version check. A wandb that cannot be
# imported is treated as version 0.0.0. A too-old (or uncomparable) version
# only produces warnings; it does not add to ERRORS.
REQUIRED_VERSION="0.24.0"
WANDB_VERSION=$(python3 -c "import wandb; print(wandb.__version__)" 2>/dev/null || echo "0.0.0")

# The heredoc delimiter is unquoted on purpose: the shell substitutes both
# version strings into the Python source before python3 reads it.
if ! python3 << VERSIONCHECK
import sys
from packaging import version

installed = version.parse("$WANDB_VERSION")
minimum = version.parse("$REQUIRED_VERSION")
if installed >= minimum:
    sys.exit(0)
sys.exit(1)
VERSIONCHECK
then
  print_warning "Wandb version $WANDB_VERSION is older than recommended $REQUIRED_VERSION"
  print_warning "New API key format (wandb_v1_...) requires Wandb >= 0.24.0"
else
  print_success "Wandb version $WANDB_VERSION (>= $REQUIRED_VERSION required)"
fi
print_header "3. Environment Variables"

# Export every assignment found in an env file.
# Arguments: $1 - path to a file of KEY=value lines
# Each kept line is prefixed with "export " and the result is sourced into
# the current shell. Blank and whitespace-only lines are dropped together
# with comment lines: prefixed, they would become a bare `export`, which
# prints every exported variable (secrets included) to stdout.
load_env_file() {
  # shellcheck disable=SC1090
  source <(grep -Ev '^[[:space:]]*(#|$)' -- "$1" | sed 's/^/export /')
}

# Load .env from the current directory when present; its absence is only a
# warning because the variables may already be set in the environment.
if [ -f ".env" ]; then
  load_env_file .env
  print_success ".env file loaded"
else
  print_warning ".env file not found"
fi

# HF_TOKEN is optional. Only its length is printed, never the value.
if [ -n "$HF_TOKEN" ]; then
  TOKEN_LEN=${#HF_TOKEN}
  print_success "HF_TOKEN set ($TOKEN_LEN characters)"
else
  print_warning "HF_TOKEN not set (model won't be pushed to Hub)"
fi

# WANDB_API_KEY is mandatory: a missing key counts as an error.
if [ -n "$WANDB_API_KEY" ]; then
  KEY_LEN=${#WANDB_API_KEY}
  print_success "WANDB_API_KEY set ($KEY_LEN characters)"
else
  print_error "WANDB_API_KEY not set"
  ERRORS=$((ERRORS + 1))
fi
print_header "4. GPU / CUDA"

# Driver-level probe: nvidia-smi has to run successfully. On success the
# first line of the name and memory.total queries is reported.
if ! nvidia-smi &> /dev/null; then
  print_error "GPU not detected (nvidia-smi failed)"
  ERRORS=$((ERRORS + 1))
else
  GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)
  GPU_MEMORY=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader | head -1)
  print_success "GPU detected: $GPU_NAME ($GPU_MEMORY)"
fi

# Framework-level probe: PyTorch itself must report CUDA as available.
if ! python3 -c "import torch; assert torch.cuda.is_available()" &> /dev/null; then
  print_error "CUDA not available in PyTorch"
  ERRORS=$((ERRORS + 1))
else
  CUDA_VERSION=$(python3 -c "import torch; print(torch.version.cuda)")
  GPU_COUNT=$(python3 -c "import torch; print(torch.cuda.device_count())")
  print_success "CUDA available: version $CUDA_VERSION ($GPU_COUNT GPU(s))"
fi
print_header "5. Wandb Authentication"

# Attempt a real login with WANDB_API_KEY; skipped with a warning when no key
# is set. The key reaches Python through the process environment and the
# heredoc delimiter is quoted, so the secret is never pasted into the Python
# source (a quote or backslash in it would otherwise break the snippet or be
# executed as code).
if [ -n "$WANDB_API_KEY" ]; then
  if WANDB_API_KEY="$WANDB_API_KEY" python3 << 'WANDBCHECK'
import os
import sys

import wandb

try:
    result = wandb.login(key=os.environ["WANDB_API_KEY"], relogin=True)
    if result:
        print("Login successful")
        sys.exit(0)
    else:
        print("Login failed")
        sys.exit(1)
except Exception as e:
    print(f"Error: {e}")
    sys.exit(1)
WANDBCHECK
  then
    print_success "Wandb authentication successful"

    # Best-effort lookup of the account name; any failure yields "unknown"
    # and does not count as an error.
    WANDB_USER=$(python3 << 'GETUSER'
import wandb

try:
    api = wandb.Api()
    print(api.viewer.get("username", "unknown"))
except Exception:
    print("unknown")
GETUSER
    )
    print_success "Logged in as: $WANDB_USER"
  else
    print_error "Wandb authentication failed"
    ERRORS=$((ERRORS + 1))
  fi
else
  print_warning "Skipping Wandb auth (no API key)"
fi
print_header "6. HuggingFace Authentication"

# Verify HF_TOKEN by asking the Hub who it belongs to; skipped with a warning
# when no token is set. The token reaches Python through the process
# environment and the heredoc delimiter is quoted, so the secret is never
# pasted into the Python source (a quote or backslash in it would otherwise
# break the snippet or be executed as code).
if [ -n "$HF_TOKEN" ]; then
  if HF_TOKEN="$HF_TOKEN" python3 << 'HFCHECK'
import os
import sys

from huggingface_hub import HfApi

try:
    api = HfApi(token=os.environ["HF_TOKEN"])
    user = api.whoami()
    print(f"Login successful: {user.get('name', 'unknown')}")
    sys.exit(0)
except Exception as e:
    print(f"Error: {e}")
    sys.exit(1)
HFCHECK
  then
    print_success "HuggingFace authentication successful"
  else
    print_error "HuggingFace authentication failed"
    ERRORS=$((ERRORS + 1))
  fi
else
  print_warning "Skipping HF auth (no token)"
fi
print_header "7. Dataset Access"

# Try to open the training split of the dataset. A failure is reported as a
# warning only and does not add to ERRORS.
if ! python3 << 'DATASETCHECK'
import sys

from datasets import load_dataset

try:
    # streaming=True is requested so this is a quick probe -- presumably it
    # avoids downloading the full dataset (confirm against the datasets docs).
    load_dataset("augustocsc/sintetico_natural", split="train", streaming=True)
except Exception as err:
    print(f"Error: {err}")
    sys.exit(1)

print("Dataset accessible")
sys.exit(0)
DATASETCHECK
then
  print_warning "Could not verify dataset access (may require authentication)"
else
  print_success "Dataset accessible: augustocsc/sintetico_natural"
fi
print_header "8. Scripts"

# Files expected in the project tree, relative to the current directory.
# A missing file is reported as a warning and does not add to ERRORS.
SCRIPTS=(
  "scripts/train.py"
  "scripts/evaluate.py"
  "scripts/generate.py"
  "scripts/aws/monitor_training_auto.sh"
  "scripts/aws/analyze_model.sh"
)

for path in "${SCRIPTS[@]}"; do
  if [ ! -f "$path" ]; then
    print_warning "$path not found"
    continue
  fi
  print_success "$path exists"
done
# Final summary: draw a result box and exit 0 when no check failed,
# 1 otherwise. Warnings do not influence the exit status.
print_header "Validation Summary"
echo ""

# Box geometry: 38 cells between the vertical borders. The horizontal rule
# and the blank row are generated so their widths always agree. The text rows
# are padded by hand on the assumption that each emoji occupies two terminal
# cells -- alignment may drift slightly in terminals that render them
# differently.
box_rule=$(printf '═%.0s' {1..38})
box_blank=$(printf '%38s' '')

if [ "$ERRORS" -eq 0 ]; then
  echo -e "${GREEN}╔${box_rule}╗${NC}"
  echo -e "${GREEN}║${box_blank}║${NC}"
  echo -e "${GREEN}║   ✅ ALL VALIDATIONS PASSED ✅       ║${NC}"
  echo -e "${GREEN}║${box_blank}║${NC}"
  echo -e "${GREEN}║   Ready for training! 🚀             ║${NC}"
  echo -e "${GREEN}║${box_blank}║${NC}"
  echo -e "${GREEN}╚${box_rule}╝${NC}"
  echo ""
  echo "You can now run:"
  echo " python scripts/train.py --help"
  echo " bash scripts/aws/run_all_training.sh"
  echo ""
  exit 0
else
  # The error-count row is plain ASCII, so printf can pad it to the box width.
  printf -v error_row '   %-35s' "$ERRORS error(s) found"
  echo -e "${RED}╔${box_rule}╗${NC}"
  echo -e "${RED}║${box_blank}║${NC}"
  echo -e "${RED}║   ❌ VALIDATION FAILED ❌            ║${NC}"
  echo -e "${RED}║${box_blank}║${NC}"
  echo -e "${RED}║${error_row}║${NC}"
  echo -e "${RED}║${box_blank}║${NC}"
  echo -e "${RED}╚${box_rule}╝${NC}"
  echo ""
  echo "Please fix the errors above before training."
  echo ""
  exit 1
fi