File size: 8,171 Bytes
#!/bin/bash
# Validate Seriguela Training Setup
# This script validates that everything is configured correctly before training
# Usage: ./validate_setup.sh

# Abort on the first unhandled command failure (long spelling of `set -e`).
set -o errexit

# ANSI colour sequences. They are stored with a literal backslash (no escape
# byte yet); the code that prints them is responsible for interpreting them.
CSI='\033['
GREEN="${CSI}0;32m"
RED="${CSI}0;31m"
YELLOW="${CSI}1;33m"
BLUE="${CSI}0;34m"
NC="${CSI}0m"   # reset, i.e. "no colour"

# Print a green check mark followed by the message in $1 on stdout.
# Only the colour variables go through %b (escape expansion); the message is
# printed with %s so backslashes in it (paths, tool output) stay literal.
print_success() { printf '%b✅%b %s\n' "$GREEN" "$NC" "$1"; }
# Print a red cross followed by the message in $1 on stdout.
# Only the colour variables go through %b (escape expansion); the message is
# printed with %s so backslashes in it stay literal.
print_error() { printf '%b❌%b %s\n' "$RED" "$NC" "$1"; }
# Print a yellow warning sign, two spaces, then the message in $1 on stdout.
# Only the colour variables go through %b (escape expansion); the message is
# printed with %s so backslashes in it stay literal.
print_warning() { printf '%b⚠️%b  %s\n' "$YELLOW" "$NC" "$1"; }
# Print a blank line and then the section title in $1 framed by "=" rules, in blue.
# Only the colour variables go through %b (escape expansion); the title is
# printed with %s so backslashes in it stay literal.
print_header() { printf '\n%b========== %s ==========%b\n' "$BLUE" "$1" "$NC"; }

# Running tally of failed checks.
ERRORS=0

print_header "Seriguela Setup Validation"

# Choose the working directory: the fixed deployment path first, then a
# seriguela/ folder below the current directory, otherwise stay where we are.
PROJECT_DIR="."
if [ -d "/home/ubuntu/seriguela" ]; then
    PROJECT_DIR="/home/ubuntu/seriguela"
elif [ -d "$(pwd)/seriguela" ]; then
    PROJECT_DIR="seriguela"
fi
cd "$PROJECT_DIR"

print_header "1. Python Environment"

# Report the interpreter version, or record an error when python3 cannot run.
if ! python3 --version > /dev/null 2>&1; then
    print_error "Python not found"
    ERRORS=$((ERRORS + 1))
else
    PYTHON_VERSION="$(python3 --version)"
    print_success "Python installed: $PYTHON_VERSION"
fi

# Check venv
# A directory named venv is not enough: the activate script must exist too.
# Sourcing a missing file fails, and with `set -e` active that would end the
# whole run without reaching the summary, so that case is counted as an error.
if [ -f "venv/bin/activate" ]; then
    print_success "Virtual environment exists"
    # shellcheck disable=SC1091
    source venv/bin/activate
elif [ -d "venv" ]; then
    print_error "Virtual environment is incomplete (venv/bin/activate missing)"
    ERRORS=$((ERRORS + 1))
else
    print_error "Virtual environment not found"
    ERRORS=$((ERRORS + 1))
fi

# Report the pip version (second space-separated field of `pip --version`),
# or record an error when pip cannot run.
if ! pip --version > /dev/null 2>&1; then
    print_error "pip not found"
    ERRORS=$((ERRORS + 1))
else
    PIP_VERSION="$(pip --version | cut -d ' ' -f 2)"
    print_success "pip version: $PIP_VERSION"
fi

print_header "2. Python Packages"

# Critical packages, one "import_name:Human readable description" per entry.
PACKAGES=(
    "transformers:Hugging Face Transformers"
    "torch:PyTorch"
    "wandb:Weights & Biases"
    "peft:Parameter-Efficient Fine-Tuning"
    "datasets:Hugging Face Datasets"
)

for pkg_info in "${PACKAGES[@]}"; do
    pkg_name=${pkg_info%%:*}   # text before the first colon
    pkg_desc=${pkg_info#*:}    # everything after the first colon

    # A single interpreter start both proves that the import works and yields
    # the version, instead of importing every package twice.
    # A failed import makes python3 exit non-zero, which selects the else
    # branch; a module without __version__ is reported as "unknown".
    if VERSION=$(python3 -c "import $pkg_name; print(getattr($pkg_name, '__version__', 'unknown'))" 2>/dev/null); then
        print_success "$pkg_desc ($pkg_name) - version $VERSION"
    else
        print_error "$pkg_desc ($pkg_name) not installed"
        ERRORS=$((ERRORS + 1))
    fi
done

# Return 0 when dotted version $1 is greater than or equal to dotted version $2.
# Components are compared left to right as base-10 integers; missing
# components count as 0 and only the leading digits of a component are used
# (so "0rc1" compares as 0 and "post1" as 0). Done in the shell so the check
# does not depend on the Python "packaging" module being installed.
version_ge() {
    local -a have want
    local i a b
    IFS='.' read -r -a have <<< "$1"
    IFS='.' read -r -a want <<< "$2"
    for ((i = 0; i < ${#have[@]} || i < ${#want[@]}; i++)); do
        a=${have[i]:-0}
        b=${want[i]:-0}
        a=${a%%[!0-9]*}        # keep the leading digits only
        b=${b%%[!0-9]*}
        a=$((10#${a:-0}))      # 10# stops "08" from being parsed as octal
        b=$((10#${b:-0}))
        if ((a > b)); then
            return 0
        fi
        if ((a < b)); then
            return 1
        fi
    done
    return 0
}

# Check Wandb version specifically
# Falls back to 0.0.0 when wandb cannot be imported, which ends up in the
# warning branch below.
WANDB_VERSION=$(python3 -c "import wandb; print(wandb.__version__)" 2>/dev/null || echo "0.0.0")
REQUIRED_VERSION="0.24.0"

if version_ge "$WANDB_VERSION" "$REQUIRED_VERSION"; then
    print_success "Wandb version $WANDB_VERSION (>= $REQUIRED_VERSION required)"
else
    print_warning "Wandb version $WANDB_VERSION is older than recommended $REQUIRED_VERSION"
    print_warning "New API key format (wandb_v1_...) requires Wandb >= 0.24.0"
fi

print_header "3. Environment Variables"

# Source the file named by $1 with allexport switched on, so every variable it
# assigns is exported to child processes (python3 below). Comments, blank
# lines and lines that already start with "export" are handled by the shell
# itself. Prefixing each line with "export " instead would turn a blank line
# into a bare `export`, which prints every exported variable, secrets included.
# The file must contain valid shell assignments.
load_env_file() {
    set -a
    # shellcheck disable=SC1090
    source "$1"
    set +a
}

# Load .env if exists
if [ -f ".env" ]; then
    # "./" keeps `source` from searching PATH for a file called .env
    load_env_file ./.env
    print_success ".env file loaded"
else
    print_warning ".env file not found"
fi

# HF_TOKEN is optional: when it is empty only a warning is shown.
# Only the length of the token is printed, never its value.
if [ -z "$HF_TOKEN" ]; then
    print_warning "HF_TOKEN not set (model won't be pushed to Hub)"
else
    print_success "HF_TOKEN set (${#HF_TOKEN} characters)"
fi

# WANDB_API_KEY is mandatory: an empty value counts as an error.
# Only the length of the key is printed, never its value.
if [ -z "$WANDB_API_KEY" ]; then
    print_error "WANDB_API_KEY not set"
    ERRORS=$((ERRORS + 1))
else
    print_success "WANDB_API_KEY set (${#WANDB_API_KEY} characters)"
fi

print_header "4. GPU / CUDA"

# Probe the NVIDIA driver tool; on success show the name and total memory
# taken from the first line of each query.
if ! nvidia-smi > /dev/null 2>&1; then
    print_error "GPU not detected (nvidia-smi failed)"
    ERRORS=$((ERRORS + 1))
else
    GPU_NAME="$(nvidia-smi --query-gpu=name --format=csv,noheader | head -n 1)"
    GPU_MEMORY="$(nvidia-smi --query-gpu=memory.total --format=csv,noheader | head -n 1)"
    print_success "GPU detected: $GPU_NAME ($GPU_MEMORY)"
fi

# Ask PyTorch whether CUDA is usable; the assert makes python3 exit non-zero
# when torch is missing or reports no CUDA support.
if ! python3 -c "import torch; assert torch.cuda.is_available()" > /dev/null 2>&1; then
    print_error "CUDA not available in PyTorch"
    ERRORS=$((ERRORS + 1))
else
    CUDA_VERSION="$(python3 -c "import torch; print(torch.version.cuda)")"
    GPU_COUNT="$(python3 -c "import torch; print(torch.cuda.device_count())")"
    print_success "CUDA available: version $CUDA_VERSION ($GPU_COUNT GPU(s))"
fi

print_header "5. Wandb Authentication"

if [ -n "$WANDB_API_KEY" ]; then
    # The key is handed to Python through the environment and the heredoc
    # delimiter is quoted, so the secret is never pasted into Python source.
    # A key containing quotes, backslashes or "$" can therefore neither break
    # the snippet nor inject code into it.
    if WANDB_API_KEY="$WANDB_API_KEY" python3 << 'WANDBCHECK'
import os
import sys
import wandb
try:
    result = wandb.login(key=os.environ["WANDB_API_KEY"], relogin=True)
    if result:
        print("Login successful")
        sys.exit(0)
    else:
        print("Login failed")
        sys.exit(1)
except Exception as e:
    print(f"Error: {e}")
    sys.exit(1)
WANDBCHECK
    then
        print_success "Wandb authentication successful"

        # Get user info (best effort: any failure is reported as "unknown")
        WANDB_USER=$(python3 << 'GETUSER'
import wandb
try:
    viewer = wandb.Api().viewer
    # NOTE(review): viewer is presumably a dict in some wandb releases and an
    # object with a username attribute in others - both shapes are accepted.
    if isinstance(viewer, dict):
        name = viewer.get("username")
    else:
        name = getattr(viewer, "username", None)
    print(name or "unknown")
except Exception:
    print("unknown")
GETUSER
)
        print_success "Logged in as: $WANDB_USER"
    else
        print_error "Wandb authentication failed"
        ERRORS=$((ERRORS + 1))
    fi
else
    print_warning "Skipping Wandb auth (no API key)"
fi

print_header "6. HuggingFace Authentication"

if [ -n "$HF_TOKEN" ]; then
    # The token is handed to Python through the environment and the heredoc
    # delimiter is quoted, so the secret is never pasted into Python source.
    # A token containing quotes, backslashes or "$" can therefore neither
    # break the snippet nor inject code into it.
    if HF_TOKEN="$HF_TOKEN" python3 << 'HFCHECK'
import os
import sys
from huggingface_hub import HfApi
try:
    api = HfApi(token=os.environ["HF_TOKEN"])
    user = api.whoami()
    print(f"Login successful: {user.get('name', 'unknown')}")
    sys.exit(0)
except Exception as e:
    print(f"Error: {e}")
    sys.exit(1)
HFCHECK
    then
        print_success "HuggingFace authentication successful"
    else
        print_error "HuggingFace authentication failed"
        ERRORS=$((ERRORS + 1))
    fi
else
    print_warning "Skipping HF auth (no token)"
fi

print_header "7. Dataset Access"

# Open the train split in streaming mode. A failure only produces a warning
# and does not increase the error count. The heredoc contains no shell
# expansions, so its delimiter is quoted.
if ! python3 << 'DATASETCHECK'
from datasets import load_dataset
import sys
try:
    # Quick test load (just get info, don't download)
    ds = load_dataset("augustocsc/sintetico_natural", split="train", streaming=True)
    print("Dataset accessible")
    sys.exit(0)
except Exception as e:
    print(f"Error: {e}")
    sys.exit(1)
DATASETCHECK
then
    print_warning "Could not verify dataset access (may require authentication)"
else
    print_success "Dataset accessible: augustocsc/sintetico_natural"
fi

print_header "8. Scripts"

# Files looked up relative to the current directory; a missing one only
# produces a warning and does not increase the error count.
SCRIPTS=(
    "scripts/train.py"
    "scripts/evaluate.py"
    "scripts/generate.py"
    "scripts/aws/monitor_training_auto.sh"
    "scripts/aws/analyze_model.sh"
)

for script in "${SCRIPTS[@]}"; do
    if [ ! -f "$script" ]; then
        print_warning "$script not found"
        continue
    fi
    print_success "$script exists"
done

# Final summary
# Prints a green box and exits 0 when no check failed; otherwise prints a red
# box with the number of failed checks and exits 1.
print_header "Validation Summary"
echo ""

if [ "$ERRORS" -eq 0 ]; then
    echo -e "${GREEN}╔══════════════════════════════════════╗${NC}"
    echo -e "${GREEN}║                                      ║${NC}"
    echo -e "${GREEN}║    ✅ ALL VALIDATIONS PASSED ✅     ║${NC}"
    echo -e "${GREEN}║                                      ║${NC}"
    echo -e "${GREEN}║     Ready for training! 🚀           ║${NC}"
    echo -e "${GREEN}║                                      ║${NC}"
    echo -e "${GREEN}╚══════════════════════════════════════╝${NC}"
    echo ""
    echo "You can now run:"
    echo "  python scripts/train.py --help"
    echo "  bash scripts/aws/run_all_training.sh"
    echo ""
    exit 0
else
    # Interior of an empty box row; its length is the width the count row is
    # padded to, so the right border lines up for any number of digits.
    BOX_BLANK='                                      '
    echo -e "${RED}╔══════════════════════════════════════╗${NC}"
    echo -e "${RED}║                                      ║${NC}"
    echo -e "${RED}║    ❌ VALIDATION FAILED ❌           ║${NC}"
    echo -e "${RED}║                                      ║${NC}"
    printf '%b║%-*s║%b\n' "$RED" "${#BOX_BLANK}" "   $ERRORS error(s) found" "$NC"
    echo -e "${RED}║                                      ║${NC}"
    echo -e "${RED}╚══════════════════════════════════════╝${NC}"
    echo ""
    echo "Please fix the errors above before training."
    echo ""
    exit 1
fi