#!/bin/bash
#
# Seriguela setup validation.
#
# Runs a sequence of checks (Python environment, Python packages, environment
# variables, GPU/CUDA, Wandb and HuggingFace authentication, dataset access,
# helper scripts), prints one coloured line per check, and exits 0 when no
# required check failed or 1 otherwise.

# Abort on any unhandled command failure. Checks that are allowed to fail are
# always run as an `if` condition, where -e does not apply.
set -e

# ANSI colour escape sequences. They are stored unexpanded (single quotes), so
# they only take effect when printed through `echo -e` or `printf '%b'`.
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # reset ("no colour")

# Print a success line: a check-mark emoji wrapped in the GREEN/NC colour
# globals, followed by the message.
# Arguments: $1 - message text (echo -e also expands backslash escapes in it)
# Outputs:   one line on stdout
print_success() {
  echo -e "${GREEN}✅${NC} $1"
}

# Print an error line: a cross-mark emoji wrapped in the RED/NC colour
# globals, followed by the message. Does not touch the ERRORS counter; callers
# increment it themselves.
# Arguments: $1 - message text (echo -e also expands backslash escapes in it)
# Outputs:   one line on stdout
print_error() {
  echo -e "${RED}❌${NC} $1"
}

# Print a warning line: a warning-sign emoji wrapped in the YELLOW/NC colour
# globals, followed by the message.
# Arguments: $1 - message text (echo -e also expands backslash escapes in it)
# Outputs:   one line on stdout
print_warning() {
  echo -e "${YELLOW}⚠️${NC} $1"
}

# Print a section banner: an empty line, then the title between two runs of
# ten '=' characters, wrapped in the BLUE/NC colour globals.
# Arguments: $1 - section title
# Outputs:   two lines on stdout (the first one empty)
print_header() {
  echo -e "\n${BLUE}========== $1 ==========${NC}"
}

# Number of failed required checks. Each failing check increments it and the
# final summary uses it to choose the exit status.
ERRORS=0

print_header "Seriguela Setup Validation"

# Run the checks from the project checkout when one can be found: first the
# fixed /home/ubuntu/seriguela location, then a "seriguela" directory below the
# current directory. When neither exists the current directory is used as-is.
for project_dir in /home/ubuntu/seriguela "$PWD/seriguela"; do
  if [ -d "$project_dir" ]; then
    cd "$project_dir" || {
      print_error "Cannot enter $project_dir"
      exit 1
    }
    break
  fi
done

# Section 1: python3 interpreter, ./venv virtual environment, and pip.
print_header "1. Python Environment"

# Interpreter
if python3 --version &> /dev/null; then
  PYTHON_VERSION=$(python3 --version)
  print_success "Python installed: $PYTHON_VERSION"
else
  print_error "Python not found"
  ERRORS=$((ERRORS + 1))
fi

# Virtual environment. Activation is run as an `if` condition so that a
# missing or broken activate script is counted as an error instead of making
# `set -e` abort the whole validation before the summary is printed.
if [ -d "venv" ]; then
  print_success "Virtual environment exists"
  # shellcheck source=/dev/null
  if ! source venv/bin/activate; then
    print_error "Virtual environment could not be activated"
    ERRORS=$((ERRORS + 1))
  fi
else
  print_error "Virtual environment not found"
  ERRORS=$((ERRORS + 1))
fi

# pip: the version is the second space-separated field of `pip --version`.
if pip --version &> /dev/null; then
  PIP_VERSION=$(pip --version | cut -d' ' -f2)
  print_success "pip version: $PIP_VERSION"
else
  print_error "pip not found"
  ERRORS=$((ERRORS + 1))
fi

# Section 2: every required Python package must be importable.
print_header "2. Python Packages"

# One "import_name:human readable description" entry per required package.
PACKAGES=(
  "transformers:Hugging Face Transformers"
  "torch:PyTorch"
  "wandb:Weights & Biases"
  "peft:Parameter-Efficient Fine-Tuning"
  "datasets:Hugging Face Datasets"
)

for pkg_info in "${PACKAGES[@]}"; do
  pkg_name=${pkg_info%%:*} # text before the first ':'
  pkg_desc=${pkg_info#*:}  # text after the first ':'

  if python3 -c "import $pkg_name" &> /dev/null; then
    # Packages without a __version__ attribute are reported as "unknown".
    VERSION=$(python3 -c "import $pkg_name; print($pkg_name.__version__)" 2>/dev/null || echo "unknown")
    print_success "$pkg_desc ($pkg_name) - version $VERSION"
  else
    print_error "$pkg_desc ($pkg_name) not installed"
    ERRORS=$((ERRORS + 1))
  fi
done

# Minimum-version check for wandb. This only warns; it never increments ERRORS.
# WANDB_VERSION falls back to "0.0.0" when wandb cannot be imported.
WANDB_VERSION=$(python3 -c "import wandb; print(wandb.__version__)" 2>/dev/null || echo "0.0.0")
REQUIRED_VERSION="0.24.0"

# The comparison is done by Python's `packaging` module. Both versions are
# passed as arguments (the heredoc delimiter is quoted, so the shell expands
# nothing inside the Python source).
# Exit status: 0 = new enough, 1 = too old, anything else = could not compare
# (for example `packaging` is not importable or a version string is invalid).
version_rc=0
python3 - "$WANDB_VERSION" "$REQUIRED_VERSION" << 'VERSIONCHECK' || version_rc=$?
import sys

try:
    from packaging import version

    current = version.parse(sys.argv[1])
    required = version.parse(sys.argv[2])
except Exception:
    sys.exit(2)

sys.exit(0 if current >= required else 1)
VERSIONCHECK

case "$version_rc" in
  0)
    print_success "Wandb version $WANDB_VERSION (>= $REQUIRED_VERSION required)"
    ;;
  1)
    print_warning "Wandb version $WANDB_VERSION is older than recommended $REQUIRED_VERSION"
    print_warning "New API key format (wandb_v1_...) requires Wandb >= 0.24.0"
    ;;
  *)
    print_warning "Could not compare Wandb version $WANDB_VERSION with $REQUIRED_VERSION (is the 'packaging' module available?)"
    ;;
esac

# Section 3: load .env (when present) and check the two credentials.
print_header "3. Environment Variables"

# Export every assignment found in .env. Blank lines and comment lines
# (optionally indented) are dropped first: a blank line would otherwise turn
# into a bare `export`, which prints every exported variable, secrets included.
if [ -f ".env" ]; then
  # shellcheck source=/dev/null
  source <(grep -Ev '^[[:space:]]*(#|$)' .env | sed 's/^/export /')
  print_success ".env file loaded"
else
  print_warning ".env file not found"
fi

# HF_TOKEN is optional: only its length is printed, never its value.
if [ -n "${HF_TOKEN:-}" ]; then
  TOKEN_LEN=${#HF_TOKEN}
  print_success "HF_TOKEN set ($TOKEN_LEN characters)"
else
  print_warning "HF_TOKEN not set (model won't be pushed to Hub)"
fi

# WANDB_API_KEY is required: a missing key counts as an error.
if [ -n "${WANDB_API_KEY:-}" ]; then
  KEY_LEN=${#WANDB_API_KEY}
  print_success "WANDB_API_KEY set ($KEY_LEN characters)"
else
  print_error "WANDB_API_KEY not set"
  ERRORS=$((ERRORS + 1))
fi

# Section 4: GPU visibility through nvidia-smi and through PyTorch.
print_header "4. GPU / CUDA"

# Driver level: report name and total memory of the first GPU listed.
if nvidia-smi &> /dev/null; then
  GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -1)
  GPU_MEMORY=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader | head -1)
  print_success "GPU detected: $GPU_NAME ($GPU_MEMORY)"
else
  print_error "GPU not detected (nvidia-smi failed)"
  ERRORS=$((ERRORS + 1))
fi

# Framework level: the assert makes python3 exit non-zero when torch is
# missing or torch.cuda.is_available() is false.
if python3 -c "import torch; assert torch.cuda.is_available()" &> /dev/null; then
  CUDA_VERSION=$(python3 -c "import torch; print(torch.version.cuda)")
  GPU_COUNT=$(python3 -c "import torch; print(torch.cuda.device_count())")
  print_success "CUDA available: version $CUDA_VERSION ($GPU_COUNT GPU(s))"
else
  print_error "CUDA not available in PyTorch"
  ERRORS=$((ERRORS + 1))
fi

# Section 5: log in to Wandb with WANDB_API_KEY and report the account name.
print_header "5. Wandb Authentication"

if [ -n "${WANDB_API_KEY:-}" ]; then
  # The key is handed to Python through the environment and the heredoc
  # delimiter is quoted, so the secret is never pasted into Python source
  # (a key containing quotes or backslashes could otherwise break or alter
  # the program).
  if WANDB_API_KEY="$WANDB_API_KEY" python3 - << 'WANDBCHECK'
import os
import sys

import wandb

try:
    result = wandb.login(key=os.environ["WANDB_API_KEY"], relogin=True)
    if result:
        print("Login successful")
        sys.exit(0)
    else:
        print("Login failed")
        sys.exit(1)
except Exception as e:
    print(f"Error: {e}")
    sys.exit(1)
WANDBCHECK
  then
    print_success "Wandb authentication successful"

    # Best-effort lookup of the account name; any failure yields "unknown".
    # NOTE(review): api.viewer may be a dict or an attribute-style object
    # depending on the wandb version - both shapes are handled; confirm.
    WANDB_USER=$(python3 - << 'GETUSER'
import wandb

try:
    viewer = wandb.Api().viewer
    name = getattr(viewer, "username", None)
    if name is None and hasattr(viewer, "get"):
        name = viewer.get("username")
    print(name or "unknown")
except Exception:
    print("unknown")
GETUSER
    ) || WANDB_USER="unknown"
    print_success "Logged in as: $WANDB_USER"
  else
    print_error "Wandb authentication failed"
    ERRORS=$((ERRORS + 1))
  fi
else
  print_warning "Skipping Wandb auth (no API key)"
fi

# Section 6: verify HF_TOKEN by asking the HuggingFace Hub who the token
# belongs to. Skipped with a warning when no token is set.
print_header "6. HuggingFace Authentication"

if [ -n "${HF_TOKEN:-}" ]; then
  # The token is handed to Python through the environment and the heredoc
  # delimiter is quoted, so the secret is never pasted into Python source.
  if HF_TOKEN="$HF_TOKEN" python3 - << 'HFCHECK'
import os
import sys

from huggingface_hub import HfApi

try:
    api = HfApi(token=os.environ["HF_TOKEN"])
    user = api.whoami()
    print(f"Login successful: {user.get('name', 'unknown')}")
    sys.exit(0)
except Exception as e:
    print(f"Error: {e}")
    sys.exit(1)
HFCHECK
  then
    print_success "HuggingFace authentication successful"
  else
    print_error "HuggingFace authentication failed"
    ERRORS=$((ERRORS + 1))
  fi
else
  print_warning "Skipping HF auth (no token)"
fi

# Section 7: try to open the training dataset. A failure is only a warning
# and does not increment ERRORS.
print_header "7. Dataset Access"

# Quoted delimiter: the Python source is passed through verbatim.
if python3 << 'DATASETCHECK'
import sys

from datasets import load_dataset

try:
    # Quick test load (just get info, don't download)
    ds = load_dataset("augustocsc/sintetico_natural", split="train", streaming=True)
    print("Dataset accessible")
    sys.exit(0)
except Exception as e:
    print(f"Error: {e}")
    sys.exit(1)
DATASETCHECK
then
  print_success "Dataset accessible: augustocsc/sintetico_natural"
else
  print_warning "Could not verify dataset access (may require authentication)"
fi

# Section 8: presence of the project scripts, relative to the current
# directory. A missing script is only a warning.
print_header "8. Scripts"

SCRIPTS=(
  "scripts/train.py"
  "scripts/evaluate.py"
  "scripts/generate.py"
  "scripts/aws/monitor_training_auto.sh"
  "scripts/aws/analyze_model.sh"
)

for script in "${SCRIPTS[@]}"; do
  if [ -f "$script" ]; then
    print_success "$script exists"
  else
    print_warning "$script not found"
  fi
done

# Final summary: print a 40-column banner and exit 0 when ERRORS is zero,
# otherwise report the error count and exit 1.
print_header "Validation Summary"
echo ""

# Banner pieces are generated so the border and the blank rows always have the
# same inner width (38 columns). The text rows are padded by hand assuming
# each emoji occupies two terminal columns.
box_rule=$(printf '═%.0s' {1..38})
box_blank=$(printf '%38s' '')

if [ "$ERRORS" -eq 0 ]; then
  echo -e "${GREEN}╔${box_rule}╗${NC}"
  echo -e "${GREEN}║${box_blank}║${NC}"
  echo -e "${GREEN}║     ✅ ALL VALIDATIONS PASSED ✅     ║${NC}"
  echo -e "${GREEN}║${box_blank}║${NC}"
  echo -e "${GREEN}║        Ready for training! 🚀        ║${NC}"
  echo -e "${GREEN}║${box_blank}║${NC}"
  echo -e "${GREEN}╚${box_rule}╝${NC}"
  echo ""
  echo "You can now run:"
  echo " python scripts/train.py --help"
  echo " bash scripts/aws/run_all_training.sh"
  echo ""
  exit 0
else
  echo -e "${RED}╔${box_rule}╗${NC}"
  echo -e "${RED}║${box_blank}║${NC}"
  echo -e "${RED}║       ❌ VALIDATION FAILED ❌        ║${NC}"
  echo -e "${RED}║${box_blank}║${NC}"
  # Left-justified in a fixed-width field so the right border stays aligned
  # whatever the number of digits in the count.
  printf '%b║  %-36s║%b\n' "$RED" "$ERRORS error(s) found" "$NC"
  echo -e "${RED}║${box_blank}║${NC}"
  echo -e "${RED}╚${box_rule}╝${NC}"
  echo ""
  echo "Please fix the errors above before training."
  echo ""
  exit 1
fi