#!/bin/bash
# =============================================================================
# Headless GRPO Training Entrypoint Script
# =============================================================================
# This script orchestrates the training lifecycle:
#   1. Validate environment (secrets, GPU)
#   2. Authenticate with HuggingFace and W&B
#   3. Validate dataset (optionally download from DATASET_URL)
#   4. Run training
#   5. Cleanup and self-terminate pod (if RUNPOD_POD_ID is set)
#
# Required env:  HF_TOKEN, WANDB_API_KEY
# Optional env:  DATASET_PATH, DATASET_URL, MODEL_NAME, MAX_STEPS, BATCH_SIZE,
#                GRADIENT_ACCUMULATION, LEARNING_RATE, REWARD_MODE, HF_REPO,
#                CHECKPOINT_DIR, LORA_OUTPUT, OUTPUT_DIR, RUNPOD_POD_ID
#
# Exit code: the training process's exit code (0 on success).

# Exit on any error, on unset variables (optional vars are probed with ${VAR:-}),
# and on failures anywhere in a pipeline.
set -euo pipefail

echo "=============================================================="
echo "Marxist-GRPO Headless Training"
echo "=============================================================="
echo "Start time: $(date -Iseconds)"
echo ""

# =============================================================================
# 1. ENVIRONMENT VALIDATION
# =============================================================================
echo "[1/5] Validating environment..."

# Check required secrets
if [ -z "${HF_TOKEN:-}" ]; then
  echo "ERROR: HF_TOKEN environment variable is required" >&2
  exit 1
fi

if [ -z "${WANDB_API_KEY:-}" ]; then
  echo "ERROR: WANDB_API_KEY environment variable is required" >&2
  exit 1
fi

# Check GPU availability
python -c "import torch; assert torch.cuda.is_available(), 'CUDA not available'" || {
  echo "ERROR: CUDA/GPU not available" >&2
  exit 1
}

echo " - HF_TOKEN: [set]"
echo " - WANDB_API_KEY: [set]"
echo " - GPU: $(python -c 'import torch; print(torch.cuda.get_device_name())')"
# NOTE: the -c string must be double-quoted here; escaping the f-string quotes
# inside single quotes (f\"...\") hands Python a literal backslash, which is a
# SyntaxError and made this line fail on every run.
echo " - VRAM: $(python -c "import torch; print(f'{torch.cuda.get_device_properties(0).total_memory/1e9:.1f} GB')")"
echo ""

# =============================================================================
# 2. AUTHENTICATION
# =============================================================================
echo "[2/5] Authenticating with services..."

# Login to HuggingFace.
# Non-fatal by design (the trainer can also read HF_TOKEN from the env), but
# warn instead of silently swallowing the failure so it is visible in pod logs.
echo " - HuggingFace Hub..."
huggingface-cli login --token "$HF_TOKEN" --add-to-git-credential 2>/dev/null \
  || echo "Warning: huggingface-cli login failed; relying on HF_TOKEN env" >&2

# Login to Weights & Biases (same non-fatal policy as above).
echo " - Weights & Biases..."
wandb login "$WANDB_API_KEY" 2>/dev/null \
  || echo "Warning: wandb login failed; relying on WANDB_API_KEY env" >&2
echo ""

# =============================================================================
# 3. DATA VALIDATION
# =============================================================================
echo "[3/5] Validating dataset..."

DATASET_PATH="${DATASET_PATH:-/workspace/dataset.jsonl}"

if [ ! -f "$DATASET_PATH" ]; then
  if [ -n "${DATASET_URL:-}" ]; then
    # Only log an ERROR if the fallback also fails; the original logged ERROR
    # before even attempting the download, which was misleading on success.
    echo "Dataset not found at $DATASET_PATH - downloading from DATASET_URL..."
    # Download to a temp name so a partial/failed download is never left at
    # the final path and mistaken for a valid dataset on a later run.
    if ! wget -O "${DATASET_PATH}.tmp" "$DATASET_URL"; then
      rm -f -- "${DATASET_PATH}.tmp"
      echo "ERROR: Failed to download dataset" >&2
      exit 1
    fi
    mv -- "${DATASET_PATH}.tmp" "$DATASET_PATH"
  else
    echo "ERROR: Dataset not found at $DATASET_PATH" >&2
    exit 1
  fi
fi

RECORD_COUNT=$(wc -l < "$DATASET_PATH")
echo " - Dataset: $DATASET_PATH"
echo " - Records: $RECORD_COUNT"
echo ""

# =============================================================================
# 4. TRAINING
# =============================================================================
echo "[4/5] Starting training..."
echo ""
echo "Configuration:"
echo " - Model: ${MODEL_NAME:-unsloth/DeepSeek-R1-0528-Qwen3-8B}"
echo " - Max Steps: ${MAX_STEPS:-500}"
echo " - Batch Size: ${BATCH_SIZE:-2} x ${GRADIENT_ACCUMULATION:-2}"
echo " - Learning Rate: ${LEARNING_RATE:-5e-6}"
echo " - Reward Mode: ${REWARD_MODE:-FULL}"
echo " - HF Repo: ${HF_REPO:-prolewiki/marxist-grpo-lora}"
echo ""

# Create output directories (use env vars to allow override in tests)
CHECKPOINT_DIR="${CHECKPOINT_DIR:-/workspace/checkpoints}"
LORA_OUTPUT="${LORA_OUTPUT:-/workspace/lora-output}"
OUTPUT_DIR="${OUTPUT_DIR:-/workspace/outputs}"
mkdir -p -- "$CHECKPOINT_DIR" "$LORA_OUTPUT" "$OUTPUT_DIR"

# Run training.
# The training script reads all configuration from environment variables.
# IMPORTANT: disable set -e temporarily so a non-zero exit is captured and
# handled (pod kept alive for debugging) instead of killing this script.
set +e
python -m prolewiki_llm.train_headless
TRAINING_EXIT_CODE=$?
set -e

echo ""
echo "Training completed with exit code: $TRAINING_EXIT_CODE"
echo ""

# =============================================================================
# 5. CLEANUP AND TERMINATION
# =============================================================================
echo "[5/5] Cleanup..."

# Log completion time
echo "End time: $(date -Iseconds)"

# If training succeeded and RUNPOD_POD_ID is set, terminate the pod
if [ "$TRAINING_EXIT_CODE" -eq 0 ]; then
  echo "Training completed successfully!"

  if [ -n "${RUNPOD_POD_ID:-}" ]; then
    echo ""
    echo "Terminating pod to stop billing..."
    echo "Pod ID: $RUNPOD_POD_ID"
    # Give a few seconds for logs to flush
    sleep 5
    # Stop the pod
    runpodctl stop pod "$RUNPOD_POD_ID" || {
      echo "Warning: Failed to stop pod automatically" >&2
      echo "Please manually stop pod $RUNPOD_POD_ID to avoid billing" >&2
    }
  else
    echo ""
    echo "Note: RUNPOD_POD_ID not set - pod will continue running"
    echo "Remember to stop the pod manually to avoid billing!"
  fi
else
  # Intentionally do NOT terminate on failure so the pod stays up for debugging.
  echo "Training failed with exit code $TRAINING_EXIT_CODE"
  echo "Pod will NOT be automatically terminated for debugging"
  echo ""
  echo "To debug:"
  echo " 1. SSH into the pod"
  echo " 2. Check /workspace/outputs for logs"
  echo " 3. Check W&B dashboard for metrics"
fi

exit "$TRAINING_EXIT_CODE"