#!/bin/bash
# =============================================================================
# Headless GRPO Training Entrypoint Script
# =============================================================================
# This script orchestrates the training lifecycle:
#   1. Validate environment (secrets, GPU)
#   2. Authenticate with HuggingFace and W&B
#   3. Validate dataset (optionally download from DATASET_URL)
#   4. Run training
#   5. Cleanup and self-terminate pod (if RUNPOD_POD_ID is set)
#
# Required env:  HF_TOKEN, WANDB_API_KEY
# Optional env:  DATASET_PATH, DATASET_URL, MODEL_NAME, MAX_STEPS, BATCH_SIZE,
#                GRADIENT_ACCUMULATION, LEARNING_RATE, REWARD_MODE, HF_REPO,
#                CHECKPOINT_DIR, LORA_OUTPUT, OUTPUT_DIR, RUNPOD_POD_ID
#
# Exit code: the training process's exit code (0 on success).

# Exit on any error, on unset variables (optional vars are probed with ${VAR:-}),
# and on failures anywhere in a pipeline.
set -euo pipefail

echo "=============================================================="
echo "Marxist-GRPO Headless Training"
echo "=============================================================="
echo "Start time: $(date -Iseconds)"
echo ""

# =============================================================================
# 1. ENVIRONMENT VALIDATION
# =============================================================================
echo "[1/5] Validating environment..."

# Check required secrets
if [ -z "${HF_TOKEN:-}" ]; then
  echo "ERROR: HF_TOKEN environment variable is required" >&2
  exit 1
fi

if [ -z "${WANDB_API_KEY:-}" ]; then
  echo "ERROR: WANDB_API_KEY environment variable is required" >&2
  exit 1
fi

# Check GPU availability
python -c "import torch; assert torch.cuda.is_available(), 'CUDA not available'" || {
  echo "ERROR: CUDA/GPU not available" >&2
  exit 1
}

echo " - HF_TOKEN: [set]"
echo " - WANDB_API_KEY: [set]"
echo " - GPU: $(python -c 'import torch; print(torch.cuda.get_device_name())')"
# NOTE: the -c string must be double-quoted here; escaping the f-string quotes
# inside single quotes (f\"...\") hands Python a literal backslash, which is a
# SyntaxError and made this line fail on every run.
echo " - VRAM: $(python -c "import torch; print(f'{torch.cuda.get_device_properties(0).total_memory/1e9:.1f} GB')")"
echo ""

# =============================================================================
# 2. AUTHENTICATION
# =============================================================================
echo "[2/5] Authenticating with services..."

# Login to HuggingFace.
# Non-fatal by design (the trainer can also read HF_TOKEN from the env), but
# warn instead of silently swallowing the failure so it is visible in pod logs.
echo " - HuggingFace Hub..."
huggingface-cli login --token "$HF_TOKEN" --add-to-git-credential 2>/dev/null \
  || echo "Warning: huggingface-cli login failed; relying on HF_TOKEN env" >&2

# Login to Weights & Biases (same non-fatal policy as above).
echo " - Weights & Biases..."
wandb login "$WANDB_API_KEY" 2>/dev/null \
  || echo "Warning: wandb login failed; relying on WANDB_API_KEY env" >&2
echo ""

# =============================================================================
# 3. DATA VALIDATION
# =============================================================================
echo "[3/5] Validating dataset..."

DATASET_PATH="${DATASET_PATH:-/workspace/dataset.jsonl}"

if [ ! -f "$DATASET_PATH" ]; then
  if [ -n "${DATASET_URL:-}" ]; then
    # Only log an ERROR if the fallback also fails; the original logged ERROR
    # before even attempting the download, which was misleading on success.
    echo "Dataset not found at $DATASET_PATH - downloading from DATASET_URL..."
    # Download to a temp name so a partial/failed download is never left at
    # the final path and mistaken for a valid dataset on a later run.
    if ! wget -O "${DATASET_PATH}.tmp" "$DATASET_URL"; then
      rm -f -- "${DATASET_PATH}.tmp"
      echo "ERROR: Failed to download dataset" >&2
      exit 1
    fi
    mv -- "${DATASET_PATH}.tmp" "$DATASET_PATH"
  else
    echo "ERROR: Dataset not found at $DATASET_PATH" >&2
    exit 1
  fi
fi

RECORD_COUNT=$(wc -l < "$DATASET_PATH")
echo " - Dataset: $DATASET_PATH"
echo " - Records: $RECORD_COUNT"
echo ""

# =============================================================================
# 4. TRAINING
# =============================================================================
echo "[4/5] Starting training..."
echo ""
echo "Configuration:"
echo " - Model: ${MODEL_NAME:-unsloth/DeepSeek-R1-0528-Qwen3-8B}"
echo " - Max Steps: ${MAX_STEPS:-500}"
echo " - Batch Size: ${BATCH_SIZE:-2} x ${GRADIENT_ACCUMULATION:-2}"
echo " - Learning Rate: ${LEARNING_RATE:-5e-6}"
echo " - Reward Mode: ${REWARD_MODE:-FULL}"
echo " - HF Repo: ${HF_REPO:-prolewiki/marxist-grpo-lora}"
echo ""

# Create output directories (use env vars to allow override in tests)
CHECKPOINT_DIR="${CHECKPOINT_DIR:-/workspace/checkpoints}"
LORA_OUTPUT="${LORA_OUTPUT:-/workspace/lora-output}"
OUTPUT_DIR="${OUTPUT_DIR:-/workspace/outputs}"
mkdir -p -- "$CHECKPOINT_DIR" "$LORA_OUTPUT" "$OUTPUT_DIR"

# Run training.
# The training script reads all configuration from environment variables.
# IMPORTANT: disable set -e temporarily so a non-zero exit is captured and
# handled (pod kept alive for debugging) instead of killing this script.
set +e
python -m prolewiki_llm.train_headless
TRAINING_EXIT_CODE=$?
set -e

echo ""
echo "Training completed with exit code: $TRAINING_EXIT_CODE"
echo ""

# =============================================================================
# 5. CLEANUP AND TERMINATION
# =============================================================================
echo "[5/5] Cleanup..."

# Log completion time
echo "End time: $(date -Iseconds)"

# If training succeeded and RUNPOD_POD_ID is set, terminate the pod
if [ "$TRAINING_EXIT_CODE" -eq 0 ]; then
  echo "Training completed successfully!"

  if [ -n "${RUNPOD_POD_ID:-}" ]; then
    echo ""
    echo "Terminating pod to stop billing..."
    echo "Pod ID: $RUNPOD_POD_ID"
    # Give a few seconds for logs to flush
    sleep 5
    # Stop the pod
    runpodctl stop pod "$RUNPOD_POD_ID" || {
      echo "Warning: Failed to stop pod automatically" >&2
      echo "Please manually stop pod $RUNPOD_POD_ID to avoid billing" >&2
    }
  else
    echo ""
    echo "Note: RUNPOD_POD_ID not set - pod will continue running"
    echo "Remember to stop the pod manually to avoid billing!"
  fi
else
  # Intentionally do NOT terminate on failure so the pod stays up for debugging.
  echo "Training failed with exit code $TRAINING_EXIT_CODE"
  echo "Pod will NOT be automatically terminated for debugging"
  echo ""
  echo "To debug:"
  echo " 1. SSH into the pod"
  echo " 2. Check /workspace/outputs for logs"
  echo " 3. Check W&B dashboard for metrics"
fi

exit "$TRAINING_EXIT_CODE"