# llm-training / docker / start.sh
# (Hugging Face Hub upload metadata: uploaded by percyraskova via
#  huggingface_hub, revision 81b3473, verified)
# NOTE(review): these four lines were page-scrape residue, not shell; kept as
# comments. For `./start.sh` exec the shebang below should be the first line.
#!/bin/bash
# =============================================================================
# Headless GRPO Training Entrypoint Script
# =============================================================================
# This script orchestrates the training lifecycle:
# 1. Validate environment (secrets, GPU)
# 2. Authenticate with HuggingFace and W&B
# 3. Run training
# 4. Upload results
# 5. Self-terminate pod (if RUNPOD_POD_ID is set)
#
# Exit on any error; with pipefail, a pipeline fails if ANY stage fails,
# not just the last one (plain `set -e` only sees the final stage's status).
set -eo pipefail
echo "=============================================================="
echo "Marxist-GRPO Headless Training"
echo "=============================================================="
echo "Start time: $(date -Iseconds)"
echo ""
# =============================================================================
# 1. ENVIRONMENT VALIDATION
# =============================================================================
echo "[1/5] Validating environment..."
# Check required secrets — fail fast with a clear message (diagnostics to stderr).
if [ -z "$HF_TOKEN" ]; then
  echo "ERROR: HF_TOKEN environment variable is required" >&2
  exit 1
fi
if [ -z "$WANDB_API_KEY" ]; then
  echo "ERROR: WANDB_API_KEY environment variable is required" >&2
  exit 1
fi
# Check GPU availability before burning time on authentication/downloads.
python -c "import torch; assert torch.cuda.is_available(), 'CUDA not available'" || {
  echo "ERROR: CUDA/GPU not available" >&2
  exit 1
}
echo " - HF_TOKEN: [set]"
echo " - WANDB_API_KEY: [set]"
echo " - GPU: $(python -c 'import torch; print(torch.cuda.get_device_name())')"
# BUG FIX: the original used \" inside a single-quoted shell string; single
# quotes pass the backslashes through literally, so Python received
# print(f\"...\") — a SyntaxError. Use double quotes on the shell side and
# single quotes inside the f-string so Python sees valid source.
echo " - VRAM: $(python -c "import torch; print(f'{torch.cuda.get_device_properties(0).total_memory/1e9:.1f} GB')")"
echo ""
# =============================================================================
# 2. AUTHENTICATION
# =============================================================================
echo "[2/5] Authenticating with services..."
# HuggingFace Hub login. Deliberately best-effort: stderr is suppressed and a
# failed login must not abort the run (the CLI may be absent or the token may
# already be cached), so the failure branch is an explicit no-op.
echo " - HuggingFace Hub..."
if ! huggingface-cli login --token "$HF_TOKEN" --add-to-git-credential 2>/dev/null; then
  : # non-fatal by design
fi
# Weights & Biases login — same best-effort policy as above.
echo " - Weights & Biases..."
if ! wandb login "$WANDB_API_KEY" 2>/dev/null; then
  : # non-fatal by design
fi
echo ""
# =============================================================================
# 3. DATA VALIDATION
# =============================================================================
echo "[3/5] Validating dataset..."
DATASET_PATH="${DATASET_PATH:-/workspace/dataset.jsonl}"
if [ ! -f "$DATASET_PATH" ]; then
  echo "ERROR: Dataset not found at $DATASET_PATH" >&2
  # If DATASET_URL is set, try to download the dataset instead of aborting.
  if [ -n "$DATASET_URL" ]; then
    echo "Attempting to download from DATASET_URL..."
    wget -O "$DATASET_PATH" "$DATASET_URL" || {
      # BUG FIX: `wget -O` creates the output file before the transfer, so a
      # failed download leaves a zero-byte/partial file behind. Remove it so a
      # retry (or re-run of this script) does not mistake it for valid data.
      rm -f -- "$DATASET_PATH"
      echo "ERROR: Failed to download dataset" >&2
      exit 1
    }
  else
    exit 1
  fi
fi
# JSONL: one record per line, so line count == record count.
RECORD_COUNT=$(wc -l < "$DATASET_PATH")
echo " - Dataset: $DATASET_PATH"
echo " - Records: $RECORD_COUNT"
echo ""
# =============================================================================
# 4. TRAINING
# =============================================================================
echo "[4/5] Starting training..."
echo ""
echo "Configuration:"
echo " - Model: ${MODEL_NAME:-unsloth/DeepSeek-R1-0528-Qwen3-8B}"
echo " - Max Steps: ${MAX_STEPS:-500}"
echo " - Batch Size: ${BATCH_SIZE:-2} x ${GRADIENT_ACCUMULATION:-2}"
echo " - Learning Rate: ${LEARNING_RATE:-5e-6}"
echo " - Reward Mode: ${REWARD_MODE:-FULL}"
echo " - HF Repo: ${HF_REPO:-prolewiki/marxist-grpo-lora}"
echo ""
# Output locations — each overridable via the environment (e.g. in tests).
CHECKPOINT_DIR="${CHECKPOINT_DIR:-/workspace/checkpoints}"
LORA_OUTPUT="${LORA_OUTPUT:-/workspace/lora-output}"
OUTPUT_DIR="${OUTPUT_DIR:-/workspace/outputs}"
mkdir -p "$CHECKPOINT_DIR" "$LORA_OUTPUT" "$OUTPUT_DIR"
# Run the training module. Its exit code is captured rather than letting
# `set -e` abort the script, so the cleanup step can decide whether to stop
# the pod (success) or keep it alive for debugging (failure). A command used
# as an `if` condition is exempt from errexit, so no set +e/-e toggle needed.
if python -m prolewiki_llm.train_headless; then
  TRAINING_EXIT_CODE=0
else
  TRAINING_EXIT_CODE=$?
fi
echo ""
echo "Training completed with exit code: $TRAINING_EXIT_CODE"
echo ""
# =============================================================================
# 5. CLEANUP AND TERMINATION
# =============================================================================
echo "[5/5] Cleanup..."
# Log completion time
echo "End time: $(date -Iseconds)"
# Success: stop the pod (if we know its ID) to cut billing.
# Failure: leave the pod running so it can be inspected.
if [ "$TRAINING_EXIT_CODE" -eq 0 ]; then
  echo "Training completed successfully!"
  if [ -z "$RUNPOD_POD_ID" ]; then
    echo ""
    echo "Note: RUNPOD_POD_ID not set - pod will continue running"
    echo "Remember to stop the pod manually to avoid billing!"
  else
    echo ""
    echo "Terminating pod to stop billing..."
    echo "Pod ID: $RUNPOD_POD_ID"
    # Give a few seconds for logs to flush
    sleep 5
    # A failed stop must not abort the script — warn and fall through to exit.
    if ! runpodctl stop pod "$RUNPOD_POD_ID"; then
      echo "Warning: Failed to stop pod automatically"
      echo "Please manually stop pod $RUNPOD_POD_ID to avoid billing"
    fi
  fi
else
  echo "Training failed with exit code $TRAINING_EXIT_CODE"
  echo "Pod will NOT be automatically terminated for debugging"
  echo ""
  echo "To debug:"
  echo " 1. SSH into the pod"
  echo " 2. Check /workspace/outputs for logs"
  echo " 3. Check W&B dashboard for metrics"
fi
# Propagate the training result as the script's own exit status.
exit "$TRAINING_EXIT_CODE"