# llm-training / docker / start.sh
# (Hugging Face Hub upload metadata: uploaded by percyraskova via
#  huggingface_hub, revision 81b3473, verified)
# NOTE(review): these four lines were page-scrape residue, not shell; kept as
# comments. For `./start.sh` exec the shebang below should be the first line.
#!/bin/bash
# =============================================================================
# Headless GRPO Training Entrypoint Script
# =============================================================================
# This script orchestrates the training lifecycle:
# 1. Validate environment (secrets, GPU)
# 2. Authenticate with HuggingFace and W&B
# 3. Run training
# 4. Upload results
# 5. Self-terminate pod (if RUNPOD_POD_ID is set)
#
# Exit on any error; with pipefail, a pipeline fails if ANY stage fails,
# not just the last one (plain `set -e` only sees the final stage's status).
set -eo pipefail
echo "=============================================================="
echo "Marxist-GRPO Headless Training"
echo "=============================================================="
echo "Start time: $(date -Iseconds)"
echo ""
# =============================================================================
# 1. ENVIRONMENT VALIDATION
# =============================================================================
echo "[1/5] Validating environment..."
# Check required secrets — fail fast with a clear message (diagnostics to stderr).
if [ -z "$HF_TOKEN" ]; then
  echo "ERROR: HF_TOKEN environment variable is required" >&2
  exit 1
fi
if [ -z "$WANDB_API_KEY" ]; then
  echo "ERROR: WANDB_API_KEY environment variable is required" >&2
  exit 1
fi
# Check GPU availability before burning time on authentication/downloads.
python -c "import torch; assert torch.cuda.is_available(), 'CUDA not available'" || {
  echo "ERROR: CUDA/GPU not available" >&2
  exit 1
}
echo " - HF_TOKEN: [set]"
echo " - WANDB_API_KEY: [set]"
echo " - GPU: $(python -c 'import torch; print(torch.cuda.get_device_name())')"
# BUG FIX: the original used \" inside a single-quoted shell string; single
# quotes pass the backslashes through literally, so Python received
# print(f\"...\") — a SyntaxError. Use double quotes on the shell side and
# single quotes inside the f-string so Python sees valid source.
echo " - VRAM: $(python -c "import torch; print(f'{torch.cuda.get_device_properties(0).total_memory/1e9:.1f} GB')")"
echo ""
# =============================================================================
# 2. AUTHENTICATION
# =============================================================================
echo "[2/5] Authenticating with services..."
# HuggingFace Hub login. Deliberately best-effort: stderr is suppressed and a
# failed login must not abort the run (the CLI may be absent or the token may
# already be cached), so the failure branch is an explicit no-op.
echo " - HuggingFace Hub..."
if ! huggingface-cli login --token "$HF_TOKEN" --add-to-git-credential 2>/dev/null; then
  : # non-fatal by design
fi
# Weights & Biases login — same best-effort policy as above.
echo " - Weights & Biases..."
if ! wandb login "$WANDB_API_KEY" 2>/dev/null; then
  : # non-fatal by design
fi
echo ""
# =============================================================================
# 3. DATA VALIDATION
# =============================================================================
echo "[3/5] Validating dataset..."
DATASET_PATH="${DATASET_PATH:-/workspace/dataset.jsonl}"
if [ ! -f "$DATASET_PATH" ]; then
  echo "ERROR: Dataset not found at $DATASET_PATH" >&2
  # If DATASET_URL is set, try to download the dataset instead of aborting.
  if [ -n "$DATASET_URL" ]; then
    echo "Attempting to download from DATASET_URL..."
    wget -O "$DATASET_PATH" "$DATASET_URL" || {
      # BUG FIX: `wget -O` creates the output file before the transfer, so a
      # failed download leaves a zero-byte/partial file behind. Remove it so a
      # retry (or re-run of this script) does not mistake it for valid data.
      rm -f -- "$DATASET_PATH"
      echo "ERROR: Failed to download dataset" >&2
      exit 1
    }
  else
    exit 1
  fi
fi
# JSONL: one record per line, so line count == record count.
RECORD_COUNT=$(wc -l < "$DATASET_PATH")
echo " - Dataset: $DATASET_PATH"
echo " - Records: $RECORD_COUNT"
echo ""
# =============================================================================
# 4. TRAINING
# =============================================================================
echo "[4/5] Starting training..."
echo ""
echo "Configuration:"
echo " - Model: ${MODEL_NAME:-unsloth/DeepSeek-R1-0528-Qwen3-8B}"
echo " - Max Steps: ${MAX_STEPS:-500}"
echo " - Batch Size: ${BATCH_SIZE:-2} x ${GRADIENT_ACCUMULATION:-2}"
echo " - Learning Rate: ${LEARNING_RATE:-5e-6}"
echo " - Reward Mode: ${REWARD_MODE:-FULL}"
echo " - HF Repo: ${HF_REPO:-prolewiki/marxist-grpo-lora}"
echo ""
# Output locations — each overridable via the environment (e.g. in tests).
CHECKPOINT_DIR="${CHECKPOINT_DIR:-/workspace/checkpoints}"
LORA_OUTPUT="${LORA_OUTPUT:-/workspace/lora-output}"
OUTPUT_DIR="${OUTPUT_DIR:-/workspace/outputs}"
mkdir -p "$CHECKPOINT_DIR" "$LORA_OUTPUT" "$OUTPUT_DIR"
# Run the training module. Its exit code is captured rather than letting
# `set -e` abort the script, so the cleanup step can decide whether to stop
# the pod (success) or keep it alive for debugging (failure). A command used
# as an `if` condition is exempt from errexit, so no set +e/-e toggle needed.
if python -m prolewiki_llm.train_headless; then
  TRAINING_EXIT_CODE=0
else
  TRAINING_EXIT_CODE=$?
fi
echo ""
echo "Training completed with exit code: $TRAINING_EXIT_CODE"
echo ""
# =============================================================================
# 5. CLEANUP AND TERMINATION
# =============================================================================
echo "[5/5] Cleanup..."
# Log completion time
echo "End time: $(date -Iseconds)"
# Success: stop the pod (if we know its ID) to cut billing.
# Failure: leave the pod running so it can be inspected.
if [ "$TRAINING_EXIT_CODE" -eq 0 ]; then
  echo "Training completed successfully!"
  if [ -z "$RUNPOD_POD_ID" ]; then
    echo ""
    echo "Note: RUNPOD_POD_ID not set - pod will continue running"
    echo "Remember to stop the pod manually to avoid billing!"
  else
    echo ""
    echo "Terminating pod to stop billing..."
    echo "Pod ID: $RUNPOD_POD_ID"
    # Give a few seconds for logs to flush
    sleep 5
    # A failed stop must not abort the script — warn and fall through to exit.
    if ! runpodctl stop pod "$RUNPOD_POD_ID"; then
      echo "Warning: Failed to stop pod automatically"
      echo "Please manually stop pod $RUNPOD_POD_ID to avoid billing"
    fi
  fi
else
  echo "Training failed with exit code $TRAINING_EXIT_CODE"
  echo "Pod will NOT be automatically terminated for debugging"
  echo ""
  echo "To debug:"
  echo " 1. SSH into the pod"
  echo " 2. Check /workspace/outputs for logs"
  echo " 3. Check W&B dashboard for metrics"
fi
# Propagate the training result as the script's own exit status.
exit "$TRAINING_EXIT_CODE"