#!/bin/bash
# =============================================================================
# Headless GRPO training entrypoint
# =============================================================================
# Runs the following stages without user interaction:
#   [1/5] validate credentials and GPU availability
#   [2/5] log in to HuggingFace Hub and Weights & Biases
#   [3/5] locate (or download) the training dataset
#   [4/5] run `python -m prolewiki_llm.train_headless`
#   [5/5] report the result and, on success, stop the RunPod pod
#
# Required environment:
#   HF_TOKEN, WANDB_API_KEY
#
# Optional environment read by this script:
#   DATASET_PATH    (default: /workspace/dataset.jsonl)
#   DATASET_URL     download source used when DATASET_PATH does not exist
#   CHECKPOINT_DIR  (default: /workspace/checkpoints)
#   LORA_OUTPUT     (default: /workspace/lora-output)
#   OUTPUT_DIR      (default: /workspace/outputs)
#   RUNPOD_POD_ID   when set, the pod is stopped after a successful run
#
# Only echoed in the configuration summary by this script (presumably consumed
# by the trainer itself -- confirm against prolewiki_llm.train_headless):
#   MODEL_NAME, MAX_STEPS, BATCH_SIZE, GRADIENT_ACCUMULATION, LEARNING_RATE,
#   REWARD_MODE, HF_REPO
#
# Exit status: 1 when environment or dataset validation fails, otherwise the
# trainer's exit code (other unhandled failures abort through `set -e`).
# =============================================================================

# Abort on the first unhandled command failure.
set -e

echo "=============================================================="
echo "Marxist-GRPO Headless Training"
echo "=============================================================="
echo "Start time: $(date -Iseconds)"
echo ""
# =============================================================================
# [1/5] Environment validation
# =============================================================================
# Fails fast (exit 1) when a required credential is missing or when PyTorch
# cannot see a CUDA device, then prints a short summary of what was found.
echo "[1/5] Validating environment..."

# Both credentials are mandatory. Only their presence is reported below; the
# values themselves are never echoed.
if [ -z "${HF_TOKEN:-}" ]; then
  echo "ERROR: HF_TOKEN environment variable is required" >&2
  exit 1
fi

if [ -z "${WANDB_API_KEY:-}" ]; then
  echo "ERROR: WANDB_API_KEY environment variable is required" >&2
  exit 1
fi

# The assert makes python exit non-zero when no CUDA device is visible (or
# when torch cannot be imported at all).
python -c "import torch; assert torch.cuda.is_available(), 'CUDA not available'" || {
  echo "ERROR: CUDA/GPU not available" >&2
  exit 1
}

# The GPU details are informational only, so a failing probe degrades to
# "unknown" instead of aborting the run.
gpu_name=$(python -c 'import torch; print(torch.cuda.get_device_name())') \
  || gpu_name="unknown"
# The Python snippet is single-quoted for the shell, so its f-string must use
# plain double quotes: a backslash inside single quotes is passed through
# literally and would make Python raise a SyntaxError.
gpu_vram=$(python -c 'import torch; print(f"{torch.cuda.get_device_properties(0).total_memory/1e9:.1f} GB")') \
  || gpu_vram="unknown"

echo " - HF_TOKEN: [set]"
echo " - WANDB_API_KEY: [set]"
echo " - GPU: ${gpu_name}"
echo " - VRAM: ${gpu_vram}"
echo ""
# =============================================================================
# [2/5] Service authentication
# =============================================================================
# Logs in to HuggingFace Hub and Weights & Biases using the tokens from the
# environment. Both logins are best-effort: a failure is reported but does not
# abort the run.
echo "[2/5] Authenticating with services..."

#######################################
# Run a login command on a best-effort basis.
# The command's stderr is discarded; a non-zero exit is turned into a warning
# on stderr instead of being swallowed silently.
# Arguments: $1 - human-readable service name used in the warning
#            $2.. - command and arguments to execute
# Returns:   0 always
#######################################
try_login() {
  local service=$1
  shift
  if ! "$@" 2>/dev/null; then
    echo " WARNING: ${service} login command failed; continuing" >&2
  fi
}

echo " - HuggingFace Hub..."
try_login "HuggingFace Hub" \
  huggingface-cli login --token "${HF_TOKEN:-}" --add-to-git-credential

echo " - Weights & Biases..."
try_login "Weights & Biases" wandb login "${WANDB_API_KEY:-}"

echo ""
# =============================================================================
# [3/5] Dataset validation
# =============================================================================
# Ensures a non-empty dataset file exists at DATASET_PATH, downloading it from
# DATASET_URL when the file is missing and a URL was provided. Exits 1 when no
# usable dataset can be obtained.
echo "[3/5] Validating dataset..."

DATASET_PATH="${DATASET_PATH:-/workspace/dataset.jsonl}"
# Exported so that a child process reading DATASET_PATH sees the same file that
# is validated here (presumably the trainer does -- confirm).
export DATASET_PATH

if [ ! -f "$DATASET_PATH" ]; then
  if [ -z "${DATASET_URL:-}" ]; then
    echo "ERROR: Dataset not found at $DATASET_PATH" >&2
    exit 1
  fi

  echo "WARNING: Dataset not found at $DATASET_PATH" >&2
  echo "Attempting to download from DATASET_URL..."

  # Download next to the final location and rename only on success, so an
  # interrupted transfer can never leave a truncated file at DATASET_PATH
  # that a later run would accept as valid.
  dataset_partial="${DATASET_PATH}.partial"
  mkdir -p -- "$(dirname -- "$DATASET_PATH")"
  if ! wget -O "$dataset_partial" "$DATASET_URL"; then
    rm -f -- "$dataset_partial"
    echo "ERROR: Failed to download dataset" >&2
    exit 1
  fi
  mv -- "$dataset_partial" "$DATASET_PATH"
fi

# An empty file cannot contain any records; stop before training starts.
if [ ! -s "$DATASET_PATH" ]; then
  echo "ERROR: Dataset at $DATASET_PATH is empty" >&2
  exit 1
fi

# awk counts a final record that lacks a trailing newline, which `wc -l`
# would miss.
RECORD_COUNT=$(awk 'END { print NR }' "$DATASET_PATH")
echo " - Dataset: $DATASET_PATH"
echo " - Records: $RECORD_COUNT"
echo ""
# =============================================================================
# [4/5] Training
# =============================================================================
# Prints the effective configuration, prepares the output directories and runs
# the trainer. The trainer's exit status is captured in TRAINING_EXIT_CODE so
# that the script can report on it instead of aborting immediately.
echo "[4/5] Starting training..."
echo ""

# The fallbacks shown here are for display only; this script does not pass
# them on. Presumably prolewiki_llm.train_headless applies the same defaults
# -- confirm against the trainer.
echo "Configuration:"
echo " - Model: ${MODEL_NAME:-unsloth/DeepSeek-R1-0528-Qwen3-8B}"
echo " - Max Steps: ${MAX_STEPS:-500}"
echo " - Batch Size: ${BATCH_SIZE:-2} x ${GRADIENT_ACCUMULATION:-2}"
echo " - Learning Rate: ${LEARNING_RATE:-5e-6}"
echo " - Reward Mode: ${REWARD_MODE:-FULL}"
echo " - HF Repo: ${HF_REPO:-prolewiki/marxist-grpo-lora}"
echo ""

CHECKPOINT_DIR="${CHECKPOINT_DIR:-/workspace/checkpoints}"
LORA_OUTPUT="${LORA_OUTPUT:-/workspace/lora-output}"
OUTPUT_DIR="${OUTPUT_DIR:-/workspace/outputs}"
# Exported so that the directories created below are the ones a child process
# reading these variables will use, even when only the defaults apply.
export CHECKPOINT_DIR LORA_OUTPUT OUTPUT_DIR
mkdir -p -- "$CHECKPOINT_DIR" "$LORA_OUTPUT" "$OUTPUT_DIR"

# `cmd || rc=$?` records a failure without tripping `set -e`, so the cleanup
# stage always runs and can decide what to do with the result.
TRAINING_EXIT_CODE=0
python -m prolewiki_llm.train_headless || TRAINING_EXIT_CODE=$?

echo ""
echo "Training completed with exit code: $TRAINING_EXIT_CODE"
echo ""
# =============================================================================
# [5/5] Cleanup
# =============================================================================
# Reports the outcome and exits with the trainer's status. After a successful
# run the pod is stopped when RUNPOD_POD_ID is set; after a failed run the pod
# is left running so it can be inspected.
echo "[5/5] Cleanup..."

echo "End time: $(date -Iseconds)"

# A missing status is treated as a failure so that the pod is never stopped
# on an unknown result.
training_status="${TRAINING_EXIT_CODE:-1}"

if [ "$training_status" -eq 0 ]; then
  echo "Training completed successfully!"

  if [ -n "${RUNPOD_POD_ID:-}" ]; then
    echo ""
    echo "Terminating pod to stop billing..."
    echo "Pod ID: $RUNPOD_POD_ID"

    # Short pause before stopping the pod (presumably to let pending log
    # output flush -- confirm).
    sleep 5

    # Failing to stop the pod must not turn a successful run into a failure,
    # so it is only reported.
    runpodctl stop pod "$RUNPOD_POD_ID" || {
      echo "Warning: Failed to stop pod automatically" >&2
      echo "Please manually stop pod $RUNPOD_POD_ID to avoid billing" >&2
    }
  else
    echo ""
    echo "Note: RUNPOD_POD_ID not set - pod will continue running"
    echo "Remember to stop the pod manually to avoid billing!"
  fi
else
  echo "Training failed with exit code $training_status"
  echo "Pod will NOT be automatically terminated for debugging"
  echo ""
  echo "To debug:"
  echo " 1. SSH into the pod"
  echo " 2. Check /workspace/outputs for logs"
  echo " 3. Check W&B dashboard for metrics"
fi

exit "$training_status"