# Source: llm-training / ai-docs / runpod.yaml
# Uploaded by percyraskova via huggingface_hub (commit 81b3473, verified)
# RunPod.io Setup Guide for LLM Fine-Tuning
# Optimized for DeepSeek 7B Abliterated with Unsloth QLoRA
# Status: READY - verified configuration for Phase 8
---
# High-level summary: what this guide covers and what a full run costs.
overview:
  purpose: |
    Step-by-step guide for deploying a RunPod GPU pod to fine-tune
    DeepSeek-R1-Distill-Qwen-7B-abliterated on ProleWiki corpus using Unsloth.
  estimated_cost: "$0.30-0.60 for complete training run (~30 min)"
  # Ordered end-to-end workflow; details for each item are in
  # step_by_step_setup below.
  workflow_summary:
    - Create pod with PyTorch 2.4 template
    - Install Unsloth and dependencies
    - Upload training data (JSONL chunks)
    - Run SFT training (~20-30 min)
    - Export GGUF model
    - Download and deploy to Ollama
    - STOP POD immediately after download
# GPU choice: price/VRAM trade-offs for a 7B QLoRA run.
gpu_selection:
  recommended: RTX 4090
  vram_required: "16-18GB with Unsloth QLoRA"
  note: |
    Unsloth's QLoRA reduces 7B model VRAM from ~24GB to ~16-18GB.
    RTX 4090 (24GB) provides comfortable headroom.
  options:
    rtx_4090:
      vram: 24GB
      price_spot: "$0.40-0.50/hr"
      price_ondemand: "$0.50-0.60/hr"
      recommendation: "Best value - sufficient VRAM, fast training"
    a40:
      vram: 48GB
      price_spot: "$0.45-0.55/hr"
      price_ondemand: "$0.50-0.65/hr"
      recommendation: "More headroom, similar price"
    rtx_3090:
      vram: 24GB
      price_spot: "$0.25-0.35/hr"
      price_ondemand: "$0.30-0.40/hr"
      recommendation: "Budget option, slightly older"
    a100_40gb:
      vram: 40GB
      price_spot: "$0.80-1.00/hr"
      price_ondemand: "$1.00-1.50/hr"
      recommendation: "Overkill for 7B, use for larger models"

# Billing model choice: interruptible (Spot) vs guaranteed (On-Demand).
spot_vs_ondemand:
  spot:
    pros: "30-50% cheaper"
    cons: "May be interrupted if demand spikes"
    best_for: "Long training runs where checkpoints save progress"
  ondemand:
    pros: "Guaranteed availability"
    cons: "Full price"
    best_for: "Short runs (<1hr) like our 30-min training"
  recommendation: |
    For ProleWiki fine-tuning (~30 min), use On-Demand.
    Spot interruption would cost more in setup time than savings.
# Pod template, disks, ports, and environment variables.
pod_configuration:
  template: "RunPod PyTorch 2.4"
  alternative: "RunPod PyTorch 2.8 (if available)"
  template_includes:
    - PyTorch 2.4
    - CUDA 12.4
    - cuDNN
    - JupyterLab
    - SSH access
    - Python 3.10+
  storage:
    # Container disk is wiped on restart; only scratch data belongs here.
    container_disk:
      size: "50GB minimum"
      purpose: "Ephemeral - Unsloth, model weights during training"
      warning: "LOST on pod restart!"
    # Volume disk survives restarts (and pod deletion) — use for anything
    # that must persist.
    volume_disk:
      size: "100GB minimum"
      purpose: "Persistent - checkpoints, scripts, training data"
      critical: "ALL important files must go here!"
      mount_path: "/workspace"
  ports:
    - port: 8888
      purpose: "JupyterLab (primary interface)"
    - port: 22
      purpose: "SSH (optional, for terminal access)"
  environment_variables:
    required:
      HF_TOKEN: "Your Hugging Face token (for gated models)"
    optional:
      JUPYTER_PASSWORD: "Secure notebook access"
      WANDB_API_KEY: "If using Weights & Biases logging"
    secure_secrets:
      note: |
        Use RUNPOD_SECRET_ prefix for encrypted secrets:
        RUNPOD_SECRET_HF_TOKEN will be injected securely.
# Ordered walkthrough: pod creation through shutdown. Follow steps in order.
step_by_step_setup:
  step_1_create_pod:
    description: "Create GPU pod from RunPod dashboard"
    actions:
      - "Go to https://runpod.io/console/pods"
      - "Click '+ Deploy' or 'New Pod'"
      - "Select GPU: RTX 4090 (or A40)"
      - "Click 'Change Template' → search 'PyTorch'"
      - "Select 'RunPod PyTorch 2.4'"
      - "Set Container Disk: 50 GB"
      - "Set Volume Disk: 100 GB"
      - "Expand 'Environment Variables'"
      - "Add: HF_TOKEN = your_token"
      - "Click 'Deploy On-Demand' (not Spot for short runs)"
  step_2_connect:
    description: "Connect to running pod"
    actions:
      - "Wait for pod status: 'Running' (usually <1 min)"
      - "Click 'Connect' button"
      - "Select 'Jupyter Lab' (opens in new tab)"
      - "Or select 'SSH' for terminal access"
  step_3_install_unsloth:
    description: "Install Unsloth and dependencies in JupyterLab terminal"
    commands: |
      # Verify CUDA is working
      nvidia-smi
      python -c "import torch; print(f'CUDA: {torch.cuda.get_device_name()}')"
      # Install Unsloth (auto-detects CUDA version)
      pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
      # Install flash-attention (may take a few minutes to compile)
      pip install --no-deps packaging ninja einops "flash-attn>=2.6.3"
      # Install training dependencies
      pip install trl>=0.7.0 datasets accelerate bitsandbytes peft
      # Install tiktoken for data transformation
      pip install tiktoken
      # Verify installation
      python -c "from unsloth import FastLanguageModel; print('Unsloth ready!')"
    troubleshooting:
      flash_attn_fails: |
        If flash-attn compilation fails, try:
        pip install flash-attn --no-build-isolation
      cuda_version_mismatch: |
        If CUDA errors occur, specify version explicitly:
        pip install "unsloth[cu124] @ git+https://github.com/unslothai/unsloth.git"
  step_4_upload_data:
    description: "Upload training data to pod"
    option_a_jupyterlab:
      best_for: "Small datasets (<100MB)"
      steps:
        - "In JupyterLab file browser (left sidebar)"
        - "Navigate to /workspace"
        - "Create folder: 'data'"
        - "Click upload icon (up arrow)"
        - "Select your JSONL chunks file"
    option_b_wget:
      best_for: "Data hosted on web"
      command: |
        mkdir -p /workspace/data
        wget https://your-url/library_chunks.jsonl -O /workspace/data/chunks.jsonl
    option_c_huggingface:
      best_for: "Dataset on Hugging Face"
      command: |
        huggingface-cli download your-user/prolewiki-chunks \
          --local-dir /workspace/data \
          --token $HF_TOKEN
    option_d_scp:
      best_for: "From local machine via SSH"
      command: |
        # Get SSH command from RunPod 'Connect' dropdown
        scp -P 22XXX library_chunks.jsonl root@pod-ip:/workspace/data/
  step_5_run_training:
    description: "Execute fine-tuning script"
    note: "See ai-docs/finetune.yaml for complete training code"
    minimal_script: |
      from unsloth import FastLanguageModel
      import torch
      # Load abliterated model
      model, tokenizer = FastLanguageModel.from_pretrained(
          model_name="huihui-ai/DeepSeek-R1-Distill-Qwen-7B-abliterated",
          max_seq_length=2048,
          load_in_4bit=True,
          dtype=None,  # Auto-detect
      )
      # Apply LoRA
      model = FastLanguageModel.get_peft_model(
          model,
          r=16,
          lora_alpha=32,
          target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                          "gate_proj", "up_proj", "down_proj"],
          lora_dropout=0.05,
      )
      # Load and train (see finetune.yaml for full code)
      # ...
      # Save checkpoint to Volume disk!
      model.save_pretrained("/workspace/checkpoints/marxist-deepseek-lora")
    expected_time: "20-30 minutes for ~1,000 samples, 3 epochs"
  step_6_export_gguf:
    description: "Export model to GGUF format for Ollama"
    command: |
      # Export with q4_k_m quantization (good balance)
      model.save_pretrained_gguf(
          "/workspace/exports/marxist-deepseek",
          tokenizer,
          quantization_method="q4_k_m"
      )
      # Check output
      ls -lh /workspace/exports/
    output_size: "~4GB for 7B q4_k_m"
    quantization_options:
      q4_k_m: "Recommended - good quality/size balance (~4GB)"
      q5_k_m: "Higher quality, larger (~5GB)"
      q8_0: "Best quality, largest (~7GB)"
  step_7_download_model:
    description: "Download GGUF to local machine"
    option_a_jupyterlab:
      steps:
        - "In JupyterLab file browser"
        - "Navigate to /workspace/exports/"
        - "Right-click the .gguf file"
        - "Select 'Download'"
    option_b_runpodctl:
      command: |
        # Install runpodctl locally first
        # https://github.com/runpod/runpodctl
        runpodctl receive /workspace/exports/marxist-deepseek-q4_k_m.gguf
  step_8_stop_pod:
    description: "CRITICAL - Stop pod to avoid charges"
    warning: "Billing continues until pod is stopped!"
    actions:
      - "Verify GGUF downloaded successfully to local machine"
      - "In RunPod dashboard, click 'Stop' on your pod"
      - "Wait for status: 'Stopped'"
      - "Delete pod if you don't need it again"
      - "Volume disk data persists even after pod deletion"
# Local deployment: turn the downloaded GGUF into a runnable Ollama model.
ollama_deployment:
  description: "Deploy GGUF to local Ollama after download"
  steps:
    - step: "Create Modelfile"
      content: |
        # Save as: Modelfile.marxist-deepseek
        FROM ./marxist-deepseek-q4_k_m.gguf
        TEMPLATE """<|im_start|>system
        {{ .System }}<|im_end|>
        <|im_start|>user
        {{ .Prompt }}<|im_end|>
        <|im_start|>assistant
        {{ .Response }}<|im_end|>"""
        SYSTEM "You are a Marxist-Leninist assistant trained on ProleWiki."
        PARAMETER stop "<|im_end|>"
        PARAMETER temperature 0.7
        PARAMETER top_p 0.9
    - step: "Create Ollama model"
      command: "ollama create marxist-deepseek -f Modelfile.marxist-deepseek"
    - step: "Test model"
      command: "ollama run marxist-deepseek 'Explain dialectical materialism.'"
# Worked cost example plus time budget for a complete run.
cost_summary:
  example_run:
    gpu: "RTX 4090 On-Demand"
    rate: "$0.55/hr"
    time: "30 minutes"
    total: "$0.28"
  breakdown:
    setup: "5 min - Pod creation, Unsloth install"
    upload: "2 min - Data transfer"
    training: "20-25 min - SFT with QLoRA"
    export: "3 min - GGUF conversion"
    download: "5 min - Transfer GGUF locally"
    total_time: "~35-40 min"
  tips:
    - "Use On-Demand for short runs (<1hr)"
    - "Use Spot for long runs with checkpoint saving"
    - "Stop pod IMMEDIATELY after download"
    - "Delete pod after confirming success"
    - "Volume disk persists - can restart training later"
# Common failure modes during a run and how to resolve them.
troubleshooting:
  out_of_memory:
    symptoms: "CUDA OOM, kernel dies"
    solutions:
      - "Reduce batch size in training args"
      - "Ensure load_in_4bit=True"
      - "Use gradient_checkpointing=True"
      - "Upgrade to A40 (48GB VRAM)"
  slow_training:
    symptoms: "Steps/sec much lower than expected"
    solutions:
      - "Verify GPU is being used: nvidia-smi"
      - "Check torch.cuda.is_available()"
      - "Ensure flash-attn installed correctly"
  pod_wont_start:
    symptoms: "Pod stuck in 'Pending' or 'Initializing'"
    solutions:
      - "Try different data center region"
      - "Try different GPU type"
      - "Check RunPod status page"
  checkpoint_lost:
    symptoms: "Can't find saved model after restart"
    cause: "Saved to Container Disk instead of Volume"
    prevention: "ALWAYS use /workspace/ for important files"
# External documentation and sibling docs in this repo.
references:
  runpod_docs: "https://docs.runpod.io/"
  unsloth_github: "https://github.com/unslothai/unsloth"
  context7_runpod: "/runpod/docs"
  related_docs:
    - "ai-docs/finetune.yaml - Complete training configuration"
    - "ai-docs/embedding.yaml - Embedding pipeline"
    - "ai-docs/project-status.yaml - Phase 8 status"