# Source: llm-training / ai-docs / runpod.yaml
# Uploaded by percyraskova via huggingface_hub (commit 81b3473, verified)
# RunPod.io Setup Guide for LLM Fine-Tuning
# Optimized for DeepSeek 7B Abliterated with Unsloth QLoRA
# Status: READY - verified configuration for Phase 8
---
# High-level summary: what this guide covers and what a full run costs.
overview:
  purpose: |
    Step-by-step guide for deploying a RunPod GPU pod to fine-tune
    DeepSeek-R1-Distill-Qwen-7B-abliterated on ProleWiki corpus using Unsloth.
  estimated_cost: "$0.30-0.60 for complete training run (~30 min)"
  # Ordered end-to-end workflow; details for each item are in
  # step_by_step_setup below.
  workflow_summary:
    - Create pod with PyTorch 2.4 template
    - Install Unsloth and dependencies
    - Upload training data (JSONL chunks)
    - Run SFT training (~20-30 min)
    - Export GGUF model
    - Download and deploy to Ollama
    - STOP POD immediately after download
# GPU choice: price/VRAM trade-offs for a 7B QLoRA run.
gpu_selection:
  recommended: RTX 4090
  vram_required: "16-18GB with Unsloth QLoRA"
  note: |
    Unsloth's QLoRA reduces 7B model VRAM from ~24GB to ~16-18GB.
    RTX 4090 (24GB) provides comfortable headroom.
  options:
    rtx_4090:
      vram: 24GB
      price_spot: "$0.40-0.50/hr"
      price_ondemand: "$0.50-0.60/hr"
      recommendation: "Best value - sufficient VRAM, fast training"
    a40:
      vram: 48GB
      price_spot: "$0.45-0.55/hr"
      price_ondemand: "$0.50-0.65/hr"
      recommendation: "More headroom, similar price"
    rtx_3090:
      vram: 24GB
      price_spot: "$0.25-0.35/hr"
      price_ondemand: "$0.30-0.40/hr"
      recommendation: "Budget option, slightly older"
    a100_40gb:
      vram: 40GB
      price_spot: "$0.80-1.00/hr"
      price_ondemand: "$1.00-1.50/hr"
      recommendation: "Overkill for 7B, use for larger models"

# Billing model choice: interruptible (Spot) vs guaranteed (On-Demand).
spot_vs_ondemand:
  spot:
    pros: "30-50% cheaper"
    cons: "May be interrupted if demand spikes"
    best_for: "Long training runs where checkpoints save progress"
  ondemand:
    pros: "Guaranteed availability"
    cons: "Full price"
    best_for: "Short runs (<1hr) like our 30-min training"
  recommendation: |
    For ProleWiki fine-tuning (~30 min), use On-Demand.
    Spot interruption would cost more in setup time than savings.
# Pod template, disks, ports, and environment variables.
pod_configuration:
  template: "RunPod PyTorch 2.4"
  alternative: "RunPod PyTorch 2.8 (if available)"
  template_includes:
    - PyTorch 2.4
    - CUDA 12.4
    - cuDNN
    - JupyterLab
    - SSH access
    - Python 3.10+
  storage:
    # Container disk is wiped on restart; only scratch data belongs here.
    container_disk:
      size: "50GB minimum"
      purpose: "Ephemeral - Unsloth, model weights during training"
      warning: "LOST on pod restart!"
    # Volume disk survives restarts (and pod deletion) — use for anything
    # that must persist.
    volume_disk:
      size: "100GB minimum"
      purpose: "Persistent - checkpoints, scripts, training data"
      critical: "ALL important files must go here!"
      mount_path: "/workspace"
  ports:
    - port: 8888
      purpose: "JupyterLab (primary interface)"
    - port: 22
      purpose: "SSH (optional, for terminal access)"
  environment_variables:
    required:
      HF_TOKEN: "Your Hugging Face token (for gated models)"
    optional:
      JUPYTER_PASSWORD: "Secure notebook access"
      WANDB_API_KEY: "If using Weights & Biases logging"
    secure_secrets:
      note: |
        Use RUNPOD_SECRET_ prefix for encrypted secrets:
        RUNPOD_SECRET_HF_TOKEN will be injected securely.
# Ordered walkthrough: pod creation through shutdown. Follow steps in order.
step_by_step_setup:
  step_1_create_pod:
    description: "Create GPU pod from RunPod dashboard"
    actions:
      - "Go to https://runpod.io/console/pods"
      - "Click '+ Deploy' or 'New Pod'"
      - "Select GPU: RTX 4090 (or A40)"
      - "Click 'Change Template' → search 'PyTorch'"
      - "Select 'RunPod PyTorch 2.4'"
      - "Set Container Disk: 50 GB"
      - "Set Volume Disk: 100 GB"
      - "Expand 'Environment Variables'"
      - "Add: HF_TOKEN = your_token"
      - "Click 'Deploy On-Demand' (not Spot for short runs)"
  step_2_connect:
    description: "Connect to running pod"
    actions:
      - "Wait for pod status: 'Running' (usually <1 min)"
      - "Click 'Connect' button"
      - "Select 'Jupyter Lab' (opens in new tab)"
      - "Or select 'SSH' for terminal access"
  step_3_install_unsloth:
    description: "Install Unsloth and dependencies in JupyterLab terminal"
    commands: |
      # Verify CUDA is working
      nvidia-smi
      python -c "import torch; print(f'CUDA: {torch.cuda.get_device_name()}')"
      # Install Unsloth (auto-detects CUDA version)
      pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
      # Install flash-attention (may take a few minutes to compile)
      pip install --no-deps packaging ninja einops "flash-attn>=2.6.3"
      # Install training dependencies
      pip install trl>=0.7.0 datasets accelerate bitsandbytes peft
      # Install tiktoken for data transformation
      pip install tiktoken
      # Verify installation
      python -c "from unsloth import FastLanguageModel; print('Unsloth ready!')"
    troubleshooting:
      flash_attn_fails: |
        If flash-attn compilation fails, try:
        pip install flash-attn --no-build-isolation
      cuda_version_mismatch: |
        If CUDA errors occur, specify version explicitly:
        pip install "unsloth[cu124] @ git+https://github.com/unslothai/unsloth.git"
  step_4_upload_data:
    description: "Upload training data to pod"
    option_a_jupyterlab:
      best_for: "Small datasets (<100MB)"
      steps:
        - "In JupyterLab file browser (left sidebar)"
        - "Navigate to /workspace"
        - "Create folder: 'data'"
        - "Click upload icon (up arrow)"
        - "Select your JSONL chunks file"
    option_b_wget:
      best_for: "Data hosted on web"
      command: |
        mkdir -p /workspace/data
        wget https://your-url/library_chunks.jsonl -O /workspace/data/chunks.jsonl
    option_c_huggingface:
      best_for: "Dataset on Hugging Face"
      command: |
        huggingface-cli download your-user/prolewiki-chunks \
          --local-dir /workspace/data \
          --token $HF_TOKEN
    option_d_scp:
      best_for: "From local machine via SSH"
      command: |
        # Get SSH command from RunPod 'Connect' dropdown
        scp -P 22XXX library_chunks.jsonl root@pod-ip:/workspace/data/
  step_5_run_training:
    description: "Execute fine-tuning script"
    note: "See ai-docs/finetune.yaml for complete training code"
    minimal_script: |
      from unsloth import FastLanguageModel
      import torch
      # Load abliterated model
      model, tokenizer = FastLanguageModel.from_pretrained(
          model_name="huihui-ai/DeepSeek-R1-Distill-Qwen-7B-abliterated",
          max_seq_length=2048,
          load_in_4bit=True,
          dtype=None,  # Auto-detect
      )
      # Apply LoRA
      model = FastLanguageModel.get_peft_model(
          model,
          r=16,
          lora_alpha=32,
          target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                          "gate_proj", "up_proj", "down_proj"],
          lora_dropout=0.05,
      )
      # Load and train (see finetune.yaml for full code)
      # ...
      # Save checkpoint to Volume disk!
      model.save_pretrained("/workspace/checkpoints/marxist-deepseek-lora")
    expected_time: "20-30 minutes for ~1,000 samples, 3 epochs"
  step_6_export_gguf:
    description: "Export model to GGUF format for Ollama"
    command: |
      # Export with q4_k_m quantization (good balance)
      model.save_pretrained_gguf(
          "/workspace/exports/marxist-deepseek",
          tokenizer,
          quantization_method="q4_k_m"
      )
      # Check output
      ls -lh /workspace/exports/
    output_size: "~4GB for 7B q4_k_m"
    quantization_options:
      q4_k_m: "Recommended - good quality/size balance (~4GB)"
      q5_k_m: "Higher quality, larger (~5GB)"
      q8_0: "Best quality, largest (~7GB)"
  step_7_download_model:
    description: "Download GGUF to local machine"
    option_a_jupyterlab:
      steps:
        - "In JupyterLab file browser"
        - "Navigate to /workspace/exports/"
        - "Right-click the .gguf file"
        - "Select 'Download'"
    option_b_runpodctl:
      command: |
        # Install runpodctl locally first
        # https://github.com/runpod/runpodctl
        runpodctl receive /workspace/exports/marxist-deepseek-q4_k_m.gguf
  step_8_stop_pod:
    description: "CRITICAL - Stop pod to avoid charges"
    warning: "Billing continues until pod is stopped!"
    actions:
      - "Verify GGUF downloaded successfully to local machine"
      - "In RunPod dashboard, click 'Stop' on your pod"
      - "Wait for status: 'Stopped'"
      - "Delete pod if you don't need it again"
      - "Volume disk data persists even after pod deletion"
# Local deployment: turn the downloaded GGUF into a runnable Ollama model.
ollama_deployment:
  description: "Deploy GGUF to local Ollama after download"
  steps:
    - step: "Create Modelfile"
      content: |
        # Save as: Modelfile.marxist-deepseek
        FROM ./marxist-deepseek-q4_k_m.gguf
        TEMPLATE """<|im_start|>system
        {{ .System }}<|im_end|>
        <|im_start|>user
        {{ .Prompt }}<|im_end|>
        <|im_start|>assistant
        {{ .Response }}<|im_end|>"""
        SYSTEM "You are a Marxist-Leninist assistant trained on ProleWiki."
        PARAMETER stop "<|im_end|>"
        PARAMETER temperature 0.7
        PARAMETER top_p 0.9
    - step: "Create Ollama model"
      command: "ollama create marxist-deepseek -f Modelfile.marxist-deepseek"
    - step: "Test model"
      command: "ollama run marxist-deepseek 'Explain dialectical materialism.'"
# Worked cost example plus time budget for a complete run.
cost_summary:
  example_run:
    gpu: "RTX 4090 On-Demand"
    rate: "$0.55/hr"
    time: "30 minutes"
    total: "$0.28"
  breakdown:
    setup: "5 min - Pod creation, Unsloth install"
    upload: "2 min - Data transfer"
    training: "20-25 min - SFT with QLoRA"
    export: "3 min - GGUF conversion"
    download: "5 min - Transfer GGUF locally"
    total_time: "~35-40 min"
  tips:
    - "Use On-Demand for short runs (<1hr)"
    - "Use Spot for long runs with checkpoint saving"
    - "Stop pod IMMEDIATELY after download"
    - "Delete pod after confirming success"
    - "Volume disk persists - can restart training later"
# Common failure modes during a run and how to resolve them.
troubleshooting:
  out_of_memory:
    symptoms: "CUDA OOM, kernel dies"
    solutions:
      - "Reduce batch size in training args"
      - "Ensure load_in_4bit=True"
      - "Use gradient_checkpointing=True"
      - "Upgrade to A40 (48GB VRAM)"
  slow_training:
    symptoms: "Steps/sec much lower than expected"
    solutions:
      - "Verify GPU is being used: nvidia-smi"
      - "Check torch.cuda.is_available()"
      - "Ensure flash-attn installed correctly"
  pod_wont_start:
    symptoms: "Pod stuck in 'Pending' or 'Initializing'"
    solutions:
      - "Try different data center region"
      - "Try different GPU type"
      - "Check RunPod status page"
  checkpoint_lost:
    symptoms: "Can't find saved model after restart"
    cause: "Saved to Container Disk instead of Volume"
    prevention: "ALWAYS use /workspace/ for important files"
# External documentation and sibling docs in this repo.
references:
  runpod_docs: "https://docs.runpod.io/"
  unsloth_github: "https://github.com/unslothai/unsloth"
  context7_runpod: "/runpod/docs"
  related_docs:
    - "ai-docs/finetune.yaml - Complete training configuration"
    - "ai-docs/embedding.yaml - Embedding pipeline"
    - "ai-docs/project-status.yaml - Phase 8 status"