import gradio as gr # ============================================================ # CONTENT β€” each chapter stored as a Python string (Markdown) # ============================================================ OVERVIEW = r""" # πŸ“– The Complete Guide to Post-Training of Large Language Models ### From Pretraining to Alignment β€” Everything You Need to Know --- > **Who is this for?** You've learned how pretraining works β€” you understand GPT-2, transformer architectures, next-token prediction, and the cross-entropy loss. Now you want to understand what happens *after* pretraining: how raw language models become helpful assistants like ChatGPT, Claude, and Gemini. --- ## πŸ—ΊοΈ Roadmap | # | Chapter | What You'll Learn | |---|---------|-------------------| | 1 | **The Big Picture** | Why pretrained models aren't useful yet; the 3-stage pipeline | | 2 | **SFT** | Supervised Fine-Tuning β€” loss function, data formats, key papers | | 3 | **RLHF** | Reward models, PPO, KL divergence, reward hacking | | 4 | **DPO** | Direct Preference Optimization β€” RLHF without RL | | 5 | **Preference Zoo** | KTO, ORPO, SimPO, CPO, IPO, Online DPO | | 6 | **GRPO & Reasoning** | DeepSeek-R1, reward functions, the reasoning revolution | | 7 | **PEFT** | LoRA, QLoRA β€” fine-tune on consumer GPUs | | 8 | **Toolbox** | TRL, Transformers, vLLM, Accelerate, DeepSpeed | | 9 | **Datasets** | What to train on β€” curated lists with Hub links | | 10 | **Evaluation** | Benchmarks, LLM-as-Judge, human eval | | 11 | **Full Recipe** | End-to-end pipeline with code | | 12 | **Reading List** | 18 must-read papers in 4 tiers | --- ### The Three Stages of Post-Training ``` β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ STAGE 1: SFT β”‚ ──> β”‚ STAGE 2: Reward β”‚ ──> β”‚ STAGE 3: RL β”‚ β”‚ β”‚ β”‚ Model Training β”‚ β”‚ (PPO / DPO / GRPO) β”‚ β”‚ Teach format β”‚ β”‚ Learn preferencesβ”‚ β”‚ Optimize for preferencesβ”‚ β”‚ & behavior β”‚ β”‚ from comparisons β”‚ β”‚ while staying close to β”‚ β”‚ β”‚ β”‚ β”‚ β”‚ the SFT model β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ Input: Pretrained LM Output: Aligned Assistant ``` ### The Evolution Timeline | Year | Method | Key Idea | |------|--------|----------| | 2017 | RLHF (original) | Human preferences β†’ reward model β†’ RL | | 2020 | RLHF for LLMs | Applied to text summarization | | 2022 | **InstructGPT** | Full SFT β†’ RM β†’ PPO pipeline | | 2022 | Constitutional AI | AI feedback replaces human feedback | | 2023 | **DPO** | No reward model needed β€” direct optimization | | 2024 | KTO / ORPO | Binary feedback / combined SFT+preference | | 2024 | **GRPO** | Group-based RL for reasoning (DeepSeek) | | 2025 | DeepSeek-R1 | RL teaches chain-of-thought from scratch | """ CH1_BIG_PICTURE = r""" # Chapter 1: The Big Picture β€” Why Post-Training Exists ## 1.1 The Gap Between Pretraining and Usefulness You've pretrained a language model. It can predict the next token with impressive accuracy. It has absorbed vast knowledge from the internet. But try asking it a question: ``` User: What is the capital of France? Model: What is the capital of Germany? What is the capital of Italy? What is the... ``` The model doesn't *answer* β€” it *continues*. The pretraining objective (`P(next_token | context)`) optimizes for predicting what comes next in web text, not for being helpful. > *"Large language models can generate outputs that are untruthful, toxic, or simply not helpful to the user. In other words, these models are not aligned with their users."* > β€” **InstructGPT** (Ouyang et al., 2022) ## 1.2 What Post-Training Does Post-training is everything after pretraining that makes a model useful, safe, and aligned. It has three stages: **Stage 1 β€” SFT (Supervised Fine-Tuning):** Teach the model the *format* of helpful responses using demonstrations. **Stage 2 β€” Reward Modeling:** Train a model to predict which response a human would prefer. **Stage 3 β€” RL Optimization:** Optimize the SFT model to generate responses the reward model scores highly. ## 1.3 The Superficial Alignment Hypothesis > *"A model's knowledge and capabilities are learnt almost entirely during pretraining, while alignment teaches it which subdistribution of formats should be used when interacting with users."* > β€” **LIMA** (Zhou et al., 2023) This is the key insight: **post-training doesn't teach new knowledge** β€” it teaches the model to *surface existing knowledge in the right way*. The pretrained model already "knows" the capital of France; SFT teaches it to respond to questions rather than generate more questions. ## 1.4 Why This Matters The difference is dramatic: - **InstructGPT 1.3B** (post-trained) was preferred over **GPT-3 175B** (pretrained only) by human evaluators - That's a 100Γ— smaller model winning because of post-training - Post-training is what turns a text predictor into an assistant """ CH2_SFT = r""" # Chapter 2: Supervised Fine-Tuning (SFT) ## 2.1 What SFT Does SFT is the bridge between a pretrained language model and a useful assistant. **Before SFT:** ``` Input: "Explain quantum computing in simple terms." Output: "Explain quantum computing to a 5-year-old. Explain quantum computing..." ``` **After SFT:** ``` Input: "Explain quantum computing in simple terms." Output: "Quantum computing uses the principles of quantum mechanics to process information. Unlike classical computers that use bits (0 or 1), quantum computers use qubits that can be both 0 and 1 simultaneously..." ``` ## 2.2 The SFT Loss Function If you understand pretraining, you understand SFT β€” with one crucial difference: **Pretraining loss** (on ALL tokens): ``` L_pretrain = -Ξ£ log P(token_i | token_1, ..., token_{i-1}) for ALL tokens ``` **SFT loss** (on RESPONSE tokens only): ``` L_SFT = -Ξ£ log P(c_i | prompt, c_1, ..., c_{i-1}) for COMPLETION tokens only ``` The prompt tokens are masked from the loss. We don't want the model to learn to generate instructions β€” we want it to learn to *respond* to them. ``` Sequence: [User: What is 2+2?] [Assistant: 4] Loss mask: [ ----IGNORED---- ] [COMPUTED HERE ] ``` ## 2.3 Chat Templates & Data Format Modern SFT uses structured conversations: ```python { "messages": [ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "What is the capital of France?"}, {"role": "assistant", "content": "The capital of France is Paris."} ] } ``` Each model family converts this to its own template: ``` # ChatML (Qwen, etc.): <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user What is the capital of France?<|im_end|> <|im_start|>assistant The capital of France is Paris.<|im_end|> # Llama-3: <|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a helpful assistant.<|eot_id|> <|start_header_id|>user<|end_header_id|> What is the capital of France?<|eot_id|> ``` The `transformers` library handles this automatically: ```python from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct") messages = [ {"role": "user", "content": "What is 2+2?"}, {"role": "assistant", "content": "4"} ] # For training: text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False) # For inference: text = tokenizer.apply_chat_template(messages[:1], tokenize=False, add_generation_prompt=True) ``` ## 2.4 Key SFT Papers ### FLAN (2021) β€” Instruction Tuning Works πŸ“„ *"Finetuned Language Models Are Zero-Shot Learners"* β€” [arXiv:2109.01652](https://arxiv.org/abs/2109.01652) - 62 NLP datasets formatted as instructions β†’ fine-tune LaMDA-PT 137B - **Result:** Surpassed zero-shot GPT-3 175B on 20/25 tasks - **Key insight:** Instructions matter β€” same tasks without instructions = much weaker - **Recipe:** Adafactor, lr=3e-5, 30K steps, batch 8192 tokens ### Self-Instruct (2022) β€” Synthetic Data πŸ“„ *"Self-Instruct"* β€” [arXiv:2212.10560](https://arxiv.org/abs/2212.10560) - 175 seed tasks β†’ GPT-3 generates 52K instructions + responses - **Result:** +33% over vanilla GPT-3 on SuperNaturalInstructions - **Led to:** Stanford Alpaca (LLaMA + 52K GPT instructions for <$600) ### InstructGPT (2022) β€” SFT as Stage 1 πŸ“„ *"Training Language Models to Follow Instructions with Human Feedback"* β€” [arXiv:2203.02155](https://arxiv.org/abs/2203.02155) - ~13K human demonstrations, 16 epochs, cosine LR, dropout 0.2 - **Key finding:** SFT overfits on val loss after 1 epoch, but more epochs ↑ RM score - **Result:** 1.3B InstructGPT preferred over 175B GPT-3 ### LIMA (2023) β€” Less Is More πŸ“„ *"LIMA: Less Is More for Alignment"* β€” [arXiv:2305.11206](https://arxiv.org/abs/2305.11206) - Just **1,000 curated examples** β†’ competitive with GPT-3.5 (DaVinci003) - **Recipe:** AdamW, lr 1e-5β†’1e-6, 15 epochs, batch 32, max len 2048 - **Takeaway:** Data quality >> data quantity ## 2.5 SFT Code Example ```python from trl import SFTTrainer, SFTConfig from datasets import load_dataset dataset = load_dataset("trl-lib/Capybara", split="train") config = SFTConfig( output_dir="./sft-output", num_train_epochs=3, per_device_train_batch_size=4, learning_rate=2e-5, max_seq_length=2048, gradient_checkpointing=True, bf16=True, logging_steps=10, push_to_hub=True, hub_model_id="your-username/your-sft-model", ) trainer = SFTTrainer( model="Qwen/Qwen3-0.6B", args=config, train_dataset=dataset, ) trainer.train() ``` SFTTrainer automatically detects the `messages` column, applies the chat template, and masks prompt tokens from the loss. """ CH3_RLHF = r""" # Chapter 3: Reinforcement Learning from Human Feedback (RLHF) ## 3.1 Why SFT Isn't Enough SFT teaches format and basic behavior, but: - It only learns from demonstrations (can't be better than its training data) - It can't express preferences (treats all tokens equally) - It can learn bad habits from imperfect training data RLHF addresses this by training on **which outputs are better**, not what specific tokens to generate. ## 3.2 Step 1: Train a Reward Model A **reward model (RM)** scores how good a response is (prompt + response β†’ scalar score). **Training process:** 1. Generate multiple responses per prompt using the SFT model 2. Humans rank responses (e.g., A > B) 3. Train RM to predict these rankings **The Bradley-Terry preference model:** ``` P(A preferred over B) = Οƒ(r(A) - r(B)) Loss: L_RM = -E[log Οƒ(r(x, y_chosen) - r(x, y_rejected))] ``` **Architecture:** Same as the LM, but output head β†’ single scalar. InstructGPT used a 6B RM (not 175B β€” larger was unstable). ## 3.3 Step 2: Optimize with PPO **The RLHF objective:** ``` maximize E[RM(prompt, response)] - Ξ² Β· KL(Ο€_ΞΈ || Ο€_ref) ↑ score high ↑ don't deviate too far on reward model from original SFT model ``` The **KL penalty** prevents **reward hacking** β€” without it, the model generates gibberish that tricks the RM. **PPO (Proximal Policy Optimization) loop:** 1. **Generate** responses from the current model 2. **Score** them with the reward model 3. **Compute advantage** (how much better than expected) 4. **Update** weights to favor high-advantage responses 5. **Clip** updates for stability ``` L_PPO = -E[min(ratio Β· Γ‚, clip(ratio, 1-Ξ΅, 1+Ξ΅) Β· Γ‚)] where ratio = Ο€_ΞΈ(a|s) / Ο€_old(a|s) ``` ## 3.4 InstructGPT Training Details - Ξ² = 0.02 for KL penalty - Mixed 10% pretraining data during PPO (prevents capability regression) - LR range: 2.55e-6 to 2.55e-5 (>8.05e-6 diverged) - 256K PPO episodes total - 4 models in memory simultaneously: policy, reference, reward, value ## 3.5 Why RLHF is Hard | Challenge | Description | |-----------|-------------| | **Complexity** | 4 models in memory simultaneously | | **Instability** | PPO is sensitive to hyperparameters | | **Reward hacking** | Model exploits RM rather than genuinely improving | | **Cost** | Human preference data is expensive | | **Reproducibility** | Small changes β†’ very different outcomes | These challenges motivated the development of DPO. ## 3.6 Constitutional AI (RLAIF) πŸ“„ *"Constitutional AI"* (Bai et al., 2022) β€” [arXiv:2212.08073](https://arxiv.org/abs/2212.08073) **Key idea:** Replace human feedback with **AI feedback**. An AI evaluates responses against principles (the "constitution"). Dramatically reduces cost and scales the feedback process. """ CH4_DPO = r""" # Chapter 4: Direct Preference Optimization (DPO) ## 4.1 The Key Insight πŸ“„ *"Direct Preference Optimization: Your Language Model is Secretly a Reward Model"* β€” [arXiv:2305.18290](https://arxiv.org/abs/2305.18290) **You don't need a separate reward model or RL training loop.** The language model itself implicitly represents a reward model. The optimal solution to the RLHF objective can be expressed in closed form: ``` Ο€*(y|x) = (1/Z(x)) Β· Ο€_ref(y|x) Β· exp((1/Ξ²) Β· r(x,y)) ``` Rearranging to express reward in terms of the policy: ``` r(x,y) = Ξ² Β· log(Ο€_ΞΈ(y|x) / Ο€_ref(y|x)) + Ξ² Β· log Z(x) ``` Since Bradley-Terry only uses **reward differences**, the partition function Z(x) cancels: ``` L_DPO = -E[log Οƒ(Ξ² Β· log(Ο€_ΞΈ(y_w|x)/Ο€_ref(y_w|x)) - Ξ² Β· log(Ο€_ΞΈ(y_l|x)/Ο€_ref(y_l|x)))] ``` ## 4.2 DPO vs RLHF | Aspect | RLHF (PPO) | DPO | |--------|-------------|-----| | Models in memory | 4 | 2 (policy + reference) | | Training loop | Complex RL with generation | Simple supervised training | | Hyperparameters | Many | Few (mainly Ξ²) | | Stability | Often unstable | Very stable | | Sampling during training | Required | Not required | | Performance | Strong | Comparable or better | ## 4.3 The DPO Gradient β€” Intuition ``` βˆ‡L_DPO ∝ -Ξ² Β· [weight] Β· [βˆ‡log Ο€(y_w|x) - βˆ‡log Ο€(y_l|x)] ``` - **Increase** likelihood of preferred response y_w - **Decrease** likelihood of rejected response y_l - **Weight** by how wrong the model currently is If the model already prefers y_w correctly β†’ small gradient (don't fix what isn't broken). ## 4.4 DPO Data Format ```python # Each example needs: prompt + chosen + rejected { "prompt": [{"role": "user", "content": "Explain gravity"}], "chosen": [{"role": "assistant", "content": "Gravity is a fundamental force..."}], "rejected": [{"role": "assistant", "content": "Gravity is when things fall down."}] } ``` ## 4.5 DPO Code Example ```python from trl import DPOTrainer, DPOConfig from datasets import load_dataset dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train") config = DPOConfig( output_dir="./dpo-output", num_train_epochs=1, per_device_train_batch_size=4, learning_rate=5e-7, # Much lower than SFT! beta=0.1, # KL penalty strength bf16=True, gradient_checkpointing=True, push_to_hub=True, hub_model_id="your-username/your-dpo-model", ) trainer = DPOTrainer( model="your-sft-model", # SFT model from stage 1 args=config, train_dataset=dataset, ) trainer.train() ``` ## 4.6 Key Hyperparameters | Parameter | Typical Range | Notes | |-----------|--------------|-------| | **Ξ² (beta)** | 0.01 – 0.5 | Higher = stay closer to reference. Default: 0.1 | | **Learning rate** | 1e-7 – 5e-6 | Much lower than SFT. Very sensitive. | | **Epochs** | 1 – 3 | Overfitting is common with more | """ CH5_PREFERENCE_ZOO = r""" # Chapter 5: The Preference Optimization Zoo After DPO, researchers developed many variants. Here's what each adds. ## 5.1 IPO β€” Identity Preference Optimization **Problem:** DPO can overfit, especially with noisy preferences. **Solution:** Adds regularization without assuming the Bradley-Terry model: ``` L_IPO = E[(log(Ο€_ΞΈ(y_w)/Ο€_ref(y_w)) - log(Ο€_ΞΈ(y_l)/Ο€_ref(y_l)) - 1/(2Ξ²))Β²] ``` **When to use:** Noisy preference data, DPO overfitting. ## 5.2 KTO β€” Kahneman-Tversky Optimization πŸ“„ [arXiv:2402.01306](https://arxiv.org/abs/2402.01306) **Problem:** DPO needs *paired* preferences (chosen AND rejected per prompt). Expensive to collect. **Solution:** Works with **unpaired binary feedback** β€” just "good" or "bad" per response. Based on prospect theory: losses hurt more than equivalent gains. ```python {"prompt": "...", "completion": "...", "label": True} # πŸ‘ {"prompt": "...", "completion": "...", "label": False} # πŸ‘Ž ``` **When to use:** Thumbs-up/down feedback, no pairwise comparisons. ## 5.3 ORPO β€” Odds Ratio Preference Optimization **Problem:** DPO needs a separate SFT stage + reference model. **Solution:** Combines SFT and preference optimization in **one step**: ``` L_ORPO = L_SFT + Ξ» Β· L_OR ``` **When to use:** Simplest possible pipeline. ## 5.4 SimPO β€” Simple Preference Optimization **Problem:** DPO needs a reference model in memory (doubles GPU needs). **Solution:** Eliminates reference model. Uses **average log probability** as the implicit reward (length-normalized). **When to use:** GPU memory constraints. ## 5.5 CPO β€” Contrastive Preference Optimization Removes reference model using a contrastive loss. Similar to SimPO, different formulation. ## 5.6 Online DPO **Problem:** Standard DPO uses a static dataset β€” data goes stale as the model changes. **Solution:** Generates new completions from the *current* model during training, scored by a reward model. ## 5.7 Comparison Table | Method | Ref Model? | Paired Data? | Needs RM? | Separate SFT? | Key Advantage | |--------|-----------|-------------|-----------|---------------|---------------| | PPO | Yes | No | **Yes** | Yes | Gold standard | | DPO | Yes | **Yes** | No | Yes | Simple, stable | | IPO | Yes | Yes | No | Yes | Robust to noise | | KTO | Yes | **No** | No | Yes | Binary feedback | | ORPO | **No** | Yes | No | **No** | Simplest pipeline | | SimPO | **No** | Yes | No | Yes | Memory efficient | | CPO | **No** | Yes | No | Yes | Memory efficient | | Online DPO | Yes | Online | **Yes** | Yes | Fresh data | | GRPO | Yes (soft) | No | **Yes** / funcs | Yes | Best for reasoning | """ CH6_GRPO = r""" # Chapter 6: GRPO and the Reasoning Revolution ## 6.1 What is GRPO? πŸ“„ *"DeepSeekMath"* (Shao et al., 2024) β€” [arXiv:2402.03300](https://arxiv.org/abs/2402.03300) **Group Relative Policy Optimization** β€” a PPO variant that's more memory-efficient and great for reasoning. **Key idea:** Instead of a separate value model (critic), GRPO generates **multiple completions per prompt** and uses the group average as the baseline. ## 6.2 How GRPO Works ``` For each prompt: 1. Generate G completions (e.g., G=16) 2. Score each with a reward function 3. Compute advantage: Γ‚_i = (r_i - mean(r)) / std(r) 4. Update model: ↑ probability of high-advantage completions ↓ probability of low-advantage completions ``` **The GRPO loss:** ``` L = -E[min(ratio Β· Γ‚, clip(ratio, 1-Ξ΅, 1+Ξ΅) Β· Γ‚)] + Ξ² Β· KL ``` "Group Relative" = advantage computed *relative to the group* for the same prompt. ## 6.3 The DeepSeek-R1 Story πŸ“„ *"DeepSeek-R1"* (DeepSeek-AI, 2025) β€” [arXiv:2501.12948](https://arxiv.org/abs/2501.12948) GRPO was used to train DeepSeek-R1 β€” a model that learned chain-of-thought reasoning **purely through RL**. With the right reward (accuracy on math/code), the model **spontaneously developed:** - Chain-of-thought reasoning - Self-verification ("Let me check...") - Error correction - Problem decomposition No one explicitly taught these behaviors β€” they emerged from the reward signal. ## 6.4 Reward Functions GRPO is flexible β€” rewards can be: 1. **Python functions** (rule-based): Is the math answer correct? Does the code pass tests? 2. **Reward models** (learned): A neural network scoring responses 3. **Multiple functions**: accuracy + format + length ```python import re def accuracy_reward(completions, ground_truth, **kwargs): matches = [re.search(r"\\boxed\{(.*?)\}", c) for c in completions] contents = [m.group(1) if m else "" for m in matches] return [1.0 if c == gt else 0.0 for c, gt in zip(contents, ground_truth)] def format_reward(completions, **kwargs): pattern = r".*?.*.*?" return [1.0 if re.match(pattern, c, re.DOTALL) else 0.0 for c in completions] ``` ## 6.5 GRPO Code Example ```python from trl import GRPOTrainer, GRPOConfig from datasets import load_dataset from trl.rewards import accuracy_reward dataset = load_dataset("trl-lib/DeepMath-103K", split="train") config = GRPOConfig( output_dir="./grpo-output", learning_rate=1e-6, per_device_train_batch_size=4, num_generations=16, # G completions per prompt max_completion_length=512, bf16=True, gradient_checkpointing=True, ) trainer = GRPOTrainer( model="Qwen/Qwen2.5-0.5B-Instruct", reward_funcs=accuracy_reward, args=config, train_dataset=dataset, ) trainer.train() ``` """ CH7_PEFT = r""" # Chapter 7: Parameter-Efficient Fine-Tuning (PEFT) ## 7.1 The Memory Problem Fine-tuning 7B parameters requires: | Component | Memory | |-----------|--------| | Model weights (bf16) | 14 GB | | Gradients | 14 GB | | Optimizer states (AdamW) | 28 GB | | Activations | 10-30 GB | | **Total** | **~60-80 GB** | That's one A100 for SFT alone. RLHF with PPO (4 models) = 4Γ— this. ## 7.2 LoRA: Low-Rank Adaptation πŸ“„ [arXiv:2106.09685](https://arxiv.org/abs/2106.09685) **Insight:** Fine-tuning weight updates have low rank β€” approximate them with small matrices. ``` W' = W + Ξ± Β· B Γ— A W: original frozen weight (d Γ— d) β€” NOT trained A: down projection (d Γ— r) β€” trained B: up projection (r Γ— d) β€” trained r: rank (typically 8-32) r << d ``` **Example:** 4096 Γ— 4096 weight matrix: - Full fine-tuning: **16.7M** parameters - LoRA (r=16): 2 Γ— 4096 Γ— 16 = **131K** parameters (128Γ— fewer!) ## 7.3 QLoRA: Quantized LoRA πŸ“„ [arXiv:2305.14314](https://arxiv.org/abs/2305.14314) Quantize the frozen base model to **4-bit**, add LoRA adapters in bf16. - 7B model: ~4 GB (quantized) + small LoRA adapters - Fine-tune 7B on a single RTX 4090 (24 GB)! ## 7.4 LoRA with TRL ```python from peft import LoraConfig from trl import SFTTrainer, SFTConfig lora_config = LoraConfig( r=16, # Rank lora_alpha=32, # Scaling (usually 2Γ—r) lora_dropout=0.05, target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], task_type="CAUSAL_LM", ) config = SFTConfig( output_dir="./sft-lora", num_train_epochs=3, learning_rate=2e-4, # Higher LR for LoRA bf16=True, gradient_checkpointing=True, ) trainer = SFTTrainer( model="meta-llama/Llama-3.1-8B", args=config, train_dataset=dataset, peft_config=lora_config, # ← pass LoRA config ) trainer.train() ``` ## 7.5 When to Use What | Scenario | Recommendation | |----------|---------------| | Limited GPU memory | LoRA / QLoRA | | Quick experiment | LoRA | | Maximum quality | Full fine-tuning | | Multiple adapters from same base | LoRA (swap adapters) | | Very small dataset | LoRA (acts as regularizer) | **LoRA is ~95-99% as good as full fine-tuning** at a fraction of the compute. """ CH8_TOOLBOX = r""" # Chapter 8: The Toolbox ## 8.1 TRL β€” Transformers Reinforcement Learning πŸ”— [github.com/huggingface/trl](https://github.com/huggingface/trl) Β· [Docs](https://huggingface.co/docs/trl) The central library for all post-training methods: | Trainer | Method | Dataset Type | |---------|--------|-------------| | `SFTTrainer` | Supervised Fine-Tuning | `messages` or `prompt`+`completion` | | `DPOTrainer` | Direct Preference Optimization | `prompt`+`chosen`+`rejected` | | `GRPOTrainer` | Group Relative Policy Optimization | `prompt` only | | `RLOOTrainer` | REINFORCE Leave-One-Out | `prompt` only | | `RewardTrainer` | Reward Model Training | `prompt`+`chosen`+`rejected` | | `KTOTrainer` | Kahneman-Tversky Optimization | `prompt`+`completion`+`label` | | `ORPOTrainer` | Odds Ratio Preference Opt. | `prompt`+`chosen`+`rejected` | | `CPOTrainer` | Contrastive Preference Opt. | `prompt`+`chosen`+`rejected` | | `OnlineDPOTrainer` | Online DPO | `prompt` only | | `PPOTrainer` | Proximal Policy Optimization | tokenized | | `XPOTrainer` | Exploratory Preference Opt. | `prompt` only | | `NashMDTrainer` | Nash Mirror Descent | `prompt` only | | `PRMTrainer` | Process Reward Model | stepwise supervision | ## 8.2 Core Libraries | Library | Purpose | |---------|---------| | **transformers** | Model loading, tokenization, chat templates | | **datasets** | Dataset loading and processing | | **peft** | LoRA, QLoRA, adapters | | **accelerate** | Distributed training, DeepSpeed | | **bitsandbytes** | 4/8-bit quantization | ## 8.3 Inference & Serving | Tool | Purpose | |------|---------| | **vLLM** | High-throughput inference (5-10Γ— faster generation) | | **TGI** | HuggingFace Text Generation Inference | | **Unsloth** | 2-5Γ— faster LoRA training | TRL integrates vLLM directly: ```python config = GRPOConfig(use_vllm=True, vllm_mode="colocate") ``` ## 8.4 Experiment Tracking & Evaluation | Tool | Purpose | |------|---------| | **Weights & Biases** | Experiment tracking | | **Trackio** | HF-native tracking | | **lm-eval-harness** | Standardized LLM benchmarks | | **AlpacaEval** | GPT-4 based evaluation | ## 8.5 TRL CLI Commands ```bash # Install pip install trl # SFT from command line trl sft --model_name_or_path Qwen/Qwen3-0.6B \ --dataset_name trl-lib/Capybara --output_dir ./sft # DPO from command line trl dpo --model_name_or_path your-sft-model \ --dataset_name trl-lib/ultrafeedback_binarized --output_dir ./dpo # GRPO from command line trl grpo --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct \ --dataset_name trl-lib/DeepMath-103K --output_dir ./grpo # Multi-GPU accelerate launch --num_processes 4 train.py # DeepSpeed ZeRO-3 accelerate launch --config_file deepspeed_zero3.yaml train.py ``` """ CH9_DATASETS = r""" # Chapter 9: Datasets β€” What to Train On ## 9.1 SFT Datasets | Dataset | Size | Description | |---------|------|-------------| | [**trl-lib/Capybara**](https://huggingface.co/datasets/trl-lib/Capybara) | ~90K | Multi-turn conversations, high quality | | [**HuggingFaceH4/ultrachat_200k**](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k) | 200K | Diverse multi-turn conversations | | [**allenai/tulu-3-sft-mixture**](https://huggingface.co/datasets/allenai/tulu-3-sft-mixture) | ~1.3M | Large-scale SFT mixture | | [**OpenAssistant/oasst1**](https://huggingface.co/datasets/OpenAssistant/oasst1) | 161K | Crowdsourced conversation trees | | [**tatsu-lab/alpaca**](https://huggingface.co/datasets/tatsu-lab/alpaca) | 52K | GPT-generated instructions | | [**teknium/OpenHermes-2.5**](https://huggingface.co/datasets/teknium/OpenHermes-2.5) | 1M | Large synthetic instructions | ## 9.2 Preference Datasets (DPO / KTO / ORPO) | Dataset | Size | Description | |---------|------|-------------| | [**trl-lib/ultrafeedback_binarized**](https://huggingface.co/datasets/trl-lib/ultrafeedback_binarized) | 60K | Binarized UltraFeedback | | [**Anthropic/hh-rlhf**](https://huggingface.co/datasets/Anthropic/hh-rlhf) | 170K | Helpful + harmless preferences | | [**argilla/ultrafeedback-binarized-preferences**](https://huggingface.co/datasets/argilla/ultrafeedback-binarized-preferences) | 60K | Cleaned UltraFeedback | ## 9.3 Prompt-Only Datasets (GRPO / RLOO) | Dataset | Size | Description | |---------|------|-------------| | [**trl-lib/DeepMath-103K**](https://huggingface.co/datasets/trl-lib/DeepMath-103K) | 103K | Math with verifiable answers | | [**AI-MO/NuminaMath-TIR**](https://huggingface.co/datasets/AI-MO/NuminaMath-TIR) | ~70K | Competition math | ## 9.4 How to Choose 1. **First experiment:** `trl-lib/Capybara` (SFT) or `trl-lib/ultrafeedback_binarized` (DPO) 2. **Quality > Quantity:** LIMA: 1K great > 52K mediocre 3. **Match your use case:** Math model β†’ math data, chat model β†’ diverse conversations 4. **Always inspect first:** ```python from datasets import load_dataset ds = load_dataset("trl-lib/Capybara", split="train") print(ds[0]) # Look at the data! ``` """ CH10_EVALUATION = r""" # Chapter 10: Evaluation β€” How to Know If It Worked ## 10.1 Why Evaluation is Hard - Open-ended outputs can be correct in many ways - Perplexity doesn't correlate with usefulness (LIMA proved this) - Benchmark scores β‰  real-world performance - Human evaluation is expensive and subjective ## 10.2 Automated Benchmarks | Benchmark | Measures | How | |-----------|----------|-----| | **MMLU** | Knowledge (57 subjects) | Multiple choice | | **HellaSwag** | Commonsense reasoning | Sentence completion | | **ARC** | Science reasoning | Multiple choice | | **TruthfulQA** | Truthfulness | Trick questions | | **GSM8K** | Math reasoning | Grade-school word problems | | **MATH** | Advanced math | Competition-level | | **HumanEval** | Code generation | Python problems | | **MBPP** | Code generation | Basic Python | | **IFEval** | Instruction following | Verifiable constraints | **Run with lm-eval-harness:** ```bash lm_eval --model hf \ --model_args pretrained=your-model \ --tasks mmlu,gsm8k,hellaswag \ --batch_size 8 ``` ## 10.3 LLM-as-Judge | Evaluation | Description | |------------|-------------| | **AlpacaEval** | GPT-4 compares to reference | | **MT-Bench** | Multi-turn dialogue, GPT-4 judged | | **Arena Hard** | Challenging prompts, GPT-4 judged | ## 10.4 Human Evaluation - **Side-by-side:** Show two responses, pick the better one - **Likert scale:** Rate helpfulness, accuracy, harmlessness (1-7) - **Chatbot Arena:** [lmarena.ai](https://lmarena.ai/) β€” crowdsourced blind comparisons ## 10.5 The Open LLM Leaderboard πŸ”— [Open LLM Leaderboard](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard) Standardized evaluation of open-source models. The primary way the community tracks progress. """ CH11_RECIPE = r""" # Chapter 11: Putting It All Together ## 11.1 The Standard Recipe (2024-2025) ``` Step 1: Choose Base Model β”œβ”€β”€ Qwen3 (0.6B–235B) β€” Top-performing β”œβ”€β”€ Llama 3.1/3.2 (1B–405B) β€” Meta β”œβ”€β”€ Gemma 3/4 (1B–27B) β€” Google └── Mistral/Mixtral β€” Efficient Step 2: SFT β”œβ”€β”€ Dataset: trl-lib/Capybara or ultrachat_200k β”œβ”€β”€ Method: SFTTrainer + LoRA (or full) β”œβ”€β”€ Epochs: 2-3, LR: 2e-5 (full) / 2e-4 (LoRA) └── Output β†’ reference model for Stage 3 Step 3: Preference Optimization β”œβ”€β”€ DPO (simplest): Ξ²=0.1, LR=5e-7, 1-2 epochs β”œβ”€β”€ GRPO (reasoning): 16 generations, LR=1e-6 └── KTO (binary feedback): unpaired data Step 4: Evaluation β”œβ”€β”€ lm-eval-harness (MMLU, GSM8K) β”œβ”€β”€ MT-Bench / AlpacaEval └── Manual testing ``` ## 11.2 Full Code: SFT β†’ DPO ```python # ═══ STAGE 1: SFT ═══ from trl import SFTTrainer, SFTConfig from datasets import load_dataset sft_dataset = load_dataset("trl-lib/Capybara", split="train") sft_config = SFTConfig( output_dir="./sft-model", num_train_epochs=2, per_device_train_batch_size=4, learning_rate=2e-5, max_seq_length=2048, bf16=True, gradient_checkpointing=True, push_to_hub=True, hub_model_id="your-username/my-sft-model", ) SFTTrainer(model="Qwen/Qwen3-0.6B", args=sft_config, train_dataset=sft_dataset).train() # ═══ STAGE 2: DPO ═══ from trl import DPOTrainer, DPOConfig dpo_dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train") dpo_config = DPOConfig( output_dir="./dpo-model", num_train_epochs=1, per_device_train_batch_size=4, learning_rate=5e-7, beta=0.1, bf16=True, gradient_checkpointing=True, push_to_hub=True, hub_model_id="your-username/my-dpo-model", ) DPOTrainer(model="your-username/my-sft-model", args=dpo_config, train_dataset=dpo_dataset).train() ``` ## 11.3 Hardware Guidelines | Model Size | Minimum | Recommended | With LoRA | |-----------|---------|-------------|-----------| | 0.5–3B | 1Γ— A10G (24GB) | 1Γ— A100 (80GB) | 1Γ— T4 (16GB) | | 7–8B | 1Γ— A100 (80GB) | 2Γ— A100 | 1Γ— A10G (24GB) | | 13B | 2Γ— A100 | 4Γ— A100 | 1Γ— A100 (80GB) | | 70B | 4Γ— A100 | 8Γ— A100 | 2Γ— A100 | """ CH12_READING_LIST = r""" # Chapter 12: The Reading List ## Tier 1: Must-Read (The Foundations) | # | Paper | Year | Why Read It | |---|-------|------|------------| | 1 | [**InstructGPT**](https://arxiv.org/abs/2203.02155) β€” *Training LMs to Follow Instructions with Human Feedback* | 2022 | The SFTβ†’RMβ†’PPO pipeline. Everything starts here. | | 2 | [**DPO**](https://arxiv.org/abs/2305.18290) β€” *Direct Preference Optimization* | 2023 | No reward model, no RL. Most widely used method. | | 3 | [**LoRA**](https://arxiv.org/abs/2106.09685) β€” *Low-Rank Adaptation of Large Language Models* | 2021 | Made fine-tuning accessible. Used everywhere. | | 4 | [**DeepSeek-R1**](https://arxiv.org/abs/2501.12948) β€” *Incentivizing Reasoning via RL* | 2025 | RL teaches reasoning from scratch. The reasoning era. | ## Tier 2: Important (Deeper Understanding) | # | Paper | Year | Why Read It | |---|-------|------|------------| | 5 | [**LIMA**](https://arxiv.org/abs/2305.11206) β€” *Less Is More for Alignment* | 2023 | Data quality >> quantity. | | 6 | [**Constitutional AI**](https://arxiv.org/abs/2212.08073) β€” *Harmlessness from AI Feedback* | 2022 | AI feedback replaces human feedback (RLAIF). | | 7 | [**FLAN**](https://arxiv.org/abs/2109.01652) β€” *Finetuned LMs Are Zero-Shot Learners* | 2021 | Proved instruction tuning works. | | 8 | [**Self-Instruct**](https://arxiv.org/abs/2212.10560) β€” *Aligning LMs with Self-Generated Instructions* | 2022 | Synthetic data for SFT. Led to Alpaca. | | 9 | [**DeepSeekMath**](https://arxiv.org/abs/2402.03300) β€” *Pushing the Limits of Mathematical Reasoning* | 2024 | Introduced GRPO. | | 10 | [**QLoRA**](https://arxiv.org/abs/2305.14314) β€” *Efficient Finetuning of Quantized LMs* | 2023 | 7B fine-tuning on consumer GPUs. | ## Tier 3: Advanced (Cutting Edge) | # | Paper | Year | |---|-------|------| | 11 | [**KTO**](https://arxiv.org/abs/2402.01306) β€” *Prospect Theoretic Optimization* | 2024 | | 12 | [**ORPO**](https://arxiv.org/abs/2403.07691) β€” *Monolithic Preference Optimization* | 2024 | | 13 | [**SimPO**](https://arxiv.org/abs/2405.14734) β€” *Reference-Free Reward* | 2024 | | 14 | **Tulu 3** β€” *Open Language Model Post-Training* | 2024 | | 15 | [**Zephyr**](https://arxiv.org/abs/2310.16944) β€” *Direct Distillation of LM Alignment* | 2023 | ## Tier 4: RL Foundations | # | Paper | Year | |---|-------|------| | 16 | [**PPO**](https://arxiv.org/abs/1707.06347) β€” *Proximal Policy Optimization* | 2017 | | 17 | [**Learning to Summarize from Human Feedback**](https://arxiv.org/abs/2009.01325) | 2020 | | 18 | [**Fine-Tuning LMs from Human Preferences**](https://arxiv.org/abs/1909.08593) | 2019 | """ GLOSSARY = r""" # Glossary & Quick Reference ## Key Terms | Term | Definition | |------|-----------| | **Alignment** | Making a model behave according to human intentions and values | | **RLHF** | Reinforcement Learning from Human Feedback | | **RLAIF** | RL from AI Feedback (AI replaces human annotators) | | **SFT** | Supervised Fine-Tuning on instruction-response pairs | | **DPO** | Direct Preference Optimization β€” no reward model needed | | **GRPO** | Group Relative Policy Optimization β€” for reasoning | | **PPO** | Proximal Policy Optimization β€” the RL algorithm in RLHF | | **Reward Model** | Scores responses based on human preferences | | **Policy** | The language model being trained (RL terminology) | | **Reference Model (Ο€_ref)** | SFT model used as baseline | | **KL Divergence** | Measures how far the policy drifts from the reference | | **Bradley-Terry** | Preference model: P(A>B) = Οƒ(score(A) - score(B)) | | **Reward Hacking** | Model exploits the reward model instead of improving | | **LoRA** | Low-Rank Adaptation β€” parameter-efficient fine-tuning | | **QLoRA** | 4-bit quantized base model + LoRA adapters | | **Chat Template** | Format for structuring conversations (special tokens + roles) | | **On-policy** | Training on data from the *current* model | | **Off-policy** | Training on data from a *different* model | | **Advantage** | How much better an action is vs. the expected value | ## Dataset Formats by Trainer ```python # SFT {"messages": [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]} # DPO / ORPO / CPO {"prompt": "...", "chosen": "...", "rejected": "..."} # GRPO / RLOO / Online DPO {"prompt": "..."} # KTO {"prompt": "...", "completion": "...", "label": True} # Reward Model {"prompt": "...", "chosen": "...", "rejected": "..."} # PRM (Process Reward Model) {"prompt": "...", "completions": ["step1", "step2"], "labels": [True, False]} ``` """ SEARCH_CONTENT = { "Overview & Roadmap": OVERVIEW, "Ch 1: The Big Picture": CH1_BIG_PICTURE, "Ch 2: SFT": CH2_SFT, "Ch 3: RLHF": CH3_RLHF, "Ch 4: DPO": CH4_DPO, "Ch 5: Preference Zoo": CH5_PREFERENCE_ZOO, "Ch 6: GRPO & Reasoning": CH6_GRPO, "Ch 7: PEFT (LoRA)": CH7_PEFT, "Ch 8: Toolbox": CH8_TOOLBOX, "Ch 9: Datasets": CH9_DATASETS, "Ch 10: Evaluation": CH10_EVALUATION, "Ch 11: Full Recipe": CH11_RECIPE, "Ch 12: Reading List": CH12_READING_LIST, "Glossary & Reference": GLOSSARY, } def search_guide(query): if not query or not query.strip(): return "Type a keyword to search across all chapters (e.g., *DPO*, *reward*, *LoRA*, *dataset*)." query_lower = query.lower().strip() results = [] for title, content in SEARCH_CONTENT.items(): lines = content.split("\n") for i, line in enumerate(lines): if query_lower in line.lower(): start = max(0, i - 1) end = min(len(lines), i + 4) snippet = "\n".join(lines[start:end]) results.append(f"### πŸ“ {title}\n\n{snippet}\n\n---\n") break # one match per chapter if results: return f"## Found in {len(results)} chapter(s)\n\n" + "\n".join(results) return f"No results for **'{query}'**. Try: *SFT*, *DPO*, *GRPO*, *LoRA*, *reward*, *PPO*, *dataset*, *evaluation*." # ============================================================ # GRADIO UI # ============================================================ CUSTOM_CSS = """ .chapter-content { max-width: 900px; margin: 0 auto; font-size: 16px; line-height: 1.7; } .gradio-container { max-width: 1100px !important; } footer { display: none !important; } """ THEME = gr.themes.Soft( primary_hue="blue", secondary_hue="slate", font=[gr.themes.GoogleFont("Inter"), "system-ui", "sans-serif"], ) with gr.Blocks(title="Post-Training Guide for LLMs") as demo: gr.HTML("""

πŸ“– The Complete Guide to Post-Training of LLMs

From Pretraining to Alignment β€” SFT Β· RLHF Β· DPO Β· GRPO Β· LoRA and more

""") with gr.Tabs(): with gr.Tab("🏠 Overview"): gr.Markdown(OVERVIEW, elem_classes="chapter-content") with gr.Tab("1️⃣ Big Picture"): gr.Markdown(CH1_BIG_PICTURE, elem_classes="chapter-content") with gr.Tab("2️⃣ SFT"): gr.Markdown(CH2_SFT, elem_classes="chapter-content") with gr.Tab("3️⃣ RLHF"): gr.Markdown(CH3_RLHF, elem_classes="chapter-content") with gr.Tab("4️⃣ DPO"): gr.Markdown(CH4_DPO, elem_classes="chapter-content") with gr.Tab("5️⃣ Preference Zoo"): gr.Markdown(CH5_PREFERENCE_ZOO, elem_classes="chapter-content") with gr.Tab("6️⃣ GRPO"): gr.Markdown(CH6_GRPO, elem_classes="chapter-content") with gr.Tab("7️⃣ PEFT"): gr.Markdown(CH7_PEFT, elem_classes="chapter-content") with gr.Tab("8️⃣ Toolbox"): gr.Markdown(CH8_TOOLBOX, elem_classes="chapter-content") with gr.Tab("9️⃣ Datasets"): gr.Markdown(CH9_DATASETS, elem_classes="chapter-content") with gr.Tab("πŸ”Ÿ Evaluation"): gr.Markdown(CH10_EVALUATION, elem_classes="chapter-content") with gr.Tab("πŸ“‹ Full Recipe"): gr.Markdown(CH11_RECIPE, elem_classes="chapter-content") with gr.Tab("πŸ“š Reading List"): gr.Markdown(CH12_READING_LIST, elem_classes="chapter-content") with gr.Tab("πŸ“– Glossary"): gr.Markdown(GLOSSARY, elem_classes="chapter-content") with gr.Tab("πŸ” Search"): gr.Markdown("## Search the Guide\nFind any topic across all chapters.") search_input = gr.Textbox( placeholder="Type a keyword (e.g. DPO, reward, LoRA, dataset, PPO)...", label="Search", lines=1, ) search_output = gr.Markdown(elem_classes="chapter-content") search_input.submit(search_guide, inputs=search_input, outputs=search_output) search_input.change(search_guide, inputs=search_input, outputs=search_output) gr.HTML("""
Built from primary research papers & official HF documentation Β· All code uses TRL v1.2+ APIs Β· TRL Docs Β· TRL GitHub Β· Last updated April 2026
""") demo.launch(theme=THEME, css=CUSTOM_CSS)