tobil committed on
Commit
c7967b0
·
verified ·
1 Parent(s): 58867a4

Upload train_grpo.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train_grpo.py +188 -133
train_grpo.py CHANGED
@@ -8,16 +8,16 @@
8
  # "trackio",
9
  # "datasets",
10
  # "bitsandbytes",
11
- # "sentence-transformers",
12
  # ]
13
  # ///
14
  """
15
  GRPO (Group Relative Policy Optimization) training for QMD query expansion.
16
 
17
- Reward Type 2: Format + Diversity
18
- - Rewards correct lex/vec/hyde format
19
- - Penalizes repetition between lines
20
- - Rewards semantic diversity of expansions
 
21
 
22
  Usage:
23
  uv run train_grpo.py --sft-model tobil/qmd-query-expansion-0.6B
@@ -26,159 +26,204 @@ Usage:
26
  import re
27
  import torch
28
  import trackio
 
29
  from datasets import load_dataset
30
- from peft import LoraConfig, PeftModel
31
  from transformers import AutoModelForCausalLM, AutoTokenizer
32
  from trl import GRPOTrainer, GRPOConfig
33
- from sentence_transformers import SentenceTransformer
 
34
 
35
  # ============================================================================
36
- # Reward Function: Format + Diversity
37
  # ============================================================================
38
 
39
def parse_expansion(text: str) -> dict:
    """Parse expansion output into lex/vec/hyde components."""
    buckets = {"lex": [], "vec": [], "hyde": []}
    # (prefix, prefix length) pairs; lines matching none are silently dropped
    markers = (("lex:", 4), ("vec:", 4), ("hyde:", 5))

    for raw in text.strip().split("\n"):
        entry = raw.strip()
        for prefix, width in markers:
            if entry.startswith(prefix):
                buckets[prefix[:-1]].append(entry[width:].strip())
                break

    return buckets
53
 
54
 
55
def compute_format_reward(text: str) -> float:
    """
    Reward for correct format:
    - Has at least 1 lex line: +0.2
    - Has at least 1 vec line: +0.2
    - Has hyde line: +0.1
    - Correct line format (type: content): +0.1 per line (max 0.3)
    - No garbage/malformed lines: +0.2
    """
    stripped = [ln.strip() for ln in text.strip().split("\n")]

    reward = 0.0
    # Required components (presence of each prefixed line type).
    if any(ln.startswith("lex:") for ln in stripped):
        reward += 0.2
    if any(ln.startswith("vec:") for ln in stripped):
        reward += 0.2
    if any(ln.startswith("hyde:") for ln in stripped):
        reward += 0.1

    # Well-formed "type: content" lines, capped at 0.3 total.
    well_formed = sum(1 for ln in stripped if re.match(r'^(lex|vec|hyde):\s*.+', ln))
    reward += min(0.3, well_formed * 0.1)

    # Bonus for a fully clean output, penalty per malformed line otherwise.
    malformed = len(stripped) - well_formed
    if malformed == 0:
        reward += 0.2
    else:
        reward -= malformed * 0.1

    # Clamp to [0, 1].
    return max(0.0, min(1.0, reward))
92
 
 
 
 
 
 
 
 
 
 
93
 
94
def compute_diversity_reward(text: str, embedder) -> float:
    """
    Reward for diverse expansions:
    - Penalize exact duplicates
    - Reward semantic distance between expansions
    """
    parsed = parse_expansion(text)
    candidates = parsed["lex"] + parsed["vec"] + parsed["hyde"]

    # Nothing to compare against.
    if len(candidates) < 2:
        return 0.0

    # Each exact (case-insensitive) duplicate costs 0.2.
    unique = set(c.lower() for c in candidates)
    duplicate_penalty = 0.2 * (len(candidates) - len(unique))

    diversity_reward = 0.0
    if len(unique) >= 2:
        try:
            from torch.nn.functional import cosine_similarity
            vectors = torch.tensor(embedder.encode(list(unique)))

            # All pairwise cosine similarities between distinct expansions.
            sims = [
                cosine_similarity(
                    vectors[i].unsqueeze(0),
                    vectors[j].unsqueeze(0),
                ).item()
                for i in range(len(vectors))
                for j in range(i + 1, len(vectors))
            ]

            mean_sim = sum(sims) / len(sims) if sims else 1.0
            # Lower similarity = higher diversity (0 = identical, 1 = orthogonal).
            diversity_reward = 1.0 - mean_sim
        except Exception:
            # Best-effort: embedding failures yield no diversity credit.
            diversity_reward = 0.0

    return max(0.0, diversity_reward - duplicate_penalty)
 
 
136
 
 
 
137
 
138
def compute_length_reward(text: str) -> float:
    """Reward appropriate length (not too short, not too long)."""
    # Count non-blank lines only; ideal output is 3-6 lines.
    line_count = sum(1 for ln in text.strip().split("\n") if ln.strip())
    if 3 <= line_count <= 6:
        return 0.2
    if 2 <= line_count <= 7:
        return 0.1
    return 0.0
149
 
150
 
151
class QMDRewardFunction:
    """Combined reward function for QMD query expansion."""
    # TRL reads __name__ for logging the reward function's identity.
    __name__ = "qmd_format_diversity_reward"

    def __init__(self):
        # Load a small embedding model once; reused for every diversity call.
        print("Loading embedding model for diversity reward...")
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
        print("Embedding model loaded.")

    def __call__(self, completions: list[str], prompts: list[str] = None, **kwargs) -> list[float]:
        """Compute rewards for a batch of completions."""
        scores = []
        for text in completions:
            # Weighted blend: format dominates, then diversity, then length.
            combined = (
                0.5 * compute_format_reward(text)
                + 0.35 * compute_diversity_reward(text, self.embedder)
                + 0.15 * compute_length_reward(text)
            )
            scores.append(combined)
        return scores
184
 
@@ -194,9 +239,11 @@ def main():
194
  help="SFT model to use as starting point")
195
  parser.add_argument("--base-model", default="Qwen/Qwen3-0.6B",
196
  help="Base model (for loading tokenizer)")
197
- parser.add_argument("--output", default="tobil/qmd-query-expansion-0.6B-grpo",
198
  help="Output model name on Hub")
199
  parser.add_argument("--epochs", type=int, default=1)
 
 
200
  parser.add_argument("--dry-run", action="store_true")
201
  args = parser.parse_args()
202
 
@@ -206,6 +253,7 @@ def main():
206
  print(f" Base Model: {args.base_model}")
207
  print(f" Output: {args.output}")
208
  print(f" Epochs: {args.epochs}")
 
209
  return
210
 
211
  # Load dataset (just prompts needed for GRPO)
@@ -217,7 +265,7 @@ def main():
217
  return {"prompt": example["messages"][0]["content"]}
218
 
219
  dataset = dataset.map(extract_prompt, remove_columns=dataset.column_names)
220
- dataset = dataset.shuffle(seed=42).select(range(min(2000, len(dataset)))) # Use subset for GRPO
221
  print(f"Using {len(dataset)} prompts for GRPO")
222
 
223
  # Load tokenizer
@@ -234,18 +282,17 @@ def main():
234
  device_map="auto",
235
  )
236
  model = PeftModel.from_pretrained(base_model, args.sft_model)
237
- model = model.merge_and_unload() # Merge LoRA weights
238
  print("Model loaded and LoRA merged.")
239
 
240
- # Add new LoRA adapter for GRPO training
241
- from peft import get_peft_model
242
  grpo_lora_config = LoraConfig(
243
- r=8,
244
- lora_alpha=16,
245
  lora_dropout=0.05,
246
  bias="none",
247
  task_type="CAUSAL_LM",
248
- target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
249
  )
250
  model = get_peft_model(model, grpo_lora_config)
251
  model.print_trainable_parameters()
@@ -254,21 +301,29 @@ def main():
254
  # Initialize reward function
255
  reward_fn = QMDRewardFunction()
256
 
257
- # GRPO config
 
 
 
 
 
 
 
258
  config = GRPOConfig(
259
- output_dir="qmd-expansion-grpo",
260
  push_to_hub=True,
261
  hub_model_id=args.output,
262
 
263
- # GRPO specific
264
- num_generations=4, # Generate 4 completions per prompt
265
- max_completion_length=256,
266
 
267
- # Training
268
  num_train_epochs=args.epochs,
269
  per_device_train_batch_size=2,
270
- gradient_accumulation_steps=4,
271
- learning_rate=5e-6, # Lower LR for RL
 
272
 
273
  # Logging
274
  logging_steps=10,
@@ -276,8 +331,8 @@ def main():
276
 
277
  # Monitoring
278
  report_to="trackio",
279
- project="qmd-query-expansion-grpo",
280
- run_name="grpo-format-diversity",
281
  )
282
 
283
  # Create trainer
 
8
  # "trackio",
9
  # "datasets",
10
  # "bitsandbytes",
 
11
  # ]
12
  # ///
13
  """
14
  GRPO (Group Relative Policy Optimization) training for QMD query expansion.
15
 
16
+ Uses the comprehensive scoring system from SCORING.md:
17
+ - Format (30%): Must have lex: and vec: prefixes
18
+ - Diversity (30%): No echoing query, diverse expansions
19
+ - Hyde (20%): Concise, no newlines, no repetition
20
+ - Quality (20%): lex=keywords, vec=natural language
21
 
22
  Usage:
23
  uv run train_grpo.py --sft-model tobil/qmd-query-expansion-0.6B
 
26
  import re
27
  import torch
28
  import trackio
29
+ from collections import Counter
30
  from datasets import load_dataset
31
+ from peft import LoraConfig, PeftModel, get_peft_model
32
  from transformers import AutoModelForCausalLM, AutoTokenizer
33
  from trl import GRPOTrainer, GRPOConfig
34
+
35
+ STOPWORDS = {'the', 'a', 'an', 'is', 'are', 'to', 'for', 'of', 'in', 'and', 'or', 'it', 'this', 'that', 'be', 'with', 'as', 'on', 'by'}
36
 
37
  # ============================================================================
38
+ # Scoring Functions (from SCORING.md)
39
  # ============================================================================
40
 
41
  def parse_expansion(text: str) -> dict:
42
+ """Parse expansion into structured format."""
43
+ lines = text.strip().split("\n")
44
+ result = {"lex": [], "vec": [], "hyde": [], "invalid": []}
45
 
46
+ for line in lines:
47
  line = line.strip()
48
+ if not line:
49
+ continue
50
  if line.startswith("lex:"):
51
  result["lex"].append(line[4:].strip())
52
  elif line.startswith("vec:"):
53
  result["vec"].append(line[4:].strip())
54
  elif line.startswith("hyde:"):
55
  result["hyde"].append(line[5:].strip())
56
+ else:
57
+ result["invalid"].append(line)
58
 
59
  return result
60
 
61
 
62
def edit_distance_simple(a: str, b: str) -> int:
    """Simple word-level edit distance: size of the symmetric
    difference of the two (lowercased) word sets."""
    return len(set(a.lower().split()) ^ set(b.lower().split()))
 
 
 
 
 
 
67
 
 
 
 
 
 
 
 
68
 
69
def is_diverse(a: str, b: str, min_distance: int = 2) -> bool:
    """Check if two strings are sufficiently different."""
    left, right = a.lower().strip(), b.lower().strip()
    # Equal strings or substring containment are never diverse.
    if left == right or left in right or right in left:
        return False
    # Word-level symmetric difference as a cheap edit distance.
    return len(set(left.split()) ^ set(right.split())) >= min_distance
77
 
 
78
 
79
def echoes_query(expansion: str, query: str) -> bool:
    """Check if expansion is just echoing the query.

    An expansion "echoes" when it equals the query, or contains it
    verbatim while adding fewer than 10 extra characters.
    """
    exp = expansion.lower().strip()
    q = query.lower().strip()
    # Bug fix: with an empty query (the caller's default when no prompts
    # are supplied), `q in exp` is vacuously True, so every expansion
    # shorter than 10 chars was wrongly flagged as an echo.
    if not q:
        return False
    if exp == q:
        return True
    return q in exp and len(exp) < len(q) + 10
88
 
 
89
 
90
def word_repetition_penalty(text: str) -> int:
    """Count penalty for repeated words (excluding stopwords).

    Words of 4+ occurrences... no: any non-stopword word longer than
    2 chars appearing 3+ times costs 2 points per occurrence past two.
    """
    occurrences = Counter(re.findall(r'\b\w+\b', text.lower()))
    return sum(
        (count - 2) * 2
        for word, count in occurrences.items()
        if count >= 3 and word not in STOPWORDS and len(word) > 2
    )
99
 
100
+
101
def score_expansion(query: str, expansion: str) -> float:
    """
    Score an expansion based on SCORING.md criteria.
    Returns normalized score 0.0-1.0 for RL reward.
    """
    parsed = parse_expansion(expansion)
    lex, vec, hyde = parsed["lex"], parsed["vec"], parsed["hyde"]
    invalid = parsed["invalid"]

    # === FORMAT (0-30) ===
    fmt = (10 if lex else 0) + (10 if vec else 0)
    if not invalid:
        fmt += 10
    else:
        # Each malformed line costs 5 of the 10 cleanliness points.
        fmt += max(0, 10 - len(invalid) * 5)

    # === DIVERSITY (0-30) ===
    div = 0
    if lex and vec:              # both expansion types present
        div += 10
    if len(lex) + len(vec) >= 2:  # at least two expansions overall
        div += 5

    # Pairwise diversity within lex lines (-2 per similar pair).
    lex_pts = 5
    for i, first in enumerate(lex):
        for second in lex[i + 1:]:
            if not is_diverse(first, second, 2):
                lex_pts -= 2
    div += max(0, lex_pts)

    # Pairwise diversity within vec lines (stricter distance of 3).
    vec_pts = 5
    for i, first in enumerate(vec):
        for second in vec[i + 1:]:
            if not is_diverse(first, second, 3):
                vec_pts -= 2
    div += max(0, vec_pts)

    # Don't echo the query back (-3 per echoing expansion).
    echo_pts = 5
    for candidate in lex + vec:
        if echoes_query(candidate, query):
            echo_pts -= 3
    div += max(0, echo_pts)

    # === HYDE (0-20) ===
    hyde_pts = 0
    if hyde:
        doc = hyde[0]
        hyde_pts += 5  # present at all
        # Ideal length window is 50-200 chars.
        if 50 <= len(doc) <= 200:
            hyde_pts += 5
        elif len(doc) < 50:
            hyde_pts += 2
        if "\n" not in doc:
            hyde_pts += 5
        # Up to 5 points, reduced by word repetition.
        hyde_pts += max(0, 5 - word_repetition_penalty(doc))

    # === QUALITY (0-20) ===
    quality = 10  # base
    if lex and vec:
        # lex should be terse keywords, vec longer natural language.
        mean_lex = sum(len(item) for item in lex) / len(lex)
        mean_vec = sum(len(item) for item in vec) / len(vec)
        if mean_lex <= mean_vec:
            quality += 5
    if vec:
        natural = sum(1 for item in vec if " " in item and len(item) > 15)
        quality += 5 if natural == len(vec) else 2

    # === TOTAL ===
    total = fmt + div + hyde_pts + quality
    # Normalize to 0-1; hyde section only counts when present.
    return total / (100 if hyde else 80)
200
 
 
 
 
201
 
202
def extract_query_from_prompt(prompt: str) -> str:
    """Extract the query from the prompt template."""
    # Prompt format: "Expand this search query:\n\n{query}"
    marker = "Expand this search query:"
    if marker not in prompt:
        return prompt.strip()
    return prompt.split(marker)[-1].strip()
 
208
 
209
 
210
class QMDRewardFunction:
    """Reward function using comprehensive SCORING.md criteria."""
    # TRL reads __name__ for logging the reward function's identity.
    __name__ = "qmd_scoring_reward"

    def __call__(self, completions: list[str], prompts: list[str] = None, **kwargs) -> list[float]:
        """Compute rewards for a batch of completions."""
        prompt_list = prompts or []
        scores = []
        for idx, completion in enumerate(completions):
            # Recover the query from the matching prompt when available;
            # otherwise score with an empty query.
            query = (
                extract_query_from_prompt(prompt_list[idx])
                if idx < len(prompt_list)
                else ""
            )
            scores.append(score_expansion(query, completion))
        return scores
229
 
 
239
  help="SFT model to use as starting point")
240
  parser.add_argument("--base-model", default="Qwen/Qwen3-0.6B",
241
  help="Base model (for loading tokenizer)")
242
+ parser.add_argument("--output", default="tobil/qmd-query-expansion-0.6B-grpo-v2",
243
  help="Output model name on Hub")
244
  parser.add_argument("--epochs", type=int, default=1)
245
+ parser.add_argument("--lr", type=float, default=1e-6,
246
+ help="Learning rate (lower for stability)")
247
  parser.add_argument("--dry-run", action="store_true")
248
  args = parser.parse_args()
249
 
 
253
  print(f" Base Model: {args.base_model}")
254
  print(f" Output: {args.output}")
255
  print(f" Epochs: {args.epochs}")
256
+ print(f" LR: {args.lr}")
257
  return
258
 
259
  # Load dataset (just prompts needed for GRPO)
 
265
  return {"prompt": example["messages"][0]["content"]}
266
 
267
  dataset = dataset.map(extract_prompt, remove_columns=dataset.column_names)
268
+ dataset = dataset.shuffle(seed=42).select(range(min(2000, len(dataset))))
269
  print(f"Using {len(dataset)} prompts for GRPO")
270
 
271
  # Load tokenizer
 
282
  device_map="auto",
283
  )
284
  model = PeftModel.from_pretrained(base_model, args.sft_model)
285
+ model = model.merge_and_unload()
286
  print("Model loaded and LoRA merged.")
287
 
288
+ # Add new LoRA adapter for GRPO training (smaller rank for stability)
 
289
  grpo_lora_config = LoraConfig(
290
+ r=4, # Smaller rank for more stable RL
291
+ lora_alpha=8,
292
  lora_dropout=0.05,
293
  bias="none",
294
  task_type="CAUSAL_LM",
295
+ target_modules=["q_proj", "v_proj"], # Fewer modules for stability
296
  )
297
  model = get_peft_model(model, grpo_lora_config)
298
  model.print_trainable_parameters()
 
301
  # Initialize reward function
302
  reward_fn = QMDRewardFunction()
303
 
304
+ # Test reward function
305
+ print("\nTesting reward function...")
306
+ test_good = "lex: auth setup\nlex: authentication config\nvec: how to configure authentication\nhyde: Configure auth by setting AUTH_SECRET."
307
+ test_bad = "auth is important for security"
308
+ print(f" Good output score: {score_expansion('auth', test_good):.2f}")
309
+ print(f" Bad output score: {score_expansion('auth', test_bad):.2f}")
310
+
311
+ # GRPO config with conservative settings
312
  config = GRPOConfig(
313
+ output_dir="qmd-expansion-grpo-v2",
314
  push_to_hub=True,
315
  hub_model_id=args.output,
316
 
317
+ # GRPO specific - conservative
318
+ num_generations=4,
319
+ max_completion_length=200, # Shorter to avoid rambling
320
 
321
+ # Training - very conservative
322
  num_train_epochs=args.epochs,
323
  per_device_train_batch_size=2,
324
+ gradient_accumulation_steps=8,
325
+ learning_rate=args.lr,
326
+ max_grad_norm=0.5, # Clip gradients more aggressively
327
 
328
  # Logging
329
  logging_steps=10,
 
331
 
332
  # Monitoring
333
  report_to="trackio",
334
+ project="qmd-query-expansion-grpo-v2",
335
+ run_name="grpo-scoring-v2",
336
  )
337
 
338
  # Create trainer