ishwarraja commited on
Commit
63a0be4
·
verified ·
1 Parent(s): 4e8c3fc

Upload 2 files

Browse files
Files changed (2) hide show
  1. requirements.txt +30 -0
  2. train.py +255 -0
requirements.txt ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # # 1) Create project
3
+ # mkdir phi2-grpo-qlora && cd phi2-grpo-qlora
4
+
5
+ # # 2) Create & activate a virtual environment (choose one)
6
+ # python -m venv .venv && source .venv/bin/activate
7
+ # # or: conda create -n phi2-grpo python=3.10 -y && conda activate phi2-grpo
8
+
9
+ # # 3) Install core deps (pin to mature versions that work well together)
10
+ # pip install -U "transformers>=4.40" accelerate datasets peft bitsandbytes trl gradio
11
+ # # If your GPU supports bfloat16 well, also:
12
+ # pip install torch --index-url https://download.pytorch.org/whl/cu121
13
+
14
+ # # 4) Optional (to log in to HF Hub for pushing adapters later)
15
+ # pip install -U huggingface_hub
16
+
17
+
18
+
19
+ # Core
20
+ transformers>=4.40
21
+ accelerate
22
+ datasets
23
+ peft
24
+ trl
25
+ bitsandbytes
26
+ gradio
27
+
28
+ # If logs show CUDA wheel mismatch, uncomment and adjust per Spaces GPU doc:
29
+ # --extra-index-url https://download.pytorch.org/whl/cu121
30
+ # torch==2.3.1
train.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # train.py
3
+ import os
4
+ import math
5
+ import warnings
6
+ from dataclasses import dataclass
7
+ from typing import List, Dict, Any
8
+
9
+ import torch
10
+ from datasets import load_dataset
11
+ from transformers import (
12
+ AutoTokenizer,
13
+ AutoModelForCausalLM,
14
+ BitsAndBytesConfig,
15
+ )
16
+ from peft import LoraConfig, get_peft_model
17
+ from trl import GRPOTrainer, GRPOConfig
18
+
19
+ # ---------------------------
20
+ # 0) Basic config (edit here)
21
+ # ---------------------------
22
+ MODEL_ID = "microsoft/phi-2" # base model
23
+ OUTPUT_DIR = "./runs/phi2-grpo-qlora" # logs + checkpoints
24
+ ADAPTER_DIR = "./adapters" # where LoRA adapters are saved
25
+ HF_DATASET = "OpenAssistant/oasst1" # dataset id
26
+
27
+ # Token lengths: keep well within Phi-2 context (2048)
28
+ # Prompt 1536 + completion 256 leaves headroom for BOS/EOS & formatting
29
+ MAX_PROMPT_LEN = 1536
30
+ MAX_COMPLETION_LEN = 256
31
+
32
+ # GRPO shape parameters (we'll sanity-check these below)
33
+ NUM_GENERATIONS = 4 # completions per prompt
34
+ PER_DEVICE_TRAIN_BS = 4 # "effective per-device" batch used by GRPO
35
+ GENERATION_BATCH_SIZE = 4 # how many sequences we generate at once
36
+
37
+ LEARNING_RATE = 5e-6
38
+ NUM_EPOCHS = 1
39
+ LOG_STEPS = 5
40
+ SAVE_STEPS = 200
41
+
42
+ # ---------------------------
43
+ # 1) Utilities
44
+ # ---------------------------
45
+ def has_gpu() -> bool:
46
+ return torch.cuda.is_available()
47
+
48
+ def suggest_divisible_values(num_processes: int, per_device_bs: int, limit: int = 16) -> List[int]:
49
+ """Suggest valid num_generations values dividing the global train batch size."""
50
+ global_bs = num_processes * per_device_bs
51
+ return [g for g in range(1, limit + 1) if global_bs % g == 0]
52
+
53
+ def ensure_divisibility_or_die(num_processes: int):
54
+ """
55
+ Validate GRPO constraints to avoid:
56
+ - ValueError: global train batch size must be divisible by num_generations
57
+ - ValueError: generation_batch_size must be divisible by num_generations
58
+ """
59
+ global_bs = num_processes * PER_DEVICE_TRAIN_BS
60
+ ok1 = (global_bs % NUM_GENERATIONS == 0)
61
+ ok2 = (GENERATION_BATCH_SIZE % NUM_GENERATIONS == 0)
62
+ if ok1 and ok2:
63
+ return
64
+ msg = []
65
+ if not ok1:
66
+ vals = suggest_divisible_values(num_processes, PER_DEVICE_TRAIN_BS, limit=64)
67
+ msg.append(
68
+ f"- With num_processes={num_processes} and per_device_train_batch_size={PER_DEVICE_TRAIN_BS}, "
69
+ f"the global train batch size is {global_bs}; choose NUM_GENERATIONS ∈ {vals}."
70
+ )
71
+ if not ok2:
72
+ # suggest the next multiple of NUM_GENERATIONS
73
+ next_mult = (GENERATION_BATCH_SIZE // NUM_GENERATIONS + 1) * NUM_GENERATIONS
74
+ msg.append(
75
+ f"- Set GENERATION_BATCH_SIZE to a multiple of NUM_GENERATIONS={NUM_GENERATIONS} "
76
+ f"(e.g., {next_mult})."
77
+ )
78
+ hint = "\n".join(msg)
79
+ raise ValueError(
80
+ "Invalid GRPO batching parameters.\n"
81
+ + hint
82
+ + "\n(Constraint documented in TRL’s GRPOTrainer.)"
83
+ )
84
+
85
+ # ---------------------------
86
+ # 2) Rewards
87
+ # ---------------------------
88
+ def reward_format(completions: List[List[Dict[str, str]]], **kwargs) -> List[float]:
89
+ """
90
+ Reward if the model produces a non-empty assistant message that ends with punctuation.
91
+ `completions` is list of conversations; each completion is a list of messages:
92
+ [{"role": "assistant", "content": "..."}]
93
+ """
94
+ rewards = []
95
+ for completion in completions:
96
+ text = completion[0]["content"].strip() if completion and completion[0].get("content") else ""
97
+ ok = (len(text) > 10) and (text[-1] in ".!?")
98
+ rewards.append(1.0 if ok else 0.0)
99
+ return rewards
100
+
101
+ def reward_length(completions: List[List[Dict[str, str]]], **kwargs) -> List[float]:
102
+ """
103
+ Reward completions whose token length is in a 'goldilocks' range [64, 256].
104
+ If tokenizer is available in kwargs, use it for accurate token counts.
105
+ """
106
+ tok = kwargs.get("tokenizer", None)
107
+ lo, hi = 64, 256
108
+ scores = []
109
+ for completion in completions:
110
+ text = completion[0]["content"] if completion and completion[0].get("content") else ""
111
+ if not text:
112
+ scores.append(0.0)
113
+ continue
114
+ length = len(tok.encode(text)) if tok else len(text.split())
115
+ if lo <= length <= hi:
116
+ scores.append(1.0)
117
+ else:
118
+ # soft ramp: distance from range normalized
119
+ d = 0 if lo <= length <= hi else min(abs(length - lo), abs(length - hi))
120
+ scores.append(max(0.0, 1.0 - d / hi))
121
+ return scores
122
+
123
+ # ---------------------------
124
+ # 3) Load tokenizer & model (4-bit with CPU fallback)
125
+ # ---------------------------
126
+ def load_tokenizer_and_model():
127
+ # Tokenizer
128
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
129
+ if tokenizer.pad_token is None:
130
+ tokenizer.pad_token = tokenizer.eos_token
131
+
132
+ # 4-bit config for QLoRA
133
+ bnb_config = BitsAndBytesConfig(
134
+ load_in_4bit=True,
135
+ bnb_4bit_quant_type="nf4",
136
+ bnb_4bit_compute_dtype=torch.bfloat16 if has_gpu() else torch.float32,
137
+ bnb_4bit_use_double_quant=True,
138
+ )
139
+
140
+ device_map = "auto" if has_gpu() else {"": "cpu"}
141
+ try:
142
+ model = AutoModelForCausalLM.from_pretrained(
143
+ MODEL_ID,
144
+ quantization_config=bnb_config,
145
+ device_map=device_map,
146
+ trust_remote_code=False,
147
+ )
148
+ except Exception as e:
149
+ # CPU fallback without quantization
150
+ print(f"[WARN] 4-bit load failed ({e}); falling back to CPU fp32.")
151
+ model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map={"": "cpu"})
152
+
153
+ # Disable cache for training; enable gradient checkpointing if GPU-limited
154
+ model.config.use_cache = False
155
+ if has_gpu():
156
+ model.gradient_checkpointing_enable()
157
+
158
+ # QLoRA: target q/k/v projections for Phi-2
159
+ peft_config = LoraConfig(
160
+ r=16, lora_alpha=32, lora_dropout=0.05,
161
+ bias="none", task_type="CAUSAL_LM",
162
+ target_modules=["q_proj", "k_proj", "v_proj"], # Phi-2 attention projections
163
+ )
164
+ model = get_peft_model(model, peft_config)
165
+ model.print_trainable_parameters()
166
+
167
+ return tokenizer, model
168
+
169
+ # ---------------------------
170
+ # 4) Load & clean dataset (English-only) and build prompts
171
+ # ---------------------------
172
+ def load_dataset_oasst1(tokenizer):
173
+ ds = load_dataset(HF_DATASET, split="train")
174
+ # Keep only English rows; strip to columns we need to prevent KeyErrors
175
+ ds = ds.filter(lambda x: x.get("lang", None) == "en")
176
+ keep_cols = {"text", "role", "message_id", "parent_id", "message_tree_id", "lang"}
177
+ drop_cols = [c for c in ds.column_names if c not in keep_cols]
178
+ ds = ds.remove_columns(drop_cols)
179
+
180
+ # Build single-turn "chat" prompts for GRPO: list of messages with role/content.
181
+ # We’ll keep only "prompter" -> user prompts.
182
+ prompts = []
183
+ for rec in ds:
184
+ if rec.get("role") == "prompter":
185
+ content = rec.get("text", "").strip()
186
+ if not content:
187
+ continue
188
+ # minimal chat turn
189
+ messages = [{"role": "user", "content": content}]
190
+ # Convert to a single string prompt using a generic chat template (tokenizer may have one)
191
+ if hasattr(tokenizer, "apply_chat_template"):
192
+ prompt_str = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
193
+ else:
194
+ # fallback plain text
195
+ prompt_str = "User: " + content + "\nAssistant:"
196
+
197
+ prompts.append({"prompt": prompt_str})
198
+
199
+ # Keep it as a simple dict dataset for TRL
200
+ return prompts
201
+
202
+ # ---------------------------
203
+ # 5) Main
204
+ # ---------------------------
205
+ def main():
206
+ warnings.filterwarnings("default") # show potential FutureWarnings for max_prompt_length evolution
207
+
208
+ tokenizer, model = load_tokenizer_and_model()
209
+
210
+ # Prepare dataset entries like {"prompt": "..."} as TRL suggests
211
+ train_dataset = load_dataset_oasst1(tokenizer)
212
+
213
+ # Validate GRPO divisibility constraints (avoid the common ValueError)
214
+ num_processes = int(os.environ.get("WORLD_SIZE", "1"))
215
+ ensure_divisibility_or_die(num_processes)
216
+
217
+ # GRPO training args
218
+ args = GRPOConfig(
219
+ output_dir=OUTPUT_DIR,
220
+ num_train_epochs=NUM_EPOCHS,
221
+ learning_rate=LEARNING_RATE,
222
+ logging_steps=LOG_STEPS,
223
+ save_steps=SAVE_STEPS,
224
+ save_total_limit=2,
225
+ bf16=has_gpu(), # prefer bf16 when available
226
+ per_device_train_batch_size=PER_DEVICE_TRAIN_BS,
227
+ generation_batch_size=GENERATION_BATCH_SIZE,
228
+ num_generations=NUM_GENERATIONS,
229
+ max_prompt_length=MAX_PROMPT_LEN, # keep below context limit
230
+ max_completion_length=MAX_COMPLETION_LEN,
231
+ gradient_accumulation_steps=1,
232
+ report_to="none",
233
+ disable_dropout=True, # stabilizes GRPO per TRL notes
234
+ )
235
+
236
+ # Combine our two reward functions (equal weights)
237
+ reward_funcs = [reward_format, reward_length]
238
+
239
+ trainer = GRPOTrainer(
240
+ model=model,
241
+ args=args,
242
+ reward_funcs=reward_funcs,
243
+ train_dataset=train_dataset,
244
+ tokenizer=tokenizer, # passed to reward funcs via kwargs
245
+ )
246
+
247
+ trainer.train()
248
+
249
+ # Save ONLY the adapters (PEFT)
250
+ os.makedirs(ADAPTER_DIR, exist_ok=True)
251
+ trainer.model.save_pretrained(ADAPTER_DIR)
252
+ print(f"[OK] LoRA adapters saved to: {ADAPTER_DIR}")
253
+
254
+ if __name__ == "__main__":
255
+ main()