Spaces:

parth-1
/

MetaGuard-Train

Runtime error

App Files Files Community

parth-1 committed 21 days ago

Commit

aea0b8c

verified ·

1 Parent(s): 2961503

Update grpo_train.py

Browse files

Files changed (1) hide show

grpo_train.py +99 -350

grpo_train.py CHANGED Viewed

@@ -1,403 +1,152 @@
-# grpo_train.py
-import os
-# Route all caches to /tmp/ to avoid Hugging Face Spaces Read-Only errors
-os.environ.setdefault("USER", "user")
-os.environ.setdefault("HOME", "/tmp/home")
-os.environ.setdefault("HF_HOME", "/tmp/hf_home")
-os.environ.setdefault("TRANSFORMERS_CACHE", "/tmp/hf_home/transformers")
-os.environ.setdefault("TRITON_CACHE_DIR", "/tmp/triton_cache")
-os.environ.setdefault("TORCH_EXTENSIONS_DIR", "/tmp/torch_ext")
-os.environ.setdefault("XDG_CACHE_HOME", "/tmp/cache")
-os.environ.setdefault("MPLCONFIGDIR", "/tmp/mpl")
-for _d in [
-    "/tmp/home", "/tmp/hf_home", "/tmp/hf_home/transformers",
-    "/tmp/triton_cache", "/tmp/torch_ext", "/tmp/cache",
-    "/tmp/mpl", "/tmp/outputs",
-]:
-    os.makedirs(_d, exist_ok=True)
-import time
 import json
-import random
-import requests
 import torch
 from datasets import Dataset
 from unsloth import FastLanguageModel, PatchFastRL
 from trl import GRPOTrainer, GRPOConfig
 PatchFastRL("GRPO", FastLanguageModel)
-# =========================
-# CONFIG
-# =========================
-OUTPUT_DIR = "/tmp/outputs"
-ENV_URL = os.getenv("ENV_URL", "https://parth-1-metaguard.hf.space")
-HF_TOKEN = os.getenv("HF_TOKEN", "")
-HF_REPO = os.getenv("HF_REPO", "")  # e.g. "parth-1/metaguard-llama3.1-8b-grpo"
-ALLOWED_ACTIONS = [
-    "query_regulations",
-    "analyze_image",
-    "check_advertiser_history",
-    "request_landing_page",
-    "request_id_verification",
-    "submit_audit",
-    "approve",
-    "reject",
-]
-# =========================
-# HEALTH CHECK
-# =========================
-def ensure_env_ready():
-    for _ in range(20):
-        try:
-            r = requests.post(
-                f"{ENV_URL}/reset",
-                json={"task_id": "task_1_healthcare"},
-                timeout=5
-            )
-            if r.status_code == 200:
-                print("✅ Environment ready")
-                return
-        except:
-            pass
-        time.sleep(1)
-    raise RuntimeError("❌ ENV not reachable")
-# =========================
-# SAFE CLIENT
-# =========================
-class EnvClient:
-    def __init__(self, url):
-        self.url = url
-    def reset(self, task_id):
-        return requests.post(
-            f"{self.url}/reset",
-            json={"task_id": task_id},
-            timeout=8
-        ).json()
-    def step(self, action):
-        return requests.post(
-            f"{self.url}/step",
-            json={"action": action},
-            timeout=8
-        ).json()
-def safe_step(client, action):
-    for _ in range(3):
-        try:
-            return client.step(action)
-        except:
-            time.sleep(0.5)
-    return {"reward": -0.3}
-# =========================
-# JSON PARSER
-# =========================
-def extract_json(text):
-    try:
-        if "```" in text:
-            text = text.split("```")[1]
-            if text.startswith("json"):
-                text = text[4:]
-        return json.loads(text.strip())
-    except:
-        return None
-# =========================
-# DATASET
-# =========================
-BASE_SCENARIOS = [
-    {
-        "task_id": "task_1_healthcare",
-        "text": "Healthcare ad: 'miracle supplement cures disease'. No actions taken yet.",
-        "actions_already_taken": [],
-        "setup_actions": [],
-    },
-    {
-        "task_id": "task_2_financial",
-        "text": "Financial ad: 'guaranteed 500% returns, zero risk'. No actions taken yet.",
-        "actions_already_taken": [],
-        "setup_actions": [],
-    },
-    {
-        "task_id": "task_3_multimodal",
-        "text": "Multimodal ad: image may contain hidden violation. No actions taken yet.",
-        "actions_already_taken": [],
-        "setup_actions": [],
-    },
-    {
-        "task_id": "task_1_healthcare",
-        "text": "Healthcare ad: pharma product. Policy already queried.",
-        "actions_already_taken": ["query_regulations"],
-        "setup_actions": [
-            {"action_type": "query_regulations", "reasoning": "policy lookup"},
-        ],
-    },
-    {
-        "task_id": "task_3_multimodal",
-        "text": "Multimodal ad: image not yet inspected. Policy already queried.",
-        "actions_already_taken": ["query_regulations"],
-        "setup_actions": [
-            {"action_type": "query_regulations", "reasoning": "policy lookup"},
-        ],
-    },
-    {
-        "task_id": "task_2_financial",
-        "text": "Financial ad: investment scheme. Policy and advertiser history both checked.",
-        "actions_already_taken": ["query_regulations", "check_advertiser_history"],
-        "setup_actions": [
-            {"action_type": "query_regulations", "reasoning": "policy lookup"},
-            {"action_type": "check_advertiser_history", "reasoning": "trust score"},
-        ],
-    },
-    {
-        "task_id": "task_2_financial",
-        "text": "Financial ad: investment scheme. Policy, history, and audit all complete. Make final decision.",
-        "actions_already_taken": ["query_regulations", "check_advertiser_history", "submit_audit"],
-        "setup_actions": [
-            {"action_type": "query_regulations", "reasoning": "policy lookup"},
-            {"action_type": "check_advertiser_history", "reasoning": "trust score"},
-            {"action_type": "submit_audit", "reasoning": "audit log"},
-        ],
-    },
-    {
-        "task_id": "task_4_targeting",
-        "text": "Financial ad targeting young users: 'Start Your First Investment Portfolio'. No actions taken yet.",
-        "actions_already_taken": [],
-        "setup_actions": [],
-    },
-    {
-        "task_id": "task_4_targeting",
-        "text": "Financial ad targeting young users. Policy queried, need to verify age targeting.",
-        "actions_already_taken": ["query_regulations"],
-        "setup_actions": [
-            {"action_type": "query_regulations", "reasoning": "policy lookup"},
-        ],
-    },
-    {
-        "task_id": "task_4_targeting",
-        "text": "Financial ad targeting minors. Policy, advertiser history, and ID verification done. Submit audit.",
-        "actions_already_taken": ["query_regulations", "check_advertiser_history", "request_id_verification"],
-        "setup_actions": [
-            {"action_type": "query_regulations", "reasoning": "policy lookup"},
-            {"action_type": "check_advertiser_history", "reasoning": "trust score"},
-            {"action_type": "request_id_verification", "reasoning": "age check"},
-        ],
-    },
-]
-PROMPT_TEMPLATE = """You are an enterprise Ad Policy Compliance Agent.
-You MUST choose exactly ONE action_type from this list (any other value is invalid):
-- query_regulations
-- analyze_image
-- check_advertiser_history
-- request_landing_page
-- request_id_verification
-- submit_audit
-- approve
-- reject
 REQUIRED PHASE ORDER:
-1. query_regulations  -> always first
-2. analyze_image / check_advertiser_history  -> gather signals
-3. submit_audit  -> always before final decision
-4. approve OR reject  -> only after audit
-HARD RULES:
-- NEVER repeat an action listed in `actions_already_taken`.
-- Respond with ONLY a valid JSON object. No markdown, no prose.
-Required format:
-{{"action_type": "<one_of_the_actions_above>", "reasoning": "<short reason>"}}
-Scenario: {text}
-actions_already_taken: {actions_already_taken}
-Your next action?"""
 def build_dataset():
     rows = []
-    for s in BASE_SCENARIOS:
-        prompt = PROMPT_TEMPLATE.format(
-            text=s["text"],
-            actions_already_taken=json.dumps(s["actions_already_taken"]),
         )
-        rows.append({
-            "prompt": prompt,
-            "task_id": s["task_id"],
-            "setup_actions": s["setup_actions"],
-        })
-    return Dataset.from_list(rows * 10)  # 10 scenarios x 10 = 100 examples
-# =========================
-# REWARD FUNCTION
-# =========================
-def reward_environment(prompts, completions, task_id=None, setup_actions=None, **kwargs):
-    client = EnvClient(ENV_URL)
     rewards = []
-    for completion, t_id, setup in zip(completions, task_id, setup_actions):
-        parsed = extract_json(completion)
-        if not parsed:
-            rewards.append(-1.0)
-            continue
-        action_type = parsed.get("action_type")
-        if action_type not in ALLOWED_ACTIONS:
-            rewards.append(-1.0)
             continue
-        action = {
-            "action_type": action_type,
-            "reasoning": parsed.get("reasoning", "format-compliant"),
-        }
         try:
-            client.reset(t_id)
-            for s in setup:
-                safe_step(client, s)
-            result = safe_step(client, action)
-            env_reward = float(result.get("reward", -0.2))
-            status_msg = (result.get("status_message") or "").lower()
-            rejected = (
-                "api failure" in status_msg
-                or "invalid action" in status_msg
-                or "must call" in status_msg
             )
-            if rejected:
-                shaped = -0.5
-            else:
-                shaped = 0.5 + env_reward
-            rewards.append(shaped)
         except Exception:
-            rewards.append(-0.3)
     return rewards
-# =========================
-# MODEL
-# =========================
 model, tokenizer = FastLanguageModel.from_pretrained(
     model_name="unsloth/Llama-3.1-8B-Instruct",
-    load_in_4bit=True, # Strictly True for L4 24GB
-    max_seq_length=2048,
-    dtype=torch.float16, # PERFECT ALIGNMENT: 4-bit uses fp16 math natively
 )
 model = FastLanguageModel.get_peft_model(
     model,
-    r=32,
-    target_modules=[
-        "q_proj", "k_proj", "v_proj", "o_proj",
-        "gate_proj", "up_proj", "down_proj",
-    ],
-    lora_alpha=64,
-    lora_dropout=0,
-    bias="none",
     use_gradient_checkpointing="unsloth",
-    random_state=3407,
 )
-# =========================
-# TRAINER
-# =========================
 dataset = build_dataset()
 trainer = GRPOTrainer(
     model=model,
-    reward_funcs=[reward_environment],
     args=GRPOConfig(
-        output_dir=OUTPUT_DIR,
-        learning_rate=2e-5,
-        num_train_epochs=3,
-        per_device_train_batch_size=1, # Memory safe for L4
-        gradient_accumulation_steps=8, # Maintain effective batch size of 8
-        num_generations=2, # Memory safe generation limit
-        max_prompt_length=768,
         max_completion_length=128,
         logging_steps=5,
-        warmup_ratio=0.1,
-        bf16=False, # DISABLED TO PREVENT CLASH
-        fp16=True,  # ENABLED TO MATCH MODEL DTYPE
         report_to="none",
     ),
     train_dataset=dataset,
     tokenizer=tokenizer,
 )
-# =========================
-# RUN
-# =========================
 if __name__ == "__main__":
-    ensure_env_ready()
-    LORA_DIR = os.path.join(OUTPUT_DIR, "lora_adapter")
-    MERGED_DIR = os.path.join(OUTPUT_DIR, "merged")
-    print("Starting GRPO training...")
-    try:
-        trainer.train()
-    except torch.cuda.OutOfMemoryError:
-        print("OOM detected! Clearing cache and severely restricting memory...")
-        torch.cuda.empty_cache()
-        trainer.args.per_device_train_batch_size = 1
-        trainer.args.gradient_accumulation_steps = 16
-        trainer.train()
-    model.save_pretrained(LORA_DIR)
-    tokenizer.save_pretrained(LORA_DIR)
-    print(f"LoRA adapter saved to {LORA_DIR}")
-    print("Merging adapter into base model (fp16)...")
-    merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
-        model_name=LORA_DIR,
-        load_in_4bit=False,
-        max_seq_length=2048,
-    )
-    merged_model.save_pretrained_merged(
-        MERGED_DIR,
-        merged_tokenizer,
-        save_method="merged_16bit",
-    )
-    print(f"Merged model saved to {MERGED_DIR}")
-    if HF_REPO:
-        try:
-            print(f"Pushing merged model to {HF_REPO}...")
-            merged_model.push_to_hub_merged(
-                HF_REPO,
-                merged_tokenizer,
-                save_method="merged_16bit",
-                token=HF_TOKEN,
-            )
-            print(f"Model live at https://huggingface.co/{HF_REPO}")
-        except Exception as e:
-            print(f"Hub push failed: {e}")
-            print(f"Model is still saved locally at {MERGED_DIR}")
-    else:
-        print("Set HF_REPO env var to auto-push to Hub (skipped).")
-    print("Done.")

 import json
+import os
 import torch
+import requests
 from datasets import Dataset
 from unsloth import FastLanguageModel, PatchFastRL
 from trl import GRPOTrainer, GRPOConfig
+# MUST be called before trainer instantiation
 PatchFastRL("GRPO", FastLanguageModel)
+# Base URL of the ad-policy environment server. Defaults to a local server on
+# port 8000; set the ENV_URL environment variable to point somewhere else.
+ENV_URL = os.getenv("ENV_URL", "http://localhost:8000")
+# Task identifiers iterated over when the training dataset is built.
+TASKS = ["task_1_healthcare", "task_2_financial",
+         "task_3_multimodal", "task_4_targeting"]
+# System message placed at the top of every training prompt.
+SYSTEM_PROMPT = """You are an enterprise Ad Policy Compliance Agent.
+Always respond with ONLY valid JSON, no markdown.
 REQUIRED PHASE ORDER:
+1. query_regulations  — always first
+2. analyze_image      — required for multimodal tasks
+3. submit_audit       — always before final decision
+4. approve or reject  — only after audit
+Format: {"action_type": "<action>", "reasoning": "<reason>"}"""
+# ── DATASET ───────────────────────────────────────────────────────────────────
 def build_dataset():
+    """Build the GRPO training set from each task's initial observation.
+
+    Resets the environment once per entry in ``TASKS`` and renders the
+    returned observation into a prompt string that already contains the
+    Llama-3 chat header tokens.
+
+    Returns:
+        A ``datasets.Dataset`` with ``prompt`` and ``task_id`` columns; the
+        per-task rows are repeated 25 times.
+
+    Raises:
+        requests.RequestException: if the environment is unreachable, does
+            not answer within the timeout, or returns an HTTP error status.
+    """
     rows = []
+    for task_id in TASKS:
+        # Bound the wait and fail loudly: without a timeout an unresponsive
+        # server blocks start-up forever, and an error payload would silently
+        # produce prompts filled with 'N/A'.
+        res = requests.post(
+            f"{ENV_URL}/reset", json={"task_id": task_id}, timeout=5
+        )
+        res.raise_for_status()
+        obs = res.json()
+        # NOTE(review): the prompt hard-codes <|begin_of_text|>; confirm the
+        # tokenizer does not prepend a second BOS token during training.
+        prompt = (
+            f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
+            f"{SYSTEM_PROMPT}<|eot_id|>"
+            f"<|start_header_id|>user<|end_header_id|>\n"
+            f"Task: {task_id}\n"
+            f"Ad: {obs.get('headline','N/A')} — {obs.get('body_text','N/A')}\n"
+            f"Trust Score: {obs.get('advertiser_trust_score','N/A')}\n"
+            f"Status: {obs.get('status_message','')}\n"
+            f"What is your next action?"
+            f"<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
         )
+        rows.append({"prompt": prompt, "task_id": task_id})
+    # 25x repetition of the per-task rows (4 tasks -> 100 rows), enough for 1 epoch
+    return Dataset.from_list(rows * 25)
+# ── REWARD FUNCTION (actually calls the environment) ──────────────────────────
+def reward_environment(prompts, completions, task_id, **kwargs):
+    """Score each completion by replaying its action in the environment.
+
+    For every completion the JSON action is parsed, the environment is reset
+    to the row's task, that single action is stepped, and the ``reward``
+    field of the step response becomes the score.
+
+    Args:
+        prompts: Prompts for the batch (not used here).
+        completions: Generated completions, one per prompt; expected to be
+            strings, since ``.strip()`` is called on each.
+        task_id: Values of the dataset's ``task_id`` column, aligned with
+            ``completions``.
+        **kwargs: Any further keyword arguments passed by the trainer
+            (ignored).
+
+    Returns:
+        One float per completion: ``-0.5`` when the completion cannot be
+        parsed into a JSON object, ``-0.1`` when an environment call fails or
+        the step response carries no ``reward``, otherwise the environment's
+        reward.
+    """
     rewards = []
+    # task_id comes from the dataset column; t_id is the per-row value
+    for completion, t_id in zip(completions, task_id):
+        try:
+            # Parse model output, tolerating a ```json fenced answer
+            content = completion.strip()
+            if content.startswith("```"):
+                content = content.split("```")[1]
+                if content.startswith("json"):
+                    content = content[4:]
+            action = json.loads(content.strip())
+            action_type = action.get("action_type", "query_regulations")
+        except Exception:
+            # Malformed JSON (or JSON that is not an object) = penalty
+            rewards.append(-0.5)
             continue
         try:
+            # Fresh episode for each reward calculation. The reset is bounded
+            # by a timeout and its status is checked so that a hung or failed
+            # reset yields the fallback reward instead of stalling training
+            # or stepping a stale episode.
+            reset_res = requests.post(
+                f"{ENV_URL}/reset", json={"task_id": t_id}, timeout=5
+            )
+            reset_res.raise_for_status()
+            # Send only the model's chosen action as the first step of the
+            # episode and use the reward the environment returns for it.
+            step_res = requests.post(
+                f"{ENV_URL}/step",
+                json={"action": {"action_type": action_type,
+                                 "reasoning": action.get("reasoning", "")}},
+                timeout=5
             )
+            data = step_res.json()
+            rewards.append(float(data.get("reward", -0.1)))
+        except Exception:
+            # Network error, bad status, or unusable response body
+            rewards.append(-0.1)
+    return rewards
+def reward_json_format(prompts, completions, **kwargs):
+    """Bonus reward for a well-formed JSON action object.
+
+    Args:
+        prompts: Prompts for the batch (not used here).
+        completions: Generated completions, one per prompt; expected to be
+            strings, since ``.strip()`` is called on each.
+        **kwargs: Any further keyword arguments passed by the trainer
+            (ignored).
+
+    Returns:
+        One float per completion: ``0.5`` when the completion (optionally
+        wrapped in a ``` fence) parses to a JSON object, ``-0.5`` otherwise.
+    """
+    rewards = []
+    for completion in completions:
+        try:
+            content = completion.strip()
+            if content.startswith("```"):
+                content = content.split("```")[1]
+                if content.startswith("json"):
+                    content = content[4:]
+            parsed = json.loads(content.strip())
         except Exception:
+            rewards.append(-0.5)
+            continue
+        # Only a JSON object matches the required action format; bare
+        # scalars or arrays such as `42` or `[]` must not earn the bonus.
+        rewards.append(0.5 if isinstance(parsed, dict) else -0.5)
     return rewards
+# ── MODEL SETUP ───────────────────────────────────────────────────────────────
+# Load the Llama-3.1-8B-Instruct checkpoint and its tokenizer through Unsloth,
+# with 4-bit loading enabled and a maximum sequence length of 1024.
 model, tokenizer = FastLanguageModel.from_pretrained(
     model_name="unsloth/Llama-3.1-8B-Instruct",
+    max_seq_length=1024,
+    load_in_4bit=True,
 )
+# Wrap the loaded model for PEFT training: rank 16, alpha 16, no dropout, and
+# only the `q_proj` and `v_proj` modules are targeted.
 model = FastLanguageModel.get_peft_model(
     model,
+    r=16,
+    target_modules=["q_proj", "v_proj"],
+    lora_alpha=16,
+    lora_dropout=0.0,
     use_gradient_checkpointing="unsloth",
 )
+# ── TRAINER ───────────────────────────────────────────────────────────────────
+# NOTE(review): build_dataset() runs at import time and POSTs to ENV_URL, so
+# the environment server must already be up when this module is loaded.
 dataset = build_dataset()
+# GRPO trainer scored by two reward functions: the environment reward and the
+# JSON-format bonus.
+# NOTE(review): some TRL releases require the generation batch to be divisible
+# by num_generations, and newer ones take `processing_class` instead of
+# `tokenizer` — confirm both against the installed TRL/Unsloth versions.
 trainer = GRPOTrainer(
     model=model,
+    reward_funcs=[reward_environment, reward_json_format],
     args=GRPOConfig(
+        output_dir="outputs/meta-ad-agent",
+        learning_rate=5e-6,
+        num_train_epochs=1,
+        per_device_train_batch_size=2,
+        gradient_accumulation_steps=4,
+        max_prompt_length=512,
         max_completion_length=128,
+        num_generations=4,          # lower = faster, enough for demo
         logging_steps=5,
+        save_steps=50,
         report_to="none",
     ),
     train_dataset=dataset,
     tokenizer=tokenizer,
 )
+# Script entry point: run training, then save the model and tokenizer.
 if __name__ == "__main__":
+    print("Starting GRPO training — environment must be running on :8000")
+    trainer.train()
+    # Model and tokenizer are written to the same directory.
+    model.save_pretrained("outputs/meta-ad-agent-final")
+    tokenizer.save_pretrained("outputs/meta-ad-agent-final")
+    print("Done. Model saved to outputs/meta-ad-agent-final")