Spaces:

parth-1
/

MetaGuard-Train

Runtime error

File size: 5,811 Bytes

193a9d2
 
aea0b8c
193a9d2
 
 
 
aea0b8c
193a9d2
 
aea0b8c
 
 
a5a9c5a
aea0b8c
 
193a9d2
a5a9c5a
aea0b8c
 
 
 
a5a9c5a
aea0b8c
193a9d2
aea0b8c
193a9d2
 
 
aea0b8c
 
 
 
 
 
 
 
 
 
 
 
 
193a9d2
aea0b8c
 
 
193a9d2
aea0b8c
193a9d2
aea0b8c
 
 
 
 
193a9d2
aea0b8c
 
 
 
 
 
 
 
 
 
 
 
 
 
193a9d2
 
 
aea0b8c
 
 
 
 
 
 
 
 
 
193a9d2
aea0b8c
 
 
 
193a9d2
aea0b8c
193a9d2
aea0b8c
 
 
 
 
 
 
 
 
 
 
 
193a9d2
aea0b8c
193a9d2
 
aea0b8c
193a9d2
 
a5a9c5a
aea0b8c
 
193a9d2
 
 
aea0b8c
 
 
 
193a9d2
 
 
aea0b8c
193a9d2
 
 
 
 
aea0b8c
193a9d2
aea0b8c
 
 
 
 
 
a5a9c5a
aea0b8c
70c7c72
aea0b8c
193a9d2
 
 
 
 
 
 
aea0b8c

import json
import torch
import requests
from datasets import Dataset
from unsloth import FastLanguageModel, PatchFastRL
from trl import GRPOTrainer, GRPOConfig

# MUST be called before trainer instantiation
PatchFastRL("GRPO", FastLanguageModel)

# Base URL of the environment server; build_dataset() and reward_environment()
# POST to its /reset and /step routes.
ENV_URL = "http://localhost:8000"
# Task ids sent as "task_id" to /reset; one training prompt is built per task.
TASKS = ["task_1_healthcare", "task_2_financial",
         "task_3_multimodal", "task_4_targeting"]

# System message placed at the top of every prompt built by build_dataset().
# It asks for JSON-only replies and spells out the expected action order; the
# last line gives the JSON shape that reward_environment() reads
# ("action_type" and "reasoning").
SYSTEM_PROMPT = """You are an enterprise Ad Policy Compliance Agent.
Always respond with ONLY valid JSON, no markdown.

REQUIRED PHASE ORDER:
1. query_regulations  — always first
2. analyze_image      — required for multimodal tasks  
3. submit_audit       — always before final decision
4. approve or reject  — only after audit

Format: {"action_type": "<action>", "reasoning": "<reason>"}"""

# ── DATASET ───────────────────────────────────────────────────────────────────

def build_dataset():
    """Build the GRPO training set from each task's initial observation.

    For every id in ``TASKS`` the environment is reset via ``POST /reset`` and
    the returned JSON is rendered into a chat-formatted prompt: the system
    prompt, a user turn describing the ad, and an open assistant header for
    the model to complete.

    Returns:
        A ``datasets.Dataset`` with the columns ``prompt`` and ``task_id``,
        holding the per-task rows repeated 25 times.

    Raises:
        requests.RequestException: if the environment cannot be reached, does
            not answer within the timeout, or replies with an HTTP error
            status.
    """
    rows = []
    for task_id in TASKS:
        # A timeout keeps a dead or wedged environment from hanging the script
        # forever; this function runs at import time.
        res = requests.post(
            f"{ENV_URL}/reset",
            json={"task_id": task_id},
            timeout=10,
        )
        # Fail loudly on 4xx/5xx instead of building prompts full of "N/A"
        # from an error payload.
        res.raise_for_status()
        obs = res.json()
        # NOTE(review): fields are read from the top level of the reply; if the
        # server nests them (e.g. under "observation") every field falls back
        # to its default — confirm against the environment's schema.
        prompt = (
            f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
            f"{SYSTEM_PROMPT}<|eot_id|>"
            f"<|start_header_id|>user<|end_header_id|>\n"
            f"Task: {task_id}\n"
            f"Ad: {obs.get('headline','N/A')} — {obs.get('body_text','N/A')}\n"
            f"Trust Score: {obs.get('advertiser_trust_score','N/A')}\n"
            f"Status: {obs.get('status_message','')}\n"
            f"What is your next action?"
            f"<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
        )
        rows.append({"prompt": prompt, "task_id": task_id})
    # 25x repetition = 100 rows, enough for 1 epoch
    return Dataset.from_list(rows * 25)

# ── REWARD FUNCTION (actually calls the environment) ──────────────────────────

def reward_environment(prompts, completions, task_id, **kwargs):
    """Score each completion by replaying its action in the environment.

    Each completion is parsed as a JSON object (a surrounding Markdown code
    fence is stripped first). The environment is then reset for the matching
    task and the parsed action is submitted as a single ``/step``; the
    ``reward`` field of the reply becomes the score.

    Args:
        prompts: Prompts that produced ``completions``; not used here.
        completions: Generated texts, one per sample.
        task_id: The dataset's ``task_id`` column, aligned with
            ``completions``.
        **kwargs: Any further arguments supplied by the caller; ignored.

    Returns:
        A list with one float per (completion, task id) pair: ``-0.5`` when
        the completion is not a JSON object, ``-0.1`` when an environment call
        fails or its reply holds no usable reward, otherwise the reward
        reported by the environment (``-0.1`` if the reply has no ``reward``
        key).
    """
    request_timeout = 5  # seconds; applied to every environment call
    rewards = []
    # task_id comes from the dataset and is zipped with the completions.
    for completion, t_id in zip(completions, task_id):
        try:
            content = completion.strip()
            if content.startswith("```"):
                # Keep only what sits between the first pair of fences and
                # drop an optional "json" language tag.
                content = content.split("```")[1]
                if content.startswith("json"):
                    content = content[4:]
            action = json.loads(content.strip())
        except Exception:
            # A reward function must never abort training: any parsing
            # problem is treated as malformed output.
            action = None

        # Valid JSON that is not an object (list, number, string, null) cannot
        # carry an action, so it is penalised like malformed JSON.
        if not isinstance(action, dict):
            rewards.append(-0.5)
            continue

        # A missing "action_type" falls back to "query_regulations".
        action_type = action.get("action_type", "query_regulations")
        reasoning = action.get("reasoning", "")

        try:
            # Fresh episode for each reward calculation. The timeout keeps a
            # wedged server from stalling training, and the status check
            # prevents scoring a step against a stale episode.
            reset_res = requests.post(
                f"{ENV_URL}/reset",
                json={"task_id": t_id},
                timeout=request_timeout,
            )
            reset_res.raise_for_status()

            # Submit the model's single action and read back its reward.
            step_res = requests.post(
                f"{ENV_URL}/step",
                json={"action": {"action_type": action_type,
                                 "reasoning": reasoning}},
                timeout=request_timeout,
            )
            step_res.raise_for_status()
            data = step_res.json()
            rewards.append(float(data.get("reward", -0.1)))
        except Exception:
            # Best effort: network errors, HTTP errors and unusable replies
            # all yield a small penalty instead of crashing the trainer.
            rewards.append(-0.1)

    return rewards

def reward_json_format(prompts, completions, **kwargs):
    """Reward completions that parse as JSON and penalise those that do not.

    A completion wrapped in a Markdown code fence is unwrapped (and a leading
    ``json`` tag removed) before parsing.

    Args:
        prompts: Prompts that produced ``completions``; not used here.
        completions: Generated texts, one per sample.
        **kwargs: Any further arguments supplied by the caller; ignored.

    Returns:
        A list with ``0.5`` for each completion that parses as JSON and
        ``-0.5`` for each one that does not.
    """
    fence = "```"

    def _score(text):
        # Any failure — wrong type, missing closing fence, invalid JSON —
        # counts as badly formatted output.
        try:
            payload = text.strip()
            if payload.startswith(fence):
                payload = payload.split(fence)[1]
                payload = payload[4:] if payload.startswith("json") else payload
            json.loads(payload.strip())
        except Exception:
            return -0.5
        return 0.5

    return [_score(item) for item in completions]

# ── MODEL SETUP ───────────────────────────────────────────────────────────────

# Load the base model and its tokenizer with 4-bit loading enabled and a
# 1024-token maximum sequence length.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.1-8B-Instruct",
    max_seq_length=1024,
    load_in_4bit=True,
)
# Wrap the model for parameter-efficient fine-tuning: LoRA settings of r=16 and
# alpha=16 with no dropout, targeting only the q_proj and v_proj modules.
# NOTE(review): "unsloth" presumably selects Unsloth's own gradient
# checkpointing implementation — confirm against the Unsloth docs.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "v_proj"],
    lora_alpha=16,
    lora_dropout=0.0,
    use_gradient_checkpointing="unsloth",
)

# ── TRAINER ───────────────────────────────────────────────────────────────────

# Built at import time: build_dataset() POSTs to the environment server, so the
# server must already be running when this module is loaded.
dataset = build_dataset()

# The two reward functions are passed together: the environment reward and the
# JSON-format bonus.
trainer = GRPOTrainer(
    model=model,
    reward_funcs=[reward_environment, reward_json_format],
    args=GRPOConfig(
        output_dir="outputs/meta-ad-agent",
        learning_rate=5e-6,
        num_train_epochs=1,
        # The per-device batch must hold whole groups of `num_generations`
        # completions on TRL releases that check divisibility per step, so it
        # is set to 4; accumulation is halved to keep 8 samples per update.
        per_device_train_batch_size=4,
        gradient_accumulation_steps=2,
        max_prompt_length=512,
        max_completion_length=128,
        num_generations=4,          # lower = faster, enough for demo
        logging_steps=5,
        save_steps=50,
        report_to="none",
    ),
    train_dataset=dataset,
    # GRPOTrainer's documented parameter for the tokenizer is
    # `processing_class`; `tokenizer` is not part of its signature.
    processing_class=tokenizer,
)

if __name__ == "__main__":
    # The reward function calls the environment server during training, so it
    # has to stay up for the whole run.
    print("Starting GRPO training — environment must be running on :8000")
    trainer.train()

    # Model first, then tokenizer, both written to the same directory.
    final_dir = "outputs/meta-ad-agent-final"
    for artifact in (model, tokenizer):
        artifact.save_pretrained(final_dir)
    print("Done. Model saved to outputs/meta-ad-agent-final")