Spaces:

Aswini-Kumar
/

datacentric-env

Sleeping

App Files Files Community

Aswini-Kumar commited on 22 days ago

Commit

054ddcc

verified ·

1 Parent(s): d8b5f3c

Upload training/train.py with huggingface_hub

Browse files

Files changed (1) hide show

training/train.py +293 -85

training/train.py CHANGED Viewed

@@ -1,71 +1,199 @@
 """
-GRPO training script — DataCentric-Env
 Trains Qwen2.5-3B-Instruct with GRPO via TRL + Unsloth.
-Run in Colab (GPU required). Make sure the environment server is deployed
-to HuggingFace Spaces and set ENV_URL below before running.
 """
-from unsloth import FastLanguageModel
-from trl import GRPOTrainer, GRPOConfig
-from datasets import Dataset
-import requests
 import json
 import torch
-# ─── Configuration ───────────────────────────────────────────────────────────
-ENV_URL = "https://aswini-kumar-datacentric-env.hf.space"  # HuggingFace Space URL
-SYSTEM_PROMPT = """You are a data quality agent. You receive dataset statistics and must choose which specialist tool to call to improve the dataset so a downstream classifier performs better.
-Always respond with valid JSON in this exact format:
-{"agent": "<tool_name>", "target": "<column_or_all>", "strategy": "<strategy_name>"}
-Available tools: cleaner, augmenter, balancer, relabeler, validator
-Cleaner strategies: median_impute, mean_impute, drop_rows
-Balancer strategies: undersample
-Relabeler: use when labels are noisy, costs 2 budget points."""
-# ─── Model setup ─────────────────────────────────────────────────────────────
 model, tokenizer = FastLanguageModel.from_pretrained(
-    model_name="unsloth/Qwen2.5-3B-Instruct",
-    max_seq_length=1024,
     load_in_4bit=True,
 )
-model = FastLanguageModel.get_peft_model(model, r=16, lora_alpha=32)
-# ─── Rollout function ─────────────────────────────────────────────────────────
-def build_prompt(obs):
-    obs_text = json.dumps(obs, indent=2)
-    return f"{SYSTEM_PROMPT}\n\nCurrent state:\n{obs_text}\n\nYour action:"
-def rollout(prompt="start"):
-    """Run one episode and return (prompt, response, reward) tuples."""
-    obs = requests.post(f"{ENV_URL}/reset").json()
     trajectories = []
-    for step in range(10):
-        full_prompt = build_prompt(obs)
-        inputs = tokenizer(full_prompt, return_tensors="pt").to("cuda")
         with torch.no_grad():
-            outputs = model.generate(**inputs, max_new_tokens=100, temperature=0.7)
-        response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
-        # Parse action
         try:
-            action = json.loads(response.strip())
         except Exception:
-            action = {"agent": "validator"}  # fallback
-        result = requests.post(f"{ENV_URL}/step", json=action).json()
-        reward = result.get("reward", -1.0)
         trajectories.append({
-            "prompt": full_prompt,
             "response": response,
             "reward": reward,
         })
@@ -77,70 +205,150 @@ def rollout(prompt="start"):
     return trajectories
-# ─── Collect rollouts ─────────────────────────────────────────────────────────
-print("Collecting rollouts...")
 all_trajectories = []
-for episode in range(50):
-    all_trajectories.extend(rollout("start"))
-    if episode % 10 == 0:
-        print(f"  Episode {episode}/50 collected")
-# ─── Build training dataset ───────────────────────────────────────────────────
-dataset = Dataset.from_list([
-    {"prompt": t["prompt"], "chosen": t["response"], "reward": t["reward"]}
     for t in all_trajectories
 ])
-# ─── GRPO config ─────────────────────────────────────────────────────────────
 config = GRPOConfig(
     output_dir="./datacentric-grpo",
     num_train_epochs=3,
-    per_device_train_batch_size=4,
-    learning_rate=5e-5,
-    logging_steps=10,
-    save_steps=100,
-    report_to="none",  # swap to "wandb" if you want live curves
 )
-# ─── Monitor logging ──────────────────────────────────────────────────────────
-def log_sample(step):
-    """Log a live episode sample every 20 steps — watch for reward hacking."""
-    obs = requests.post(f"{ENV_URL}/reset").json()
-    print(f"\n--- Generation sample at step {step} ---")
-    for t in range(5):
-        inputs = tokenizer(build_prompt(obs), return_tensors="pt").to("cuda")
-        with torch.no_grad():
-            out = model.generate(**inputs, max_new_tokens=80)
-        response = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
-        print(f"  Step {t}: agent output = {response[:120]}")
-        try:
-            action = json.loads(response.strip())
-        except Exception:
-            print("  WARNING: agent produced invalid JSON — format reward not working")
-            action = {"agent": "validator"}
-        result = requests.post(f"{ENV_URL}/step", json=action).json()
-        print(f"  Reward: {result.get('reward')} | Accuracy: {result['info']['new_accuracy']} | Done: {result.get('done')}")
-        obs = result.get("observation", obs)
-        if result.get("done"):
-            break
-# ─── Train ────────────────────────────────────────────────────────────────────
 trainer = GRPOTrainer(
     model=model,
     args=config,
-    train_dataset=dataset,
     tokenizer=tokenizer,
 )
-trainer.train()
-# ─── Save — use Unsloth merge path, NOT naive save_pretrained ────────────────
-# IMPORTANT: do NOT upcast 4-bit model to 16-bit then merge naively.
-# That damages model quality. Use the Unsloth merge path instead.
 model.save_pretrained_merged(
     "datacentric-grpo-final",
     tokenizer,
-    save_method="merged_16bit",  # correct merge path via Unsloth
 )
-print("Training complete. Test inference immediately before moving on.")

 """
+training/train.py — GRPO training for DataCentric-Env (v0.5).
 Trains Qwen2.5-3B-Instruct with GRPO via TRL + Unsloth.
+Run end-to-end in Colab (T4 GPU is sufficient).
+Before running:
+  1. Deploy the environment to HF Spaces (run deploy_to_hf.py locally)
+  2. Set ENV_URL below to your HF Space URL
+  3. Runtime → Run all
+What the agent learns:
+  - Given a real messy dataset (UCI Adult Census, Pima Diabetes, etc.)
+  - Query specialist agents to diagnose issues (domain-aware analysis)
+  - Apply recommended fixes to improve classifier accuracy on a frozen holdout
+  - Navigate: when to rollback a bad apply, how to interpret feature importance,
+    how to prioritize domain-specific issues (zeros-as-missing in medical data)
 """
+import os
 import json
+import time
+import requests
 import torch
+import numpy as np
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+from datasets import Dataset
+from unsloth import FastLanguageModel
+from trl import GRPOTrainer, GRPOConfig
+# ── Configuration ──────────────────────────────────────────────────────────────
+ENV_URL = "https://aswini-kumar-datacentric-env.hf.space"  # ← your HF Space URL
+MODEL_NAME = "unsloth/Qwen2.5-3B-Instruct"
+MAX_SEQ_LEN = 2048
+N_ROLLOUT_EPISODES = 60
+MAX_STEPS_PER_EPISODE = 12
+LORA_RANK = 16
+# ── System prompt ──────────────────────────────────────────────────────────────
+SYSTEM_PROMPT = """You are an expert data engineer agent. You are given a real-world dataset \
+with known quality issues and must fix it so a frozen classifier achieves the target accuracy.
+You work by querying specialist agents for analysis, then deciding which recommendation to apply.
+WORKFLOW:
+1. Start by calling query_analyst (cost 2) — it gives you a prioritized action plan and \
+references the published benchmark accuracy for this dataset.
+2. Then call the specific agent it recommends (query_cleaner, query_balancer, etc.)
+3. Apply the best recommendation using its rec_id
+4. If accuracy dropped after an apply, use rollback to undo it (max 3 per episode)
+5. Read feature_importance in the response — it shows what the model actually learned
+DOMAIN RULES (critical):
+- In medical datasets, zero values for physiological measurements are IMPOSSIBLE — they mean \
+missing data. Always apply zero_to_nan_impute before other cleaning.
+- In financial datasets, heavily skewed features (like capital-gain) should be log-transformed.
+- Removing rows is dangerous — data integrity limit is 10% of training rows max.
+- Large augmentation (>200 rows) may overfit training set and HURT holdout accuracy. \
+If accuracy drops after augmentation, rollback and try balancer instead.
+OUTPUT FORMAT — respond with valid JSON only, no explanation:
+For queries: {"action": "query_analyst"} or {"action": "query_cleaner"} etc.
+For apply:   {"action": "apply", "rec_id": "<exact_id_from_recommendations>"}
+For rollback: {"action": "rollback"}"""
+# ── Model setup ────────────────────────────────────────────────────────────────
+print("Loading model...")
 model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name=MODEL_NAME,
+    max_seq_length=MAX_SEQ_LEN,
     load_in_4bit=True,
 )
+model = FastLanguageModel.get_peft_model(
+    model,
+    r=LORA_RANK,
+    lora_alpha=LORA_RANK * 2,
+    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
+                    "gate_proj", "up_proj", "down_proj"],
+    lora_dropout=0.05,
+    bias="none",
+    use_gradient_checkpointing=True,
+)
+print(f"Model loaded: {MODEL_NAME} with LoRA r={LORA_RANK}")
+# ── Prompt builder ─────────────────────────────────────────────────────────────
+def build_prompt(obs: dict) -> str:
+    """
+    Build a compact but information-rich prompt from the observation.
+    Excludes the full pending_recommendations dict (too verbose) —
+    only includes rec_ids and their reason.
+    """
+    # Compact observation for the prompt
+    compact = {
+        "dataset": obs.get("dataset", {}).get("name", "unknown"),
+        "domain": obs.get("dataset", {}).get("domain", ""),
+        "known_issues": obs.get("dataset", {}).get("known_issues", [])[:2],
+        "current_accuracy": obs.get("current_accuracy"),
+        "target_accuracy": obs.get("target_accuracy"),
+        "accuracy_gap": obs.get("accuracy_gap"),
+        "benchmarks": obs.get("benchmarks", {}),
+        "budget_remaining": obs.get("budget_remaining"),
+        "dataset_stats": obs.get("dataset_stats", {}),
+        "pending_recommendations": {
+            rid: {
+                "agent": info.get("agent"),
+                "type": info.get("type"),
+                "reason": info.get("reason", "")[:120],  # truncate
+                "domain_informed": info.get("domain_informed", False),
+            }
+            for rid, info in obs.get("pending_recommendations", {}).items()
+        },
+        "episode_trace": obs.get("episode_trace", [])[-3:],  # last 3 steps
+        "feature_importance": obs.get("feature_importance", {}).get("top_positive", [])[:2],
+        "available_actions": obs.get("available_actions"),
+    }
+    return (
+        f"<|system|>\n{SYSTEM_PROMPT}\n"
+        f"<|user|>\nCurrent environment state:\n{json.dumps(compact, indent=2)}\n"
+        f"<|assistant|>\n"
+    )
+# ── Episode rollout ────────────────────────────────────────────────────────────
+def run_episode(difficulty: str = "easy") -> list[dict]:
+    """
+    Run one full episode. Returns list of (prompt, response, reward) tuples.
+    """
+    # Reset — get session_id and initial observation
+    try:
+        resp = requests.post(
+            f"{ENV_URL}/reset",
+            json={"difficulty": difficulty},
+            timeout=60,
+        )
+        resp.raise_for_status()
+    except Exception as e:
+        print(f"  Reset failed: {e}")
+        return []
+    obs = resp.json()
+    session_id = obs.get("session_id")
+    if not session_id:
+        print("  No session_id in reset response.")
+        return []
     trajectories = []
+    for step_num in range(MAX_STEPS_PER_EPISODE):
+        prompt = build_prompt(obs)
+        inputs = tokenizer(prompt, return_tensors="pt", truncation=True,
+                           max_length=MAX_SEQ_LEN).to("cuda")
         with torch.no_grad():
+            outputs = model.generate(
+                **inputs,
+                max_new_tokens=80,
+                temperature=0.8,
+                do_sample=True,
+                pad_token_id=tokenizer.eos_token_id,
+            )
+        response = tokenizer.decode(
+            outputs[0][inputs["input_ids"].shape[1]:],
+            skip_special_tokens=True
+        ).strip()
+        # Parse and validate action
         try:
+            action = json.loads(response)
+            if "action" not in action:
+                raise ValueError("missing 'action' key")
         except Exception:
+            # Invalid JSON — format grader will penalize this
+            action = {"action": "query_analyst"}
+        # Always inject session_id
+        payload = {"session_id": session_id, **action}
+        try:
+            step_resp = requests.post(
+                f"{ENV_URL}/step",
+                json=payload,
+                timeout=30,
+            )
+            step_resp.raise_for_status()
+            result = step_resp.json()
+        except Exception as e:
+            print(f"  Step failed: {e}")
+            break
+        reward = float(result.get("reward", 0.001))
         trajectories.append({
+            "prompt": prompt,
             "response": response,
             "reward": reward,
         })
     return trajectories
+# ── Collect rollouts across difficulties ───────────────────────────────────────
+print(f"\nCollecting {N_ROLLOUT_EPISODES} episodes...")
 all_trajectories = []
+episode_rewards = []
+difficulty_schedule = (
+    ["easy"] * 20 + ["medium"] * 20 + ["hard"] * 20
+)
+for ep_idx, difficulty in enumerate(difficulty_schedule):
+    trajs = run_episode(difficulty=difficulty)
+    if trajs:
+        ep_reward = np.mean([t["reward"] for t in trajs])
+        episode_rewards.append(ep_reward)
+        all_trajectories.extend(trajs)
+        if ep_idx % 10 == 0:
+            print(f"  Episode {ep_idx}/{N_ROLLOUT_EPISODES} | "
+                  f"difficulty={difficulty} | mean_reward={ep_reward:.4f} | "
+                  f"n_steps={len(trajs)}")
+    time.sleep(0.5)  # avoid hammering the server
+print(f"\nTotal training samples: {len(all_trajectories)}")
+print(f"Mean reward across all episodes: {np.mean(episode_rewards):.4f}")
+if len(all_trajectories) < 10:
+    raise RuntimeError("Too few training samples collected. Check ENV_URL and server status.")
+# ── Build GRPO training dataset ────────────────────────────────────────────────
+# GRPO needs: prompt, completion (response), reward
+train_dataset = Dataset.from_list([
+    {
+        "prompt": t["prompt"],
+        "completion": t["response"],
+        "reward": t["reward"],
+    }
     for t in all_trajectories
+    if t["reward"] > 0.001  # filter degenerate samples
 ])
+print(f"Training dataset: {len(train_dataset)} samples")
+# ── GRPO training ──────────────────────────────────────────────────────────────
 config = GRPOConfig(
     output_dir="./datacentric-grpo",
     num_train_epochs=3,
+    per_device_train_batch_size=2,
+    gradient_accumulation_steps=4,
+    learning_rate=2e-5,
+    warmup_ratio=0.1,
+    lr_scheduler_type="cosine",
+    logging_steps=5,
+    save_steps=50,
+    report_to="none",
+    max_grad_norm=0.3,
+    fp16=True,
+    dataloader_num_workers=0,
 )
 trainer = GRPOTrainer(
     model=model,
     args=config,
+    train_dataset=train_dataset,
     tokenizer=tokenizer,
+    reward_funcs=[],  # rewards come from environment, already in dataset
 )
+print("\nStarting GRPO training...")
+train_result = trainer.train()
+print(f"Training complete. Final loss: {train_result.training_loss:.4f}")
+# ── Sample inspection — check for reward hacking ──────────────────────────────
+print("\n--- Sampling 3 agent generations (reward hacking check) ---")
+for i in range(3):
+    try:
+        resp = requests.post(f"{ENV_URL}/reset", json={"difficulty": "easy"}, timeout=60)
+        obs = resp.json()
+        session_id = obs["session_id"]
+        prompt = build_prompt(obs)
+        inputs = tokenizer(prompt, return_tensors="pt", truncation=True,
+                           max_length=MAX_SEQ_LEN).to("cuda")
+        with torch.no_grad():
+            out = model.generate(**inputs, max_new_tokens=80, do_sample=False)
+        response = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
+        print(f"\n  Sample {i+1}: {response[:200]}")
+        try:
+            action = json.loads(response.strip())
+            payload = {"session_id": session_id, **action}
+            step_r = requests.post(f"{ENV_URL}/step", json=payload, timeout=30).json()
+            print(f"  → reward={step_r.get('reward')} | accuracy={step_r.get('observation', {}).get('current_accuracy')}")
+        except Exception as e:
+            print(f"  → parse/step failed: {e}")
+    except Exception as e:
+        print(f"  Sample {i+1} failed: {e}")
+# ── Save model — Unsloth merge path (NOT naive save_pretrained) ───────────────
+print("\nSaving model (Unsloth merged_16bit path)...")
 model.save_pretrained_merged(
     "datacentric-grpo-final",
     tokenizer,
+    save_method="merged_16bit",
 )
+print("Model saved to ./datacentric-grpo-final")
+# ── Plot training curves → results.png ────────────────────────────────────────
+fig, axes = plt.subplots(1, 2, figsize=(14, 5))
+fig.suptitle("DataCentric-Env — GRPO Training Results", fontsize=14, fontweight="bold")
+# Episode rewards
+ax1 = axes[0]
+ax1.plot(episode_rewards, color="#4f46e5", linewidth=1.5, alpha=0.6, label="Episode mean reward")
+if len(episode_rewards) >= 5:
+    smoothed = np.convolve(episode_rewards, np.ones(5)/5, mode="valid")
+    ax1.plot(range(4, len(episode_rewards)), smoothed,
+             color="#4f46e5", linewidth=2.5, label="5-ep moving avg")
+ax1.axvline(x=20, color="gray", linestyle="--", alpha=0.5, label="→ medium")
+ax1.axvline(x=40, color="gray", linestyle=":", alpha=0.5, label="→ hard")
+ax1.set_xlabel("Episode")
+ax1.set_ylabel("Mean Reward")
+ax1.set_title("Reward Progression Over Episodes")
+ax1.legend()
+ax1.set_ylim(0, 1)
+ax1.grid(alpha=0.3)
+# Reward distribution
+ax2 = axes[1]
+rewards_array = [t["reward"] for t in all_trajectories]
+ax2.hist(rewards_array, bins=30, color="#7c3aed", alpha=0.7, edgecolor="white")
+ax2.axvline(np.mean(rewards_array), color="#ef4444", linewidth=2,
+            label=f"Mean={np.mean(rewards_array):.3f}")
+ax2.axvline(np.median(rewards_array), color="#f97316", linewidth=2,
+            linestyle="--", label=f"Median={np.median(rewards_array):.3f}")
+ax2.set_xlabel("Reward")
+ax2.set_ylabel("Count")
+ax2.set_title("Distribution of Step Rewards")
+ax2.legend()
+ax2.grid(alpha=0.3)
+plt.tight_layout()
+plt.savefig("results.png", dpi=150, bbox_inches="tight")
+print("results.png saved.")
+plt.show()
+print("\n✅ All done. Submit results.png + datacentric-grpo-final/ directory.")