Spaces:

Aswini-Kumar
/

datacentric-env

Sleeping

App Files Files Community

datacentric-env / training /train.py

Aswini-Kumar

Upload training/train.py with huggingface_hub

054ddcc verified about 1 month ago

raw

history blame contribute delete

14.2 kB

	"""
	training/train.py — GRPO training for DataCentric-Env (v0.5).

	Trains Qwen2.5-3B-Instruct with GRPO via TRL + Unsloth.
	Run end-to-end in Colab (T4 GPU is sufficient).

	Before running:
	1. Deploy the environment to HF Spaces (run deploy_to_hf.py locally)
	2. Set ENV_URL below to your HF Space URL
	3. Runtime → Run all

	What the agent learns:
	- Given a real messy dataset (UCI Adult Census, Pima Diabetes, etc.)
	- Query specialist agents to diagnose issues (domain-aware analysis)
	- Apply recommended fixes to improve classifier accuracy on a frozen holdout
	- Navigate: when to rollback a bad apply, how to interpret feature importance,
	how to prioritize domain-specific issues (zeros-as-missing in medical data)
	"""

	import os
	import json
	import time
	import requests
	import torch
	import numpy as np
	import matplotlib
	matplotlib.use("Agg")
	import matplotlib.pyplot as plt
	from datasets import Dataset
	from unsloth import FastLanguageModel
	from trl import GRPOTrainer, GRPOConfig

	# ── Configuration ──────────────────────────────────────────────────────────────
	ENV_URL = "https://aswini-kumar-datacentric-env.hf.space" # ← your HF Space URL
	MODEL_NAME = "unsloth/Qwen2.5-3B-Instruct"
	MAX_SEQ_LEN = 2048
	N_ROLLOUT_EPISODES = 60
	MAX_STEPS_PER_EPISODE = 12
	LORA_RANK = 16

	# ── System prompt ──────────────────────────────────────────────────────────────
	SYSTEM_PROMPT = """You are an expert data engineer agent. You are given a real-world dataset \
	with known quality issues and must fix it so a frozen classifier achieves the target accuracy.

	You work by querying specialist agents for analysis, then deciding which recommendation to apply.

	WORKFLOW:
	1. Start by calling query_analyst (cost 2) — it gives you a prioritized action plan and \
	references the published benchmark accuracy for this dataset.
	2. Then call the specific agent it recommends (query_cleaner, query_balancer, etc.)
	3. Apply the best recommendation using its rec_id
	4. If accuracy dropped after an apply, use rollback to undo it (max 3 per episode)
	5. Read feature_importance in the response — it shows what the model actually learned

	DOMAIN RULES (critical):
	- In medical datasets, zero values for physiological measurements are IMPOSSIBLE — they mean \
	missing data. Always apply zero_to_nan_impute before other cleaning.
	- In financial datasets, heavily skewed features (like capital-gain) should be log-transformed.
	- Removing rows is dangerous — data integrity limit is 10% of training rows max.
	- Large augmentation (>200 rows) may overfit training set and HURT holdout accuracy. \
	If accuracy drops after augmentation, rollback and try balancer instead.

	OUTPUT FORMAT — respond with valid JSON only, no explanation:
	For queries: {"action": "query_analyst"} or {"action": "query_cleaner"} etc.
	For apply: {"action": "apply", "rec_id": "<exact_id_from_recommendations>"}
	For rollback: {"action": "rollback"}"""


	# ── Model setup ────────────────────────────────────────────────────────────────
	print("Loading model...")
	model, tokenizer = FastLanguageModel.from_pretrained(
	model_name=MODEL_NAME,
	max_seq_length=MAX_SEQ_LEN,
	load_in_4bit=True,
	)
	model = FastLanguageModel.get_peft_model(
	model,
	r=LORA_RANK,
	lora_alpha=LORA_RANK * 2,
	target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
	"gate_proj", "up_proj", "down_proj"],
	lora_dropout=0.05,
	bias="none",
	use_gradient_checkpointing=True,
	)
	print(f"Model loaded: {MODEL_NAME} with LoRA r={LORA_RANK}")


	# ── Prompt builder ─────────────────────────────────────────────────────────────
	def build_prompt(obs: dict) -> str:
	"""
	Build a compact but information-rich prompt from the observation.
	Excludes the full pending_recommendations dict (too verbose) —
	only includes rec_ids and their reason.
	"""
	# Compact observation for the prompt
	compact = {
	"dataset": obs.get("dataset", {}).get("name", "unknown"),
	"domain": obs.get("dataset", {}).get("domain", ""),
	"known_issues": obs.get("dataset", {}).get("known_issues", [])[:2],
	"current_accuracy": obs.get("current_accuracy"),
	"target_accuracy": obs.get("target_accuracy"),
	"accuracy_gap": obs.get("accuracy_gap"),
	"benchmarks": obs.get("benchmarks", {}),
	"budget_remaining": obs.get("budget_remaining"),
	"dataset_stats": obs.get("dataset_stats", {}),
	"pending_recommendations": {
	rid: {
	"agent": info.get("agent"),
	"type": info.get("type"),
	"reason": info.get("reason", "")[:120], # truncate
	"domain_informed": info.get("domain_informed", False),
	}
	for rid, info in obs.get("pending_recommendations", {}).items()
	},
	"episode_trace": obs.get("episode_trace", [])[-3:], # last 3 steps
	"feature_importance": obs.get("feature_importance", {}).get("top_positive", [])[:2],
	"available_actions": obs.get("available_actions"),
	}
	return (
	f"<\|system\|>\n{SYSTEM_PROMPT}\n"
	f"<\|user\|>\nCurrent environment state:\n{json.dumps(compact, indent=2)}\n"
	f"<\|assistant\|>\n"
	)


	# ── Episode rollout ────────────────────────────────────────────────────────────
	def run_episode(difficulty: str = "easy") -> list[dict]:
	"""
	Run one full episode. Returns list of (prompt, response, reward) tuples.
	"""
	# Reset — get session_id and initial observation
	try:
	resp = requests.post(
	f"{ENV_URL}/reset",
	json={"difficulty": difficulty},
	timeout=60,
	)
	resp.raise_for_status()
	except Exception as e:
	print(f" Reset failed: {e}")
	return []

	obs = resp.json()
	session_id = obs.get("session_id")
	if not session_id:
	print(" No session_id in reset response.")
	return []

	trajectories = []

	for step_num in range(MAX_STEPS_PER_EPISODE):
	prompt = build_prompt(obs)
	inputs = tokenizer(prompt, return_tensors="pt", truncation=True,
	max_length=MAX_SEQ_LEN).to("cuda")

	with torch.no_grad():
	outputs = model.generate(
	**inputs,
	max_new_tokens=80,
	temperature=0.8,
	do_sample=True,
	pad_token_id=tokenizer.eos_token_id,
	)
	response = tokenizer.decode(
	outputs[0][inputs["input_ids"].shape[1]:],
	skip_special_tokens=True
	).strip()

	# Parse and validate action
	try:
	action = json.loads(response)
	if "action" not in action:
	raise ValueError("missing 'action' key")
	except Exception:
	# Invalid JSON — format grader will penalize this
	action = {"action": "query_analyst"}

	# Always inject session_id
	payload = {"session_id": session_id, **action}

	try:
	step_resp = requests.post(
	f"{ENV_URL}/step",
	json=payload,
	timeout=30,
	)
	step_resp.raise_for_status()
	result = step_resp.json()
	except Exception as e:
	print(f" Step failed: {e}")
	break

	reward = float(result.get("reward", 0.001))
	trajectories.append({
	"prompt": prompt,
	"response": response,
	"reward": reward,
	})

	obs = result.get("observation", obs)
	if result.get("done"):
	break

	return trajectories


	# ── Collect rollouts across difficulties ───────────────────────────────────────
	print(f"\nCollecting {N_ROLLOUT_EPISODES} episodes...")
	all_trajectories = []
	episode_rewards = []
	difficulty_schedule = (
	["easy"] * 20 + ["medium"] * 20 + ["hard"] * 20
	)

	for ep_idx, difficulty in enumerate(difficulty_schedule):
	trajs = run_episode(difficulty=difficulty)
	if trajs:
	ep_reward = np.mean([t["reward"] for t in trajs])
	episode_rewards.append(ep_reward)
	all_trajectories.extend(trajs)
	if ep_idx % 10 == 0:
	print(f" Episode {ep_idx}/{N_ROLLOUT_EPISODES} \| "
	f"difficulty={difficulty} \| mean_reward={ep_reward:.4f} \| "
	f"n_steps={len(trajs)}")
	time.sleep(0.5) # avoid hammering the server

	print(f"\nTotal training samples: {len(all_trajectories)}")
	print(f"Mean reward across all episodes: {np.mean(episode_rewards):.4f}")

	if len(all_trajectories) < 10:
	raise RuntimeError("Too few training samples collected. Check ENV_URL and server status.")


	# ── Build GRPO training dataset ────────────────────────────────────────────────
	# GRPO needs: prompt, completion (response), reward
	train_dataset = Dataset.from_list([
	{
	"prompt": t["prompt"],
	"completion": t["response"],
	"reward": t["reward"],
	}
	for t in all_trajectories
	if t["reward"] > 0.001 # filter degenerate samples
	])
	print(f"Training dataset: {len(train_dataset)} samples")


	# ── GRPO training ──────────────────────────────────────────────────────────────
	config = GRPOConfig(
	output_dir="./datacentric-grpo",
	num_train_epochs=3,
	per_device_train_batch_size=2,
	gradient_accumulation_steps=4,
	learning_rate=2e-5,
	warmup_ratio=0.1,
	lr_scheduler_type="cosine",
	logging_steps=5,
	save_steps=50,
	report_to="none",
	max_grad_norm=0.3,
	fp16=True,
	dataloader_num_workers=0,
	)

	trainer = GRPOTrainer(
	model=model,
	args=config,
	train_dataset=train_dataset,
	tokenizer=tokenizer,
	reward_funcs=[], # rewards come from environment, already in dataset
	)

	print("\nStarting GRPO training...")
	train_result = trainer.train()
	print(f"Training complete. Final loss: {train_result.training_loss:.4f}")


	# ── Sample inspection — check for reward hacking ──────────────────────────────
	print("\n--- Sampling 3 agent generations (reward hacking check) ---")
	for i in range(3):
	try:
	resp = requests.post(f"{ENV_URL}/reset", json={"difficulty": "easy"}, timeout=60)
	obs = resp.json()
	session_id = obs["session_id"]
	prompt = build_prompt(obs)
	inputs = tokenizer(prompt, return_tensors="pt", truncation=True,
	max_length=MAX_SEQ_LEN).to("cuda")
	with torch.no_grad():
	out = model.generate(**inputs, max_new_tokens=80, do_sample=False)
	response = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
	print(f"\n Sample {i+1}: {response[:200]}")

	try:
	action = json.loads(response.strip())
	payload = {"session_id": session_id, **action}
	step_r = requests.post(f"{ENV_URL}/step", json=payload, timeout=30).json()
	print(f" → reward={step_r.get('reward')} \| accuracy={step_r.get('observation', {}).get('current_accuracy')}")
	except Exception as e:
	print(f" → parse/step failed: {e}")
	except Exception as e:
	print(f" Sample {i+1} failed: {e}")


	# ── Save model — Unsloth merge path (NOT naive save_pretrained) ───────────────
	print("\nSaving model (Unsloth merged_16bit path)...")
	model.save_pretrained_merged(
	"datacentric-grpo-final",
	tokenizer,
	save_method="merged_16bit",
	)
	print("Model saved to ./datacentric-grpo-final")


	# ── Plot training curves → results.png ────────────────────────────────────────
	fig, axes = plt.subplots(1, 2, figsize=(14, 5))
	fig.suptitle("DataCentric-Env — GRPO Training Results", fontsize=14, fontweight="bold")

	# Episode rewards
	ax1 = axes[0]
	ax1.plot(episode_rewards, color="#4f46e5", linewidth=1.5, alpha=0.6, label="Episode mean reward")
	if len(episode_rewards) >= 5:
	smoothed = np.convolve(episode_rewards, np.ones(5)/5, mode="valid")
	ax1.plot(range(4, len(episode_rewards)), smoothed,
	color="#4f46e5", linewidth=2.5, label="5-ep moving avg")
	ax1.axvline(x=20, color="gray", linestyle="--", alpha=0.5, label="→ medium")
	ax1.axvline(x=40, color="gray", linestyle=":", alpha=0.5, label="→ hard")
	ax1.set_xlabel("Episode")
	ax1.set_ylabel("Mean Reward")
	ax1.set_title("Reward Progression Over Episodes")
	ax1.legend()
	ax1.set_ylim(0, 1)
	ax1.grid(alpha=0.3)

	# Reward distribution
	ax2 = axes[1]
	rewards_array = [t["reward"] for t in all_trajectories]
	ax2.hist(rewards_array, bins=30, color="#7c3aed", alpha=0.7, edgecolor="white")
	ax2.axvline(np.mean(rewards_array), color="#ef4444", linewidth=2,
	label=f"Mean={np.mean(rewards_array):.3f}")
	ax2.axvline(np.median(rewards_array), color="#f97316", linewidth=2,
	linestyle="--", label=f"Median={np.median(rewards_array):.3f}")
	ax2.set_xlabel("Reward")
	ax2.set_ylabel("Count")
	ax2.set_title("Distribution of Step Rewards")
	ax2.legend()
	ax2.grid(alpha=0.3)

	plt.tight_layout()
	plt.savefig("results.png", dpi=150, bbox_inches="tight")
	print("results.png saved.")
	plt.show()

	print("\n✅ All done. Submit results.png + datacentric-grpo-final/ directory.")