Spaces:

Meta-HF-hackathon
/

updated-policy

Sleeping

App Files Files Community

updated-policy / logger /trajectory.py

srinjoyd

add files

e60af4b 26 days ago

raw

history blame contribute delete

58.4 kB

	# /// script
	# dependencies = [
	# "requests",
	# "huggingface_hub",
	# ]
	# ///

	"""
	=============================================================
	SRE INCIDENT RESPONSE — COMPREHENSIVE TRAJECTORY COLLECTOR
	=============================================================

	Generates fine-tuning trajectories from the SRE incident simulator covering:
	• All 10 tasks (8 training + 2 held-out compound scenarios)
	• All 4 pools:
	A — Phase 1 only (incident response)
	B — Phase 2 only (code investigation, oracle belief injected)
	C — Joint P1→P2 (full two-phase pipeline)
	D — Held-out joint (generalization test)
	• Full 17-action action space across both phases
	• Multiple models from 1.5B to 70B+ (round-robin rotation)
	• ALL episodes retained — negative-reward trajectories are kept as
	hard-negative examples for RL/GRPO training

	Output files:
	sre_raw_trajectories.json — full episode records with score breakdowns
	sre_sft_dataset.jsonl — per-step SFT samples (both phases, all rewards)
	sre_grpo_dataset.jsonl — (prompt, chosen, rejected) pairs for GRPO/DPO

	Usage:
	export HF_TOKEN=hf_...
	python sre_finetune_collector.py

	Optional env vars:
	NUM_EPISODES total episodes to collect (default: 200)
	BASE_URL simulator URL (default: HF Space URL)
	MAX_STEPS max steps per episode (default: 35)
	SLEEP_BETWEEN seconds between steps (default: 0.6)
	"""

	from __future__ import annotations

	import json
	import os
	import random
	import time
	import traceback
	from collections import defaultdict
	from typing import Any, Dict, List, Optional, Tuple

	import requests

	def upload_checkpoint(api, repo_id):
	for fname in ["sre_raw_trajectories.jsonl", "sre_sft_dataset.jsonl", "sre_grpo_dataset.jsonl"]:
	if os.path.exists(fname):
	try:
	api.upload_file(
	path_or_fileobj=fname,
	path_in_repo=fname,
	repo_id=repo_id,
	repo_type="dataset",
	)
	print(f"✅ Uploaded {fname}")
	except Exception as e:
	print(f"❌ Upload failed {fname}: {e}")

	# ──────────────────────────────────────────────────────────────────────────────
	# Configuration
	# ──────────────────────────────────────────────────────────────────────────────

	HF_TOKEN = os.environ.get("HF_TOKEN")
	BASE_URL = os.environ.get("BASE_URL", "https://meta-hf-hackathon-updated-policy.hf.space")
	HF_ROUTER_URL = "https://router.huggingface.co/v1/chat/completions"
	NUM_EPISODES = int(os.environ.get("NUM_EPISODES", "100"))
	MAX_STEPS = int(os.environ.get("MAX_STEPS", "35"))
	SLEEP_BETWEEN = float(os.environ.get("SLEEP_BETWEEN", "0.6"))

	# ── Model ─────────────────────────────────────────────────────────────────────
	MODELS: List[str] = [
	"Qwen/Qwen2.5-7B-Instruct:fastest",
	]

	# ── Task registry per pool ────────────────────────────────────────────────────
	# Pool A: P1-only incident response (all 8 training tasks)
	# Pool B: P2-only code investigation (oracle belief injected; 7 tasks with code)
	# Pool C: Joint P1→P2 full pipeline (all 8 training tasks)
	# Pool D: Held-out joint (2 compound scenarios — generalization evaluation)
	POOL_TASKS: Dict[str, List[str]] = {
	"A": [
	"memory_leak", "cascading_failure", "distributed_deadlock",
	"circuit_breaker_noop", "aliased_fault", "severity_inversion",
	"confidence_inversion", "info_ordering",
	],
	"B": [
	"memory_leak", "cascading_failure", "distributed_deadlock",
	"aliased_fault", "severity_inversion", "confidence_inversion", "info_ordering",
	],
	"C": [
	"memory_leak", "cascading_failure", "distributed_deadlock",
	"circuit_breaker_noop", "aliased_fault", "severity_inversion",
	"confidence_inversion", "info_ordering",
	],
	"D": [
	"heldout_aliased_severity", "heldout_confidence_ordering",
	],
	}

	# Episode budget distribution across pools (must sum to 1.0)
	POOL_WEIGHTS: Dict[str, float] = {"A": 0.35, "B": 0.20, "C": 0.35, "D": 0.10}

	# ── Action space definitions ──────────────────────────────────────────────────
	P1_DIAGNOSTIC = ["view_alerts", "query_logs", "check_metrics", "check_dependencies",
	"check_deploy_history", "run_health_check"]
	P1_REMEDIATION = ["restart_service", "rollback_deploy", "scale_service"]
	P1_TERMINAL = ["declare_root_cause", "transition_to_phase2"]
	P1_ACTIONS = P1_DIAGNOSTIC + P1_REMEDIATION + P1_TERMINAL

	P2_DIAGNOSTIC = ["list_dir", "read_file", "search_code", "get_git_log", "get_file_diff"]
	P2_TERMINAL = ["propose_patch", "declare_no_change"]
	P2_ACTIONS = P2_DIAGNOSTIC + P2_TERMINAL

	ALL_SERVICES = ["api_gateway", "auth", "orders", "payment", "cache", "database", "queue"]

	TARGETED_ACTIONS = {
	"query_logs", "check_metrics", "check_dependencies", "check_deploy_history",
	"run_health_check", "restart_service", "rollback_deploy", "scale_service",
	}

	# Service dependency graph (for smarter fallbacks)
	DEPENDENCY_GRAPH: Dict[str, List[str]] = {
	"api_gateway": ["auth", "orders", "cache"],
	"auth": ["database"],
	"orders": ["database", "payment", "auth"],
	"payment": ["queue", "database"],
	"cache": [],
	"database": [],
	"queue": [],
	}


	# ──────────────────────────────────────────────────────────────────────────────
	# System Prompts
	# ──────────────────────────────────────────────────────────────────────────────

	SYSTEM_PROMPT_P1 = """You are an expert SRE handling a production incident in a microservices system.

	## Service Topology (downstream ← upstream)
	api_gateway ← auth, orders, cache
	auth ← database
	orders ← database, payment, auth
	payment ← queue, database
	cache, database, queue ← (no dependencies)

	## Phase 1 Action Space
	Output EXACTLY ONE valid JSON action per turn. No markdown, no explanation.

	Diagnostic (read-only):
	{"action_type": "view_alerts"}
	{"action_type": "query_logs", "target_service": "<svc>", "parameters": {"level": "ERROR", "keyword": "<optional>", "limit": 20}}
	{"action_type": "check_metrics", "target_service": "<svc>"}
	{"action_type": "check_dependencies", "target_service": "<svc>"}
	{"action_type": "check_deploy_history", "target_service": "<svc>"}
	{"action_type": "run_health_check", "target_service": "<svc>"}

	Remediation (mutates state):
	{"action_type": "restart_service", "target_service": "<svc>"}
	{"action_type": "rollback_deploy", "target_service": "<svc>"}
	{"action_type": "scale_service", "target_service": "<svc>", "parameters": {"replicas": 5}}

	Declare root cause (ALL tasks — always call this once you have a diagnosis):
	{"action_type": "declare_root_cause", "parameters": {"root_cause": "<specific diagnosis — service, what failed, why>"}}

	Then for joint-mode tasks, ALSO transition to code investigation:
	{"action_type": "transition_to_phase2", "parameters": {"belief": {
	"suspected_service": "<root_cause_svc>",
	"suspected_fault_class": "memory_leak\|config_change\|deadlock\|dep_upgrade\|none",
	"service_confidence": 0.85,
	"fault_confidence": 0.80,
	"evidence_gaps": ["<what_you_didnt_check>"],
	"estimated_p2_cost": "low\|medium\|high",
	"decision": "transition",
	"reasoning": "<concise evidence summary>"
	}}}

	## Investigation Strategy
	1. ALWAYS start with view_alerts to understand severity and scope
	2. check_metrics on the highest-alert service first
	3. query_logs (level=ERROR) on degraded/down services
	4. check_dependencies on the affected service to find upstream causes
	5. check_deploy_history before any rollback
	6. Remediate the ROOT CAUSE service, not the symptom
	7. After 6-8 diagnostic steps you MUST call declare_root_cause with your diagnosis.
	For P1-only tasks this ends the episode. For joint-mode tasks, follow it immediately
	with transition_to_phase2. Do NOT keep diagnosing indefinitely — commit to a conclusion.

	CRITICAL: Output ONLY valid JSON. No markdown. No explanation. No code blocks."""

	SYSTEM_PROMPT_P2 = """You are an expert SRE investigating a code-level fault in a sandboxed repository.

	## Phase 2 Action Space
	Output EXACTLY ONE valid JSON action per turn. No markdown, no explanation.

	Code Exploration:
	{"action_type": "list_dir", "parameters": {"path": "."}}
	{"action_type": "read_file", "parameters": {"path": "relative/path/to/file.py"}}
	{"action_type": "search_code", "parameters": {"query": "<search string>", "file_pattern": "*.py", "max_hits": 20}}
	{"action_type": "get_git_log", "parameters": {"path": ".", "n_commits": 15}}
	{"action_type": "get_file_diff", "parameters": {"commit_sha": "<sha>", "path": "relative/path/file.py"}}

	Terminal:
	{"action_type": "propose_patch", "parameters": {"diff": "<unified diff — minimal, correct, applies cleanly>"}}
	{"action_type": "declare_no_change", "parameters": {"reason": "<why no code fix is needed — infrastructure issue, not code>"}}

	## Investigation Strategy
	1. list_dir "." to understand project structure
	2. get_git_log to find recent commits — especially the bad_commit_sha from Phase 1 context
	3. get_file_diff on the suspicious commit SHA to see what changed
	4. read_file on affected files to understand the bug
	5. search_code to find related patterns or the fault injection site
	6. If you found a code bug: propose_patch with a minimal, syntactically valid unified diff.
	The bad_commit_sha in your context tells you exactly what changed — read that diff and revert/fix it.
	7. declare_no_change ONLY if Phase 1 confirmed a spurious alert / circuit-breaker false positive
	with no deployment or code change involved. If there IS a bad commit in the git log, propose_patch.

	CRITICAL: Output ONLY valid JSON. No markdown. No explanation. No code blocks."""


	# ──────────────────────────────────────────────────────────────────────────────
	# Observation Formatters
	# ──────────────────────────────────────────────────────────────────────────────

	def _fmt_service_statuses(statuses: Dict[str, str]) -> str:
	symbols = {"healthy": "✓", "degraded": "~", "down": "✗"}
	return " ".join(
	f"{symbols.get(v,'?')}{svc}({v})"
	for svc, v in sorted(statuses.items())
	)


	def _fmt_action_result(result: Any, max_chars: int = 3000) -> str:
	if result is None:
	return "(no result)"
	text = json.dumps(result, indent=2) if not isinstance(result, str) else result
	if len(text) > max_chars:
	text = text[:max_chars] + f"\n... [truncated {len(text)-max_chars} chars]"
	return text


	def format_initial_p1_obs(obs: dict, info: dict) -> str:
	"""Format the very first observation for Phase 1."""
	task = info.get("task_name", "unknown")
	pool = info.get("pool", "?")
	mode = info.get("mode", "unknown")
	phase = obs.get("current_phase", 1)

	svc_line = _fmt_service_statuses(obs.get("service_statuses", {}))
	valid = obs.get("valid_actions", P1_ACTIONS)

	return (
	f"INCIDENT RESPONSE \| Pool {pool} \| Mode: {mode} \| Task: {task}\n"
	f"{'─'*60}\n"
	f"Summary: {obs.get('incident_summary', 'No summary available')}\n"
	f"Severity: {obs.get('severity', '?')} \| "
	f"Time Budget: {obs.get('time_budget_minutes', '?')} min \| "
	f"Max Steps: {obs.get('max_steps', MAX_STEPS)}\n"
	f"Phase: {phase}\n"
	f"\nService Statuses:\n {svc_line}\n"
	f"Active Alerts: {obs.get('active_alerts_count', 0)}\n"
	f"\nValid Actions: {valid}\n"
	f"\nWhat is your FIRST action?"
	)


	def format_step_result_p1(obs: dict, reward: float) -> str:
	"""Format a step result during Phase 1."""
	svc_line = _fmt_service_statuses(obs.get("service_statuses", {}))
	result = _fmt_action_result(obs.get("action_result"))

	lines = [
	f"Action Result (success={obs.get('action_success', '?')}): "
	f"{obs.get('action_message', '')}",
	f"\n{result}",
	f"\n{'─'*40}",
	f"Services: {svc_line}",
	f"Alerts: {obs.get('active_alerts_count', 0)} active",
	f"Step: {obs.get('steps_taken','?')}/{obs.get('max_steps', MAX_STEPS)} "
	f"\| Time: {obs.get('time_elapsed_minutes','?')}/{obs.get('time_budget_minutes','?')} min",
	f"Reward: {reward:+.3f} \| Cumulative: {obs.get('cumulative_reward', 0):+.3f}",
	]
	if obs.get("bad_commit_sha"):
	lines.append(f"Bad Commit SHA: {obs['bad_commit_sha']} (remember for Phase 2)")

	valid = obs.get("valid_actions", P1_ACTIONS)
	lines.append(f"\nValid Actions: {valid}")
	lines.append("\nWhat is your next action?")
	return "\n".join(lines)


	def format_initial_p2_obs(obs: dict, info: dict, belief: Optional[dict]) -> str:
	"""Format the first Phase 2 observation (after transition or Pool B auto-start)."""
	task = info.get("task_name", "unknown")
	pool = info.get("pool", "?")

	belief_text = ""
	if belief:
	belief_text = (
	f"\n[Phase 1 Belief]\n"
	f" Suspected service: {belief.get('suspected_service', '?')}\n"
	f" Suspected fault: {belief.get('suspected_fault_class', '?')}\n"
	f" Service confidence: {belief.get('service_confidence', 0):.0%}\n"
	f" Fault confidence: {belief.get('fault_confidence', 0):.0%}\n"
	f" Reasoning: {belief.get('reasoning', '')}\n"
	f" P2 cost estimate: {belief.get('estimated_p2_cost', '?')}\n"
	)

	sha_line = (
	f"Bad Commit SHA: {obs.get('bad_commit_sha', '(check git log)')}\n"
	if obs.get("bad_commit_sha") else ""
	)

	return (
	f"CODE INVESTIGATION \| Pool {pool} \| Task: {task}\n"
	f"{'─'*60}\n"
	f"{sha_line}"
	f"{belief_text}\n"
	f"Step: {obs.get('steps_taken', 0)}/{obs.get('max_steps', MAX_STEPS)} "
	f"\| Cumulative Reward: {obs.get('cumulative_reward', 0):+.3f}\n"
	f"\nValid Actions: {obs.get('valid_actions', P2_ACTIONS)}\n"
	f"\nWhat is your first Phase 2 action?"
	)


	def format_step_result_p2(obs: dict, reward: float) -> str:
	"""Format a step result during Phase 2."""
	result = _fmt_action_result(obs.get("action_result"))

	lines = [
	f"Action Result (success={obs.get('action_success', '?')}): "
	f"{obs.get('action_message', '')}",
	f"\n{result}",
	f"\n{'─'*40}",
	f"Step: {obs.get('steps_taken','?')}/{obs.get('max_steps', MAX_STEPS)}",
	f"Reward: {reward:+.3f} \| Cumulative: {obs.get('cumulative_reward', 0):+.3f}",
	f"\nValid Actions: {obs.get('valid_actions', P2_ACTIONS)}",
	"\nWhat is your next action?",
	]
	return "\n".join(lines)


	# ──────────────────────────────────────────────────────────────────────────────
	# Message Builder
	# ──────────────────────────────────────────────────────────────────────────────

	def build_messages(
	history: List[Dict],
	initial_user_msg: str,
	phase: int,
	max_recent: int = 10,
	) -> List[Dict]:
	"""
	Build the full OpenAI-format messages list.
	history: [{"action_json": str, "result_text": str, "reward": float}, ...]
	max_recent caps how many turns are included to avoid context-length 422s.
	"""
	system = SYSTEM_PROMPT_P1 if phase == 1 else SYSTEM_PROMPT_P2
	messages: List[Dict] = [
	{"role": "system", "content": system},
	{"role": "user", "content": initial_user_msg},
	]
	for entry in history[-max_recent:]:
	messages.append({"role": "assistant", "content": entry["action_json"]})
	messages.append({"role": "user", "content": entry["result_text"]})
	return messages


	# ──────────────────────────────────────────────────────────────────────────────
	# Model Caller
	# ──────────────────────────────────────────────────────────────────────────────

	def call_model(
	messages: List[Dict],
	model: str,
	temperature: float = 0.5,
	max_tokens: int = 512,
	retries: int = 3,
	) -> str:
	if not HF_TOKEN:
	raise ValueError("HF_TOKEN is not set.")
	payload = {
	"model": model,
	"messages": messages,
	"max_tokens": max_tokens,
	"temperature": temperature,
	}
	last_exc: Exception = RuntimeError("No attempts made")
	for attempt in range(retries):
	try:
	resp = requests.post(
	HF_ROUTER_URL,
	headers={
	"Authorization": f"Bearer {HF_TOKEN}",
	"Content-Type": "application/json",
	},
	json=payload,
	timeout=90,
	)
	resp.raise_for_status()
	return resp.json()["choices"][0]["message"]["content"].strip()
	except requests.HTTPError as e:
	code = e.response.status_code if e.response is not None else 0
	if code in (400, 422):
	raise # client-format errors — retrying won't help; let caller handle
	last_exc = e
	wait = 2 ** attempt
	print(f" [model retry {attempt+1}/{retries}] {e} — waiting {wait}s")
	time.sleep(wait)
	except Exception as e:
	last_exc = e
	wait = 2 ** attempt
	print(f" [model retry {attempt+1}/{retries}] {e} — waiting {wait}s")
	time.sleep(wait)
	raise last_exc


	def _merge_system_into_user(messages: List[Dict]) -> List[Dict]:
	"""Fold system prompt into the first user message for models without system role."""
	if not messages or messages[0]["role"] != "system":
	return messages
	system_text = messages[0]["content"]
	rest = messages[1:]
	if not rest or rest[0]["role"] != "user":
	return rest
	merged_first = {"role": "user", "content": f"{system_text}\n\n{rest[0]['content']}"}
	return [merged_first] + rest[1:]


	# Models confirmed to reject the system role — merged format used from the start.
	_MODELS_NEEDING_MERGE: set = set()


	def call_model_adaptive(
	history: List[Dict],
	initial_msg: str,
	phase: int,
	model: str,
	temperature: float = 0.5,
	) -> str:
	"""
	Call model with two layers of fallback:
	400 (system role) → merge system into first user message and cache the result
	so every subsequent step skips the wasted probe.
	400 (after merge) → content still too long; halve history window.
	422 (ctx length) → halve history window.
	"""
	use_merge = model in _MODELS_NEEDING_MERGE
	probed_merge = use_merge # True = already confirmed in a prior step, no re-probe needed
	max_recent = 10
	while True:
	messages = build_messages(history, initial_msg, phase, max_recent=max_recent)
	if use_merge:
	messages = _merge_system_into_user(messages)
	try:
	result = call_model(messages, model=model, temperature=temperature)
	if use_merge and not probed_merge:
	# Merged succeeded for the first time — cache it
	_MODELS_NEEDING_MERGE.add(model)
	print(f" [merged format confirmed for {model.split('/')[1]}, cached]")
	return result
	except requests.HTTPError as e:
	code = e.response.status_code if e.response is not None else 0
	if code == 400 and not use_merge:
	use_merge = True
	probed_merge = False
	print(f" [400: probing merged format for {model.split('/')[1]}]")
	elif code in (400, 422) and max_recent > 1:
	max_recent = max(1, max_recent // 2)
	print(f" [ctx truncated to last {max_recent} turns]")
	else:
	raise


	# ──────────────────────────────────────────────────────────────────────────────
	# Action Parsers
	# ──────────────────────────────────────────────────────────────────────────────

	def _extract_json(raw: str) -> dict:
	"""Extract the first JSON object from model output and normalise colon-format action types.

	Some models output {"action_type": "check_metrics:api_gateway"} instead of
	the correct {"action_type": "check_metrics", "target_service": "api_gateway"}.
	Split and normalise so the environment never sees an invalid action_type.
	"""
	start = raw.find("{")
	end = raw.rfind("}") + 1
	if start == -1 or end == 0:
	raise ValueError("No JSON object in model output")
	action = json.loads(raw[start:end])
	atype = action.get("action_type", "")
	if ":" in atype:
	parts = atype.split(":", 1)
	action["action_type"] = parts[0]
	if parts[1] in ALL_SERVICES and "target_service" not in action:
	action["target_service"] = parts[1]
	return action


	def _recent_sigs(recent_actions: List[dict], n: int = 3) -> set:
	return {(a.get("action_type"), a.get("target_service")) for a in recent_actions[-n:]}


	def _diversify_p1(obs: dict, recent_actions: List[dict]) -> dict:
	"""Return the next logical diagnostic action that hasn't been done recently."""
	statuses = obs.get("service_statuses") or {}
	bad_svcs = [s for s, st in statuses.items() if st != "healthy"]
	used_sigs = _recent_sigs(recent_actions, n=4)
	used_svcs = {a.get("target_service") for a in recent_actions[-6:]}
	used_types = {a.get("action_type") for a in recent_actions[-4:]}

	# Build a uniform list of (score, action) tuples
	candidates: List[Tuple[int, dict]] = []
	for atype in P1_DIAGNOSTIC:
	if atype == "view_alerts":
	score = 0 if ("view_alerts", None) in used_sigs else 2
	candidates.append((score, {"action_type": "view_alerts"}))
	continue
	for svc in (bad_svcs or ALL_SERVICES):
	a: dict = {"action_type": atype, "target_service": svc}
	if atype == "query_logs":
	a["parameters"] = {"level": "ERROR", "limit": 20}
	already_used = (atype, svc) in used_sigs
	score = (not already_used) * 2 + (svc not in used_svcs) + (atype not in used_types)
	candidates.append((score, a))

	candidates.sort(key=lambda x: -x[0])
	if candidates:
	return candidates[0][1]
	# Last resort
	svc = next((s for s in ALL_SERVICES if s not in used_svcs), random.choice(ALL_SERVICES))
	return {"action_type": "query_logs", "target_service": svc,
	"parameters": {"level": "ERROR", "limit": 20}}


	def parse_p1_action(raw: str, step: int, obs: dict, recent_actions: Optional[List[dict]] = None) -> dict:
	"""Parse Phase 1 action with smart fallbacks and anti-repetition."""
	recent_actions = recent_actions or []
	valid = set(obs.get("valid_actions") or P1_ACTIONS)
	try:
	action = _extract_json(raw)
	atype = action.get("action_type", "")

	if atype not in valid:
	action = _diversify_p1(obs, recent_actions) if step > 0 else {"action_type": "view_alerts"}
	atype = action["action_type"]

	# Ensure target_service for targeted actions
	if atype in TARGETED_ACTIONS:
	if "target_service" not in action or action["target_service"] not in ALL_SERVICES:
	svcs = obs.get("available_services") or ALL_SERVICES
	action["target_service"] = random.choice(svcs)

	# Anti-repetition: if this exact (type, service) was used recently, diversify
	sig = (action.get("action_type"), action.get("target_service"))
	if sig in _recent_sigs(recent_actions, n=2) and atype not in ("declare_root_cause", "transition_to_phase2"):
	action = _diversify_p1(obs, recent_actions)
	atype = action["action_type"]

	# Validate transition_to_phase2 belief structure
	if atype == "transition_to_phase2":
	params = action.setdefault("parameters", {})
	belief = params.setdefault("belief", {})
	degraded = [s for s, st in (obs.get("service_statuses") or {}).items()
	if st != "healthy"]
	belief.setdefault("suspected_service", degraded[0] if degraded else random.choice(ALL_SERVICES))
	belief.setdefault("suspected_fault_class", "memory_leak")
	belief.setdefault("service_confidence", 0.7)
	belief.setdefault("fault_confidence", 0.65)
	belief.setdefault("evidence_gaps", [])
	belief.setdefault("estimated_p2_cost", "medium")
	belief.setdefault("decision", "transition")
	belief.setdefault("reasoning", "Transitioning based on collected evidence")

	return action

	except Exception:
	if step == 0:
	return {"action_type": "view_alerts"}
	return _diversify_p1(obs, recent_actions)


	def _force_p1_terminal(obs: dict) -> dict:
	"""Build a best-effort terminal action from observed state."""
	valid = set(obs.get("valid_actions") or P1_ACTIONS)
	statuses = obs.get("service_statuses") or {}
	degraded = [s for s, st in statuses.items() if st != "healthy"]

	if "transition_to_phase2" in valid:
	svc = degraded[0] if degraded else random.choice(ALL_SERVICES)
	return {
	"action_type": "transition_to_phase2",
	"parameters": {"belief": {
	"suspected_service": svc,
	"suspected_fault_class": "memory_leak",
	"service_confidence": 0.5,
	"fault_confidence": 0.5,
	"evidence_gaps": ["forced_terminal_after_step_limit"],
	"estimated_p2_cost": "medium",
	"decision": "transition",
	"reasoning": f"Forced transition: degraded={degraded}",
	}},
	}
	cause = (f"Degradation detected in: {', '.join(degraded)}"
	if degraded else "Root cause undetermined within step budget")
	return {"action_type": "declare_root_cause",
	"parameters": {"root_cause": cause}}


	def parse_p2_action(raw: str, step: int, obs: dict, recent_actions: Optional[List[dict]] = None) -> dict:
	"""Parse Phase 2 action with smart fallbacks and anti-repetition."""
	recent_actions = recent_actions or []
	valid = set(obs.get("valid_actions") or P2_ACTIONS)
	used_sigs = _recent_sigs(recent_actions, n=3)

	p2_fallback_sequence = [
	{"action_type": "list_dir", "parameters": {"path": "."}},
	{"action_type": "get_git_log", "parameters": {"path": ".", "n_commits": 15}},
	{"action_type": "search_code", "parameters": {"query": "error", "file_pattern": "*.py", "max_hits": 15}},
	{"action_type": "search_code", "parameters": {"query": "def ", "file_pattern": "*.py", "max_hits": 10}},
	{"action_type": "list_dir", "parameters": {"path": "src"}},
	]

	try:
	action = _extract_json(raw)
	atype = action.get("action_type", "")

	if atype not in valid:
	action = p2_fallback_sequence[step % len(p2_fallback_sequence)]
	atype = action["action_type"]

	# Ensure required params
	params = action.setdefault("parameters", {})
	if atype == "list_dir":
	params.setdefault("path", ".")
	elif atype == "read_file":
	params.setdefault("path", ".")
	elif atype == "search_code":
	params.setdefault("query", "error")
	params.setdefault("file_pattern", "*.py")
	params.setdefault("max_hits", 15)
	elif atype == "get_git_log":
	params.setdefault("path", ".")
	params.setdefault("n_commits", 10)
	elif atype == "get_file_diff":
	sha = obs.get("bad_commit_sha") or "HEAD"
	params.setdefault("commit_sha", sha)
	params.setdefault("path", ".")
	elif atype == "propose_patch" and "diff" not in params:
	action = {"action_type": "declare_no_change",
	"parameters": {"reason": "Unable to determine code fix from available evidence"}}
	elif atype == "declare_no_change":
	params.setdefault("reason", "No code-level fix required based on investigation")

	# Anti-repetition for non-terminal actions
	sig = (action.get("action_type"), str(action.get("parameters", {}).get("path", "")))
	if sig in used_sigs and atype not in P2_TERMINAL:
	action = p2_fallback_sequence[(step + len(recent_actions)) % len(p2_fallback_sequence)]

	return action

	except Exception:
	return p2_fallback_sequence[step % len(p2_fallback_sequence)]


	# ──────────────────────────────────────────────────────────────────────────────
	# Environment HTTP Helpers
	# ──────────────────────────────────────────────────────────────────────────────

	def _mask_p1_obs(obs: dict, pool: str) -> dict:
	"""Pool A is p1_only — remove transition_to_phase2 the server incorrectly exposes."""
	if pool == "A" and obs.get("valid_actions"):
	obs = dict(obs)
	obs["valid_actions"] = [a for a in obs["valid_actions"] if a != "transition_to_phase2"]
	return obs


	def env_reset(task_name: str, pool: str, seed: Optional[int] = None) -> dict:
	body: dict = {"task_name": task_name, "pool": pool}
	if seed is not None:
	body["seed"] = seed
	resp = requests.post(f"{BASE_URL}/reset", json=body, timeout=30)
	resp.raise_for_status()
	return resp.json()


	def env_step(action: dict) -> dict:
	resp = requests.post(f"{BASE_URL}/step", json=action, timeout=30)
	resp.raise_for_status()
	return resp.json()


	def env_score(declared_patch: Optional[str], declared_no_change: bool,
	belief_history: List[dict]) -> dict:
	"""Fetch unified grader scores for the completed episode."""
	try:
	resp = requests.post(
	f"{BASE_URL}/score",
	json={
	"declared_patch": declared_patch,
	"declared_no_change": declared_no_change,
	"belief_history": belief_history,
	},
	timeout=30,
	)
	resp.raise_for_status()
	return resp.json()
	except Exception as e:
	print(f" [score] {e}")
	return {}


	def env_get_trajectory() -> dict:
	"""Fetch the full trajectory from the server."""
	try:
	resp = requests.get(f"{BASE_URL}/trajectory", timeout=30)
	resp.raise_for_status()
	return resp.json()
	except Exception:
	return {}


	# ──────────────────────────────────────────────────────────────────────────────
	# Episode Runner
	# ──────────────────────────────────────────────────────────────────────────────

	def run_episode(
	task_name: str,
	pool: str,
	model: str,
	episode_id: int,
	seed: Optional[int] = None,
	) -> dict:
	"""
	Run one full episode (Phase 1, Phase 2, or Joint) through the HTTP API.

	Returns a rich episode record including:
	- step records with (action, raw_model_output, observation, reward)
	- final score breakdown from /score
	- SFT-ready message sequences per step
	"""
	print(f"\n{'═'*60}")
	print(f" Ep {episode_id+1:>3} \| Pool {pool} \| Task: {task_name}")
	print(f" Model: {model}")
	print(f"{'─'*60}")

	reset_resp = env_reset(task_name, pool, seed)
	obs = _mask_p1_obs(reset_resp.get("observation", {}), pool)
	info = reset_resp.get("info", {})
	initial_phase = obs.get("current_phase", 1)

	# Tracks for the episode
	p1_steps: List[dict] = []
	p2_steps: List[dict] = []
	belief_history: List[dict] = []
	declared_patch: Optional[str] = None
	declared_no_change: bool = False

	# Conversation history per phase (for message building)
	p1_history: List[dict] = []
	p2_history: List[dict] = []
	last_belief: Optional[dict] = None

	# Recent actions (flattened, for anti-repetition)
	recent_actions: List[dict] = []
	consecutive_errors = [0] # consecutive model call failures
	consecutive_negative = [0] # consecutive negative-reward steps (patience)

	# Initial user messages
	initial_p1_msg = format_initial_p1_obs(obs, info)
	initial_p2_msg: Optional[str] = None # set on transition

	current_phase = initial_phase
	done = False

	for step_idx in range(MAX_STEPS):
	if done:
	break

	# Pool B: transition to phase 2 only if the env actually started in phase 1.
	# If the env auto-transitioned during reset, current_phase is already 2 — skip.
	if pool == "B" and current_phase == 1 and len(p1_steps) == 0 and "transition_to_phase2" in (obs.get("valid_actions") or []):
	raw = "{}"
	action = {
	"action_type": "transition_to_phase2",
	"parameters": {"belief": {
	"suspected_service": None,
	"suspected_fault_class": None,
	"service_confidence": 0.0,
	"fault_confidence": 0.0,
	"evidence_gaps": [],
	"estimated_p2_cost": "unknown",
	"decision": "transition",
	"reasoning": "Pool B: oracle belief injected by environment",
	}},
	}
	else:
	# Hard ceiling: force terminal if too close to max_steps
	p1_hard_limit = MAX_STEPS - 8 if pool in ("C", "D") else MAX_STEPS - 3
	if current_phase == 1 and step_idx >= p1_hard_limit:
	action = _force_p1_terminal(obs)
	raw = json.dumps(action)
	print(f" [step limit: forcing terminal]")
	else:
	# Call model with adaptive history truncation on 422
	cur_history = p1_history if current_phase == 1 else p2_history
	cur_initial = initial_p1_msg if current_phase == 1 else (
	initial_p2_msg or format_initial_p2_obs(obs, info, last_belief)
	)
	if current_phase == 2 and initial_p2_msg is None:
	initial_p2_msg = cur_initial

	model_ok = True
	try:
	raw = call_model_adaptive(cur_history, cur_initial, current_phase, model)
	except Exception as e:
	print(f" [model error] {e}")
	raw = "{}"
	model_ok = False

	# If model keeps failing in P1, force terminal after 8 consecutive errors
	if not model_ok:
	consecutive_errors[0] += 1
	else:
	consecutive_errors[0] = 0

	if current_phase == 1 and consecutive_errors[0] >= 8:
	action = _force_p1_terminal(obs)
	raw = json.dumps(action)
	consecutive_errors[0] = 0
	print(f" [8 consecutive model errors: forcing terminal]")
	elif current_phase == 1:
	action = parse_p1_action(raw, step_idx, obs, recent_actions)
	else:
	action = parse_p2_action(raw, step_idx, obs, recent_actions)

	print(f" step {step_idx+1:>2} \| ph{current_phase} \| {action.get('action_type')}"
	+ (f"({action.get('target_service','')})" if action.get("target_service") else ""))

	# Track terminal/transition actions before stepping
	atype = action.get("action_type", "")
	if atype == "transition_to_phase2":
	belief = action.get("parameters", {}).get("belief", {})
	last_belief = belief
	belief_history.append(belief)
	if atype == "propose_patch":
	declared_patch = action.get("parameters", {}).get("diff", "")
	if atype == "declare_no_change":
	declared_no_change = True

	# Step environment
	try:
	step_resp = env_step(action)
	except Exception as e:
	print(f" [env error] {e}")
	break

	reward = float(step_resp.get("reward", 0.0))
	done = step_resp.get("done", False)
	new_obs = step_resp.get("observation", {})
	new_phase = new_obs.get("current_phase", current_phase)

	print(f" reward={reward:+.3f} cumulative={new_obs.get('cumulative_reward', 0):+.3f}"
	+ (" DONE" if done else ""))

	# Build result text for next turn
	if current_phase == 1:
	result_text = format_step_result_p1(new_obs, reward)
	else:
	result_text = format_step_result_p2(new_obs, reward)

	step_record = {
	"step": step_idx,
	"phase": current_phase,
	"action": action,
	"raw_output": raw,
	"observation": new_obs,
	"reward": reward,
	"result_text": result_text, # stored for SFT building
	}

	if current_phase == 1:
	p1_steps.append(step_record)
	p1_history.append({"action_json": json.dumps(action), "result_text": result_text})
	else:
	p2_steps.append(step_record)
	p2_history.append({"action_json": json.dumps(action), "result_text": result_text})

	recent_actions.append(action)

	# Patience: 10 consecutive negative rewards → force terminal immediately
	if reward < 0:
	consecutive_negative[0] += 1
	else:
	consecutive_negative[0] = 0

	if consecutive_negative[0] >= 10 and not done:
	print(f" [patience exhausted: 10 consecutive negatives — forcing terminal]")
	if current_phase == 1:
	term_action = _force_p1_terminal(new_obs)
	else:
	term_action = {"action_type": "declare_no_change",
	"parameters": {"reason": "Patience exhausted — no progress detected"}}
	try:
	term_resp = env_step(term_action)
	term_reward = float(term_resp.get("reward", 0.0))
	done = term_resp.get("done", False)
	term_obs = term_resp.get("observation", new_obs)
	print(f" [forced terminal] reward={term_reward:+.3f} cumulative={term_obs.get('cumulative_reward',0):+.3f} DONE")
	steps_list = p1_steps if current_phase == 1 else p2_steps
	steps_list.append({"step": step_idx + 1, "phase": current_phase,
	"action": term_action, "raw_output": "{}",
	"observation": term_obs, "reward": term_reward,
	"result_text": ""})
	new_obs = term_obs
	except Exception as e:
	print(f" [forced terminal env error] {e}")
	break

	# Detect phase transition
	if new_phase != current_phase and new_phase == 2:
	print(" ── Phase 1 → Phase 2 ──")
	initial_p2_msg = format_initial_p2_obs(new_obs, info, last_belief)
	recent_actions.clear() # reset repetition tracking for the new phase
	consecutive_negative[0] = 0 # reset patience on phase change

	current_phase = new_phase
	obs = _mask_p1_obs(new_obs, pool)
	time.sleep(SLEEP_BETWEEN)

	# Fetch unified scores
	score = env_score(declared_patch, declared_no_change, belief_history)
	cumulative = obs.get("cumulative_reward", 0.0)

	print(f" Final cumulative reward: {cumulative:.3f}")
	if score:
	print(f" Scores: {json.dumps({k: round(v, 3) for k, v in score.items()})}")

	return {
	"episode_id": episode_id,
	"task_name": task_name,
	"pool": pool,
	"model": model,
	"seed": seed,
	"p1_steps": p1_steps,
	"p2_steps": p2_steps,
	"num_p1_steps": len(p1_steps),
	"num_p2_steps": len(p2_steps),
	"cumulative_reward": round(cumulative, 4),
	"score_breakdown": score,
	"declared_patch": declared_patch,
	"declared_no_change": declared_no_change,
	"belief_history": belief_history,
	"done": done,
	# Reconstructed conversation contexts for SFT building
	"_initial_p1_msg": initial_p1_msg,
	"_initial_p2_msg": initial_p2_msg,
	"_p1_history": p1_history,
	"_p2_history": p2_history,
	}


	# ──────────────────────────────────────────────────────────────────────────────
	# SFT Dataset Formatter
	# ──────────────────────────────────────────────────────────────────────────────

	def episode_to_sft_samples(ep: dict) -> List[dict]:
	"""
	Convert one episode into per-step SFT samples for BOTH phases.

	ALL steps are included regardless of reward — negative-reward steps
	provide hard-negative signal critical for RL/preference training.
	The `reward` field is preserved so the training code can filter or weight.
	"""
	samples: List[dict] = []

	def _extract_samples(steps, phase, initial_msg, history_key):
	history_so_far: List[dict] = []
	for i, step_rec in enumerate(steps):
	system = SYSTEM_PROMPT_P1 if phase == 1 else SYSTEM_PROMPT_P2
	messages = build_messages(history_so_far, initial_msg, phase=phase)
	messages.append({
	"role": "assistant",
	"content": json.dumps(step_rec["action"]),
	})
	samples.append({
	"messages": messages,
	"reward": step_rec["reward"],
	"phase": phase,
	"action_type": step_rec["action"].get("action_type"),
	"task_name": ep["task_name"],
	"pool": ep["pool"],
	"model": ep["model"],
	"episode_id": ep["episode_id"],
	"step": i,
	})
	history_so_far.append({
	"action_json": json.dumps(step_rec["action"]),
	"result_text": step_rec.get("result_text", ""),
	})

	if ep.get("p1_steps") and ep.get("_initial_p1_msg"):
	_extract_samples(ep["p1_steps"], 1, ep["_initial_p1_msg"], "_p1_history")

	if ep.get("p2_steps") and ep.get("_initial_p2_msg"):
	_extract_samples(ep["p2_steps"], 2, ep["_initial_p2_msg"], "_p2_history")

	return samples


	# ──────────────────────────────────────────────────────────────────────────────
	# GRPO / DPO Dataset Formatter
	# ──────────────────────────────────────────────────────────────────────────────

	def episodes_to_grpo_pairs(episodes: List[dict]) -> List[dict]:
	"""
	Build (prompt, chosen, rejected) triplets for GRPO/DPO training.

	Three pairing strategies:
	1. Within-episode: best vs worst step (same prompt context)
	2. Cross-episode: same task+pool, different models, different outcomes
	3. Phase-specific: separate within-phase pairs for P2

	Chosen = action with higher reward.
	Rejected = action with lower reward.
	Both are kept regardless of absolute reward sign.
	"""
	pairs: List[dict] = []

	# ── Strategy 1: within-episode best/worst per phase ───────────────────────
	for ep in episodes:
	for phase, steps, initial_msg in [
	(1, ep.get("p1_steps", []), ep.get("_initial_p1_msg", "")),
	(2, ep.get("p2_steps", []), ep.get("_initial_p2_msg", "")),
	]:
	if len(steps) < 2 or not initial_msg:
	continue
	best = max(steps, key=lambda s: s["reward"])
	worst = min(steps, key=lambda s: s["reward"])
	if best["reward"] == worst["reward"]:
	continue
	if best is worst:
	continue

	prompt_msgs = build_messages([], initial_msg, phase=phase)
	pairs.append({
	"prompt": prompt_msgs,
	"chosen": json.dumps(best["action"]),
	"rejected": json.dumps(worst["action"]),
	"chosen_reward": best["reward"],
	"rejected_reward": worst["reward"],
	"margin": best["reward"] - worst["reward"],
	"task_name": ep["task_name"],
	"pool": ep["pool"],
	"phase": phase,
	"strategy": "within_episode",
	"episode_id": ep["episode_id"],
	})

	# ── Strategy 2: cross-episode, same task+pool ─────────────────────────────
	by_task_pool: Dict[str, List[dict]] = defaultdict(list)
	for ep in episodes:
	key = f"{ep['task_name']}_{ep['pool']}"
	by_task_pool[key].append(ep)

	for key, task_eps in by_task_pool.items():
	if len(task_eps) < 2:
	continue
	# Sort by cumulative reward; pair best vs worst episode
	sorted_eps = sorted(task_eps, key=lambda e: e["cumulative_reward"])
	best_ep = sorted_eps[-1]
	worst_ep = sorted_eps[0]
	if best_ep["cumulative_reward"] == worst_ep["cumulative_reward"]:
	continue
	if best_ep["episode_id"] == worst_ep["episode_id"]:
	continue

	# Use the first non-view_alerts action as representative
	def _first_substantive_action(ep_inner, phase):
	steps = ep_inner.get(f"p{phase}_steps", [])
	for s in steps:
	if s["action"].get("action_type") != "view_alerts":
	return s
	return steps[0] if steps else None

	for phase in [1, 2]:
	best_step = _first_substantive_action(best_ep, phase)
	worst_step = _first_substantive_action(worst_ep, phase)
	initial_msg = best_ep.get(f"_initial_p{phase}_msg", "")
	if not best_step or not worst_step or not initial_msg:
	continue

	prompt_msgs = build_messages([], initial_msg, phase=phase)
	pairs.append({
	"prompt": prompt_msgs,
	"chosen": json.dumps(best_step["action"]),
	"rejected": json.dumps(worst_step["action"]),
	"chosen_reward": best_ep["cumulative_reward"],
	"rejected_reward": worst_ep["cumulative_reward"],
	"margin": best_ep["cumulative_reward"] - worst_ep["cumulative_reward"],
	"task_name": best_ep["task_name"],
	"pool": best_ep["pool"],
	"phase": phase,
	"strategy": "cross_episode",
	"best_model": best_ep["model"],
	"worst_model": worst_ep["model"],
	})

	return pairs


	# ──────────────────────────────────────────────────────────────────────────────
	# Episode Schedule Builder
	# ──────────────────────────────────────────────────────────────────────────────

	def build_episode_schedule(n: int) -> List[Tuple[str, str, str, int]]:
	"""
	Return list of (task_name, pool, model, seed) tuples.

	Distribution:
	- Pools weighted by POOL_WEIGHTS
	- Tasks within each pool: round-robin
	- Models: round-robin across all MODELS
	- Seeds: random per episode (for reproducibility, logged in output)
	"""
	schedule: List[Tuple[str, str, str, int]] = []
	pool_counts = {
	pool: max(1, round(n * weight))
	for pool, weight in POOL_WEIGHTS.items()
	}
	# Adjust to exactly n
	total = sum(pool_counts.values())
	diff = n - total
	if diff > 0:
	pool_counts["C"] += diff
	elif diff < 0:
	pool_counts["A"] += diff # reduce A if over

	model_idx = 0
	for pool, count in pool_counts.items():
	tasks = POOL_TASKS[pool]
	for i in range(count):
	task = tasks[i % len(tasks)]
	model = MODELS[model_idx % len(MODELS)]
	seed = random.randint(0, 99999)
	schedule.append((task, pool, model, seed))
	model_idx += 1

	random.shuffle(schedule)
	return schedule


	# ──────────────────────────────────────────────────────────────────────────────
	# Main
	# ──────────────────────────────────────────────────────────────────────────────

	def _flush_episode(ep: dict, raw_f, sft_f) -> Tuple[int, int, int]:
	"""Append one episode to the open raw and SFT files. Returns (pos, zer, neg) step counts."""
	clean = {k: v for k, v in ep.items() if not k.startswith("_")}
	raw_f.write(json.dumps(clean) + "\n")
	raw_f.flush()

	samples = episode_to_sft_samples(ep)
	for s in samples:
	sft_f.write(json.dumps(s) + "\n")
	sft_f.flush()

	pos = sum(1 for s in samples if s["reward"] > 0)
	zer = sum(1 for s in samples if s["reward"] == 0)
	neg = sum(1 for s in samples if s["reward"] < 0)
	return pos, zer, neg


	def _finalize(all_episodes: List[dict], stats: Dict[str, List[float]]) -> None:
	"""Generate GRPO pairs and print final statistics from whatever was collected."""
	print(f"\n{'═'*60}")
	print(f"✅ Collected {len(all_episodes)} episodes")

	grpo_pairs = episodes_to_grpo_pairs(all_episodes)
	grpo_path = "sre_grpo_dataset.jsonl"
	with open(grpo_path, "w") as f:
	for p in grpo_pairs:
	f.write(json.dumps(p) + "\n")
	within = sum(1 for p in grpo_pairs if p["strategy"] == "within_episode")
	cross = sum(1 for p in grpo_pairs if p["strategy"] == "cross_episode")
	print(f"💾 GRPO dataset ({len(grpo_pairs)} pairs) → {grpo_path}")
	print(f" Pairs: {within} within-episode + {cross} cross-episode")

	if not all_episodes:
	return

	all_rewards = [ep["cumulative_reward"] for ep in all_episodes]
	print(f"\n📈 Reward statistics:")
	print(f" Overall: avg={sum(all_rewards)/len(all_rewards):.3f} "
	f"max={max(all_rewards):.3f} min={min(all_rewards):.3f}")

	print(f"\n By pool:")
	for pool in ["A", "B", "C", "D"]:
	rs = stats.get(f"pool_{pool}", [])
	if rs:
	print(f" Pool {pool}: n={len(rs):>3} avg={sum(rs)/len(rs):.3f} "
	f"max={max(rs):.3f} min={min(rs):.3f}")

	print(f"\n By task:")
	for task in sorted(set(ep["task_name"] for ep in all_episodes)):
	rs = stats.get(f"task_{task}", [])
	if rs:
	print(f" {task:<35} n={len(rs):>2} avg={sum(rs)/len(rs):.3f}")

	print(f"\n By model tier:")
	model_short_names = set()
	for ep in all_episodes:
	model_short_names.add(ep["model"].split("/")[1].split(":")[0])
	for mname in sorted(model_short_names):
	rs = stats.get(f"model_{mname}", [])
	if rs:
	print(f" {mname:<40} n={len(rs):>2} avg={sum(rs)/len(rs):.3f}")


	def main():
	from huggingface_hub import HfApi
	api = HfApi(token=HF_TOKEN)
	api.create_repo(repo_id="srinjoyd/sre-data", repo_type="dataset", exist_ok=True)
	if not HF_TOKEN:
	print("❌ HF_TOKEN is not set.\n export HF_TOKEN=hf_...")
	return

	print(f"🚀 SRE Trajectory Collector")
	print(f" Episodes: {NUM_EPISODES}")
	print(f" Models: {len(MODELS)} (rotating)")
	print(f" Tasks: {len(set(t for ts in POOL_TASKS.values() for t in ts))} unique")
	print(f" Pools: A / B / C / D")
	print(f" Base URL: {BASE_URL}")
	print(f" Keeping ALL episodes (negative reward = hard negatives for RL)")
	print(f" Saving incrementally — Ctrl+C safe\n")

	schedule = build_episode_schedule(NUM_EPISODES)
	all_episodes: List[dict] = []
	stats: Dict[str, List[float]] = defaultdict(list)
	total_pos = total_zer = total_neg = 0

	raw_path = "sre_raw_trajectories.jsonl"
	sft_path = "sre_sft_dataset.jsonl"
	print(f"💾 Writing to: {raw_path} \| {sft_path} (appending per episode)")
	print(f" GRPO pairs written at end (or on Ctrl+C)\n")

	with open(raw_path, "a") as raw_f, open(sft_path, "a") as sft_f:
	try:
	for ep_id, (task, pool, model, seed) in enumerate(schedule):
	try:
	ep = run_episode(task, pool, model, ep_id, seed=seed)
	except Exception as e:
	print(f" [!] Episode {ep_id+1} FAILED: {e}")
	traceback.print_exc()
	time.sleep(2)
	continue

	all_episodes.append(ep)
	pos, zer, neg = _flush_episode(ep, raw_f, sft_f)
	upload_checkpoint(api, "srinjoyd/sre-data")
	total_pos += pos
	total_zer += zer
	total_neg += neg

	r = ep["cumulative_reward"]
	stats[f"pool_{pool}"].append(r)
	stats[f"task_{task}"].append(r)
	stats[f"model_{model.split('/')[1].split(':')[0]}"].append(r)

	print(f" [saved] ep {ep_id+1}/{NUM_EPISODES} \| "
	f"SFT steps so far: +{total_pos}/0:{total_zer}/-{total_neg}")
	time.sleep(1.0)

	except KeyboardInterrupt:
	print(f"\n\n⚠️ Interrupted after {len(all_episodes)} episodes — saving what we have...")
	finally:
	upload_checkpoint(api, "srinjoyd/sre-data") # always runs, even on crash
	_finalize(all_episodes, stats)

	print(f"\n💾 Raw trajectories ({len(all_episodes)} eps) → {raw_path}")
	print(f"💾 SFT dataset ({total_pos+total_zer+total_neg} steps) → {sft_path}")
	print(f" Reward split: +{total_pos} / 0:{total_zer} / -{total_neg}")

	_finalize(all_episodes, stats)

	print(f"\n💡 To upload to HuggingFace Hub:")
	print(f" from datasets import Dataset")
	print(f" import json")
	print(f" sft = [json.loads(l) for l in open('sre_sft_dataset.jsonl')]")
	print(f" grpo = [json.loads(l) for l in open('sre_grpo_dataset.jsonl')]")
	print(f" Dataset.from_list(sft).push_to_hub('your-username/sre-sft-data')")
	print(f" Dataset.from_list(grpo).push_to_hub('your-username/sre-grpo-data')")


	if __name__ == "__main__":
	main()