"""Format RouterCore JSONL data for causal-LM instruction tuning.

Each input record is expected to provide ``id``, ``input`` and ``expected``
keys.  Every record is rewritten as ``{"id": ..., "text": ...}`` where
``text`` is the inference prompt immediately followed by the expected
answer serialized as JSON.
"""

from __future__ import annotations

import argparse
import json
from pathlib import Path
from typing import Any

# The directory one level above the directory that contains this file.
PROJECT_ROOT = Path(__file__).resolve().parents[1]

DEFAULT_TRAIN_INPUT = PROJECT_ROOT / "data" / "train.jsonl"
DEFAULT_EVAL_INPUT = PROJECT_ROOT / "data" / "eval.jsonl"
DEFAULT_TRAIN_OUTPUT = PROJECT_ROOT / "data" / "routercore_train_instruct.jsonl"
DEFAULT_EVAL_OUTPUT = PROJECT_ROOT / "data" / "routercore_eval_instruct.jsonl"

# NOTE(review): the prompt text is reproduced exactly as it appears in the
# reviewed source, with single spaces between sentences.  If line breaks were
# intended between the sections, confirm against the intended layout before
# training, because this text is part of every generated example.
SYSTEM_PROMPT = (
    "You are RouterCore, a routing model for DevOps agent workflows. "
    "Return only valid JSON matching the RouterCore schema. "
    "Do not include markdown, explanations, or code fences. "
    "Allowed statuses: routed, needs_clarification, requires_confirmation, "
    "rejected, fallback "
    "Allowed workflows: create_web_app, create_storage_bucket, "
    "create_service_account, grant_iam_role, create_scheduler_job "
    "Required JSON fields: status, workflow, confidence, parameters, "
    "missing_fields, candidate_workflows, failure_reasons, clarifying_question"
)


def build_inference_prompt(user_request: str) -> str:
    """Return the prompt that precedes the model's JSON answer.

    Args:
        user_request: The raw request text to embed in the prompt.

    Returns:
        ``SYSTEM_PROMPT``, the user request and the ``RouterCore JSON:``
        cue, ending with a trailing space so the answer can be appended
        directly.
    """
    return f"{SYSTEM_PROMPT} User request: {user_request} RouterCore JSON: "


def build_training_prompt(user_request: str, expected: dict[str, Any]) -> str:
    """Return the inference prompt followed by the expected JSON answer.

    Args:
        user_request: The raw request text to embed in the prompt.
        expected: The target answer; serialized with sorted keys so the
            same mapping always produces the same text.

    Returns:
        The full training text for one example.
    """
    expected_json = json.dumps(expected, sort_keys=True)
    return f"{build_inference_prompt(user_request)}{expected_json}"


def load_jsonl(path: Path) -> list[dict[str, Any]]:
    """Load every non-blank line of a UTF-8 JSONL file.

    Args:
        path: The file to read.

    Returns:
        The decoded records, in file order.  Lines that are empty or contain
        only whitespace are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.  The
            message is prefixed with ``path:line`` to locate the record in
            the file; the trailing line/column that the exception appends is
            relative to that single record.
    """
    rows: list[dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as handle:
        for line_number, line in enumerate(handle, start=1):
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError as err:
                # Re-raise the same exception type so existing handlers keep
                # working, but say which file and line held the bad record.
                raise json.JSONDecodeError(
                    f"{path}:{line_number}: invalid JSON record ({err.msg})",
                    err.doc,
                    err.pos,
                ) from err
            rows.append(record)
    return rows


def write_jsonl(path: Path, rows: list[dict[str, Any]]) -> None:
    """Write ``rows`` to ``path`` as UTF-8 JSONL, one sorted-key object per line.

    Missing parent directories are created.  Every record is terminated by a
    newline, so an empty ``rows`` produces an empty file rather than a file
    holding a single blank line.

    Args:
        path: The destination file; overwritten if it exists.
        rows: The records to serialize.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = "".join(json.dumps(row, sort_keys=True) + "\n" for row in rows)
    path.write_text(payload, encoding="utf-8")


def format_rows(rows: list[dict[str, Any]]) -> list[dict[str, str]]:
    """Convert raw records into ``{"id", "text"}`` training examples.

    Args:
        rows: Records that each provide ``id``, ``input`` and ``expected``.

    Returns:
        One example per record, in the same order, with ``text`` built by
        ``build_training_prompt``.

    Raises:
        KeyError: If a record lacks one of the required keys.
    """
    return [
        {
            "id": row["id"],
            "text": build_training_prompt(row["input"], row["expected"]),
        }
        for row in rows
    ]


def format_dataset(
    train_input: Path = DEFAULT_TRAIN_INPUT,
    eval_input: Path = DEFAULT_EVAL_INPUT,
    train_output: Path = DEFAULT_TRAIN_OUTPUT,
    eval_output: Path = DEFAULT_EVAL_OUTPUT,
) -> tuple[Path, Path]:
    """Format the train and eval splits and write them to disk.

    Both inputs are loaded and formatted before anything is written, so a
    malformed input file does not leave a half-updated pair of outputs.

    Args:
        train_input: Raw training JSONL file.
        eval_input: Raw evaluation JSONL file.
        train_output: Destination for the formatted training examples.
        eval_output: Destination for the formatted evaluation examples.

    Returns:
        ``(train_output, eval_output)``.
    """
    train_rows = format_rows(load_jsonl(train_input))
    eval_rows = format_rows(load_jsonl(eval_input))
    write_jsonl(train_output, train_rows)
    write_jsonl(eval_output, eval_rows)
    return train_output, eval_output


def parse_args() -> argparse.Namespace:
    """Parse the command-line options; every path defaults to its constant."""
    parser = argparse.ArgumentParser(
        description="Format RouterCore JSONL data for causal-LM instruction tuning."
    )
    parser.add_argument("--train-input", type=Path, default=DEFAULT_TRAIN_INPUT)
    parser.add_argument("--eval-input", type=Path, default=DEFAULT_EVAL_INPUT)
    parser.add_argument("--train-output", type=Path, default=DEFAULT_TRAIN_OUTPUT)
    parser.add_argument("--eval-output", type=Path, default=DEFAULT_EVAL_OUTPUT)
    return parser.parse_args()


def main() -> None:
    """Run the formatter from the command line and report the output paths."""
    args = parse_args()
    train_output, eval_output = format_dataset(
        train_input=args.train_input,
        eval_input=args.eval_input,
        train_output=args.train_output,
        eval_output=args.eval_output,
    )
    print(f"Wrote instruction train data to {train_output}")
    print(f"Wrote instruction eval data to {eval_output}")


if __name__ == "__main__":
    main()