# routercore/training/format_dataset.py
# Jayteare's picture
# Deploy RouterCore Gradio demo
# 1137e50 verified
from __future__ import annotations
import argparse
import json
from pathlib import Path
from typing import Any
# Directory two levels above this file (the parent of the folder holding it).
PROJECT_ROOT = Path(__file__).resolve().parents[1]

# Default raw JSONL inputs, located in the project's "data" directory.
DEFAULT_TRAIN_INPUT = PROJECT_ROOT.joinpath("data", "train.jsonl")
DEFAULT_EVAL_INPUT = PROJECT_ROOT.joinpath("data", "eval.jsonl")

# Default destinations for the formatted instruction-tuning JSONL files.
DEFAULT_TRAIN_OUTPUT = PROJECT_ROOT.joinpath("data", "routercore_train_instruct.jsonl")
DEFAULT_EVAL_OUTPUT = PROJECT_ROOT.joinpath("data", "routercore_eval_instruct.jsonl")
# Instruction header placed at the start of every prompt built in this module.
# It lists the allowed statuses, the allowed workflows, and the required JSON
# fields. The text is part of the prompt itself, so any edit changes every
# prompt generated from it.
SYSTEM_PROMPT = """You are RouterCore, a routing model for DevOps agent workflows.
Return only valid JSON matching the RouterCore schema.
Do not include markdown, explanations, or code fences.
Allowed statuses:
routed, needs_clarification, requires_confirmation, rejected, fallback
Allowed workflows:
create_web_app, create_storage_bucket, create_service_account, grant_iam_role, create_scheduler_job
Required JSON fields:
status, workflow, confidence, parameters, missing_fields, candidate_workflows, failure_reasons, clarifying_question"""
def build_inference_prompt(user_request: str) -> str:
    """Build the prompt text for a single user request.

    The result is ``SYSTEM_PROMPT``, then a ``User request:`` line, the
    request itself, and a final ``RouterCore JSON:`` line terminated by a
    newline.

    Args:
        user_request: The request text to embed in the prompt.

    Returns:
        The assembled prompt string.
    """
    layout = "{}\nUser request:\n{}\nRouterCore JSON:\n"
    return layout.format(SYSTEM_PROMPT, user_request)
def build_training_prompt(user_request: str, expected: dict[str, Any]) -> str:
    """Build one training example: the inference prompt plus its target JSON.

    Args:
        user_request: The request text to embed in the prompt.
        expected: The target object; serialized with ``json.dumps`` using
            sorted keys and appended directly after the prompt.

    Returns:
        The inference prompt immediately followed by the serialized target.
    """
    completion = json.dumps(expected, sort_keys=True)
    prompt_prefix = build_inference_prompt(user_request)
    return prompt_prefix + completion
def load_jsonl(path: Path) -> list[dict[str, Any]]:
    """Read a JSON Lines file into a list of parsed records.

    Lines that are empty or contain only whitespace are skipped; every other
    line is parsed with ``json.loads``.

    Args:
        path: Location of the UTF-8 encoded JSONL file.

    Returns:
        The parsed records, in file order.
    """
    with path.open("r", encoding="utf-8") as stream:
        return [json.loads(raw) for raw in stream if raw.strip()]
def write_jsonl(path: Path, rows: list[dict[str, Any]]) -> None:
    """Write records to a JSON Lines file, one sorted-key JSON object per line.

    Missing parent directories are created. Every record is followed by a
    newline, and an empty ``rows`` produces an empty file.

    Args:
        path: Destination file; overwritten if it already exists.
        rows: Records to serialize with ``json.dumps(..., sort_keys=True)``.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # Terminate each record individually instead of joining and appending a
    # final newline: that way an empty input yields an empty file rather than
    # a file holding a single blank line.
    payload = "".join(f"{json.dumps(row, sort_keys=True)}\n" for row in rows)
    path.write_text(payload, encoding="utf-8")
def format_rows(rows: list[dict[str, Any]]) -> list[dict[str, str]]:
    """Convert raw records into ``{"id", "text"}`` training records.

    Each input record must provide the keys ``"id"``, ``"input"`` and
    ``"expected"``; a missing key raises ``KeyError``.

    Args:
        rows: Raw records to convert.

    Returns:
        One record per input row, in the same order, holding the original
        ``"id"`` and the training prompt built from ``"input"`` and
        ``"expected"`` under ``"text"``.
    """
    formatted: list[dict[str, str]] = []
    for record in rows:
        example = {
            "id": record["id"],
            "text": build_training_prompt(record["input"], record["expected"]),
        }
        formatted.append(example)
    return formatted
def format_dataset(
    train_input: Path = DEFAULT_TRAIN_INPUT,
    eval_input: Path = DEFAULT_EVAL_INPUT,
    train_output: Path = DEFAULT_TRAIN_OUTPUT,
    eval_output: Path = DEFAULT_EVAL_OUTPUT,
) -> tuple[Path, Path]:
    """Format the train and eval JSONL files and write the results.

    Both inputs are loaded and formatted before anything is written, so a
    failure while reading either input leaves both outputs untouched.

    Args:
        train_input: Raw training JSONL file.
        eval_input: Raw evaluation JSONL file.
        train_output: Destination for the formatted training records.
        eval_output: Destination for the formatted evaluation records.

    Returns:
        The ``(train_output, eval_output)`` paths that were written.
    """
    sources = (train_input, eval_input)
    destinations = (train_output, eval_output)

    # Read and format everything first, train before eval.
    formatted_sets = [format_rows(load_jsonl(source)) for source in sources]

    # Only then write, again train before eval.
    for destination, formatted in zip(destinations, formatted_sets):
        write_jsonl(destination, formatted)

    return train_output, eval_output
def parse_args() -> argparse.Namespace:
    """Parse the command-line options for the formatting script.

    Returns:
        A namespace with ``train_input``, ``eval_input``, ``train_output``
        and ``eval_output``, each a ``Path`` defaulting to the matching
        module-level ``DEFAULT_*`` constant.
    """
    cli = argparse.ArgumentParser(
        description="Format RouterCore JSONL data for causal-LM instruction tuning."
    )
    path_options = (
        ("--train-input", DEFAULT_TRAIN_INPUT),
        ("--eval-input", DEFAULT_EVAL_INPUT),
        ("--train-output", DEFAULT_TRAIN_OUTPUT),
        ("--eval-output", DEFAULT_EVAL_OUTPUT),
    )
    for flag, fallback in path_options:
        cli.add_argument(flag, type=Path, default=fallback)
    return cli.parse_args()
def main() -> None:
    """Run the formatter from the command line and report the written files."""
    options = parse_args()
    written = format_dataset(
        train_input=options.train_input,
        eval_input=options.eval_input,
        train_output=options.train_output,
        eval_output=options.eval_output,
    )
    train_path, eval_path = written
    print(f"Wrote instruction train data to {train_path}")
    print(f"Wrote instruction eval data to {eval_path}")
# Run the formatter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()