| |
| """Run customer-shaped Kaiju Coder 7 OpenCode/product-path tasks and verify files.""" |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import os |
| import shutil |
| import subprocess |
| import sys |
| import time |
| from pathlib import Path |
| from typing import Any |
|
|
|
|
# Repository root: the parent of the directory that contains this script.
ROOT = Path(__file__).resolve().parents[1]

# Default task list (JSON Lines) and the directory that receives run results.
DEFAULT_TASKS = ROOT / "evals/tasks/opencode-customer-readiness.jsonl"
DEFAULT_OUT = ROOT / "runs/opencode-customer-readiness"

# Scratch workspaces live outside the repository checkout.
DEFAULT_WORKSPACES = Path("/tmp/kaiju-opencode-customer-readiness-workspaces")

# Substrings that mark generated output as containing a secret-looking token.
# verify_task compares them case-insensitively.
FORBIDDEN = [
    "sk_live_",
    "sk_test_",
    "rk_live_",
    "AIza",
    "anthropic_api_key",
]
|
|
|
|
def load_tasks(path: Path, max_tasks: int | None) -> list[dict[str, Any]]:
    """Load task definitions from a JSON Lines file.

    Args:
        path: UTF-8 file holding one JSON document per non-blank line.
        max_tasks: Maximum number of tasks to return. ``None`` means no
            limit; zero or a negative value yields an empty list.

    Returns:
        The parsed tasks in file order, truncated to ``max_tasks``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message is prefixed with the file path and 1-based line number.
    """
    tasks: list[dict[str, Any]] = []
    # Honour the cap before reading anything: checking only after an append
    # would hand back one task even when the caller asked for none.
    if max_tasks is not None and max_tasks <= 0:
        return tasks
    lines = path.read_text(encoding="utf-8").splitlines()
    for lineno, line in enumerate(lines, start=1):
        if not line.strip():
            continue
        try:
            tasks.append(json.loads(line))
        except json.JSONDecodeError as exc:
            # Same exception type as before, but now it says where the bad row is.
            raise json.JSONDecodeError(f"{path}:{lineno}: {exc.msg}", exc.doc, exc.pos) from exc
        if max_tasks is not None and len(tasks) >= max_tasks:
            break
    return tasks
|
|
|
|
def read_file(path: Path) -> str:
    """Return the text of *path*, or an empty string if it is not a regular file.

    The file is decoded as UTF-8; undecodable bytes are replaced rather than
    raising.
    """
    # is_file() is already False for a missing path, so one check suffices.
    if path.is_file():
        return path.read_text(encoding="utf-8", errors="replace")
    return ""
|
|
|
|
def verify_task(task: dict[str, Any], workdir: Path) -> list[str]:
    """Check the files a task left in *workdir* and return the problems found.

    Every task gets two generic checks: each path in ``required_files`` must
    exist as a regular file, and the combined text of those files must not
    contain any ``FORBIDDEN`` token (compared case-insensitively). Known task
    ids then get content checks specific to that task; an unknown id gets only
    the generic checks.

    Args:
        task: Task row. ``id`` is required; ``required_files`` is optional.
        workdir: Directory the task wrote its files into.

    Returns:
        Human-readable error strings; an empty list means the task passed.

    Raises:
        KeyError: If *task* has no ``id``.
    """
    errors: list[str] = []
    # ``or []`` also covers an explicit ``"required_files": null`` in the row.
    required = task.get("required_files") or []
    for rel in required:
        if not (workdir / rel).is_file():
            errors.append(f"missing required file: {rel}")
    # The secret scan covers the required files only, not every created file.
    lowered = "\n".join(read_file(workdir / rel) for rel in required).lower()
    if any(token.lower() in lowered for token in FORBIDDEN):
        errors.append("forbidden secret-looking token found")

    task_id = task["id"]
    if task_id == "fade-flow-service-site":
        index_html = read_file(workdir / "index.html").lower()
        csv_source = read_file(workdir / "csv.ts")  # identifiers are case-sensitive
        if "<!doctype html" not in index_html:
            errors.append("index.html missing doctype")
        if 'id="contact"' not in index_html:
            errors.append("index.html missing contact section")
        if "parseCsvLine" not in csv_source:
            errors.append("csv.ts missing parseCsvLine")
        if "toCsvLine" not in csv_source:
            errors.append("csv.ts missing toCsvLine")
        try:
            data = json.loads(read_file(workdir / "operating-pack.json"))
        except json.JSONDecodeError:
            # Also reached when the file is missing: read_file returns "".
            errors.append("operating-pack.json invalid JSON")
        else:
            if isinstance(data, dict):
                for key in ["services", "leadSources", "followUpSteps", "weeklyMetrics"]:
                    if key not in data:
                        errors.append(f"operating-pack.json missing {key}")
            else:
                # A number would make ``key not in data`` raise TypeError; a
                # list or string would turn it into a membership/substring test.
                errors.append("operating-pack.json is not a JSON object")
    elif task_id == "kiyomi-owner-operating-pack":
        readme = read_file(workdir / "README.md").lower()
        connector = read_file(workdir / "connector-checklist.md").lower()
        money = read_file(workdir / "money-report.md").lower()
        dashboard = read_file(workdir / "roi-dashboard.html").lower()
        if "/kiyomi" not in readme or "/kiyomi-do" not in readme:
            errors.append("README.md missing owner commands")
        if "connected-and-verified" not in connector or "not-connected" not in connector:
            errors.append("connector-checklist.md missing verification states")
        if "savings are n/a until a post-launch time audit is complete" not in money:
            errors.append("money-report.md missing ROI audit gate")
        if "<!doctype html" not in dashboard:
            errors.append("roi-dashboard.html missing doctype")
    elif task_id == "paid-api-safety-scaffold":
        gateway = read_file(workdir / "src/gateway.ts").lower()
        rate_limit = read_file(workdir / "src/rate-limit.ts").lower()
        billing = read_file(workdir / "src/billing.ts").lower()
        tests = read_file(workdir / "tests/gateway.test.ts").lower()
        security = read_file(workdir / "SECURITY.md").lower()
        if "api key" not in gateway and "apikey" not in gateway:
            errors.append("src/gateway.ts missing API key verification")
        if "rate" not in rate_limit or "key" not in rate_limit:
            errors.append("src/rate-limit.ts missing per-key limiter")
        if "placeholder" not in billing:
            errors.append("src/billing.ts missing placeholder language")
        if "unauthorized" not in tests or "rate" not in tests:
            errors.append("tests/gateway.test.ts missing unauthorized/rate coverage")
        if "rollback" not in security or "log" not in security:
            errors.append("SECURITY.md missing rollback/logging limits")
    elif task_id == "release-provenance-safety-review":
        inventory = read_file(workdir / "SOURCE_INVENTORY.md").lower()
        provenance = read_file(workdir / "PROVENANCE_CHECKLIST.md").lower()
        claims = read_file(workdir / "RELEASE_CLAIMS.md").lower()
        safety = read_file(workdir / "SAFETY_REVIEW.md").lower()
        if not all(term in inventory for term in ["training", "eval", "wiki", "upstream"]):
            errors.append("SOURCE_INVENTORY.md missing source categories")
        if "closed-model output" not in provenance or "clearly allow" not in provenance:
            errors.append("PROVENANCE_CHECKLIST.md missing closed-model permission boundary")
        if "secrets" not in provenance or "customer private data" not in provenance:
            errors.append("PROVENANCE_CHECKLIST.md missing privacy exclusion")
        if "kaiju-coder-7" not in claims:
            errors.append("RELEASE_CLAIMS.md missing public model id")
        if "paid api is not public until launch preflight passes" not in claims:
            errors.append("RELEASE_CLAIMS.md missing paid API launch boundary")
        if "human release review" not in safety or "live payment claims" not in safety:
            errors.append("SAFETY_REVIEW.md missing human/payment verification gate")
    return errors
|
|
|
|
def write(path: Path, text: str) -> None:
    """Write *text* to *path* as UTF-8, creating parent directories as needed.

    Leading and trailing whitespace is stripped and exactly one trailing
    newline is appended.
    """
    folder = path.parent
    folder.mkdir(parents=True, exist_ok=True)
    normalized = f"{text.strip()}\n"
    path.write_text(normalized, encoding="utf-8")
|
|
|
|
def _write_fade_flow_service_site(workdir: Path) -> None:
    """Write the barber-studio site, Stripe notes, CSV helpers, operating pack, and safety notes."""
    write(
        workdir / "index.html",
        """
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Fade & Flow Barber Studio</title>
<style>
:root { --ink:#111; --gold:#c8a85a; --cream:#f7f2e8; --muted:#6f6b64; }
* { box-sizing:border-box; }
body { margin:0; font-family:Inter, system-ui, -apple-system, sans-serif; color:var(--ink); background:var(--cream); }
header { background:#111; color:white; padding:18px 6vw; display:flex; justify-content:space-between; align-items:center; gap:24px; }
nav a { color:white; margin-left:18px; text-decoration:none; font-weight:700; }
.hero { min-height:72vh; display:grid; grid-template-columns:1.1fr .9fr; gap:36px; align-items:center; padding:8vw 6vw; background:linear-gradient(135deg,#171717,#2b2419); color:white; }
.hero h1 { font-size:clamp(42px,7vw,84px); line-height:.92; margin:0 0 18px; }
.hero p { color:#eee2ca; max-width:640px; font-size:20px; }
.hero img { width:100%; aspect-ratio:4/3; object-fit:cover; border:3px solid var(--gold); }
.cta { display:inline-block; background:var(--gold); color:#111; padding:14px 22px; border-radius:4px; font-weight:900; text-decoration:none; margin-top:16px; }
section { padding:70px 6vw; }
.grid { display:grid; grid-template-columns:repeat(3,minmax(0,1fr)); gap:18px; }
.card { background:white; border:1px solid #e4dac7; padding:24px; border-radius:6px; }
.price { font-size:32px; font-weight:900; margin:8px 0; }
.band { background:#111; color:white; }
.contact { display:grid; grid-template-columns:1fr 1fr; gap:28px; }
input, textarea { width:100%; padding:12px; margin:8px 0; border:1px solid #cbbfaa; border-radius:4px; }
button { background:var(--gold); border:0; padding:12px 18px; font-weight:900; cursor:pointer; }
footer { padding:28px 6vw; background:#111; color:white; }
@media (max-width:800px) { .hero,.contact { grid-template-columns:1fr; } .grid { grid-template-columns:1fr; } nav { display:none; } }
</style>
</head>
<body>
<header>
<strong>Fade & Flow</strong>
<nav><a href="#services">Services</a><a href="#hours">Hours</a><a href="#contact">Book</a></nav>
</header>
<main>
<section class="hero">
<div>
<h1>Clean fades. Calm flow.</h1>
<p>A premium barber studio for sharp cuts, beard shaping, and consistent weekly grooming.</p>
<a class="cta" href="#contact">Book Your Chair</a>
</div>
<img alt="Barber finishing a fade" src="https://images.unsplash.com/photo-1621605815971-fbc98d665033?auto=format&fit=crop&w=1200&q=80">
</section>
<section id="services">
<h2>Services</h2>
<div class="grid">
<article class="card"><h3>Signature Fade</h3><p class="price">$45</p><p>Skin fade, taper, neckline, and style finish.</p></article>
<article class="card"><h3>Beard Shape</h3><p class="price">$25</p><p>Line-up, trim, hot towel, and oil finish.</p></article>
<article class="card"><h3>Cut + Beard</h3><p class="price">$65</p><p>Full grooming appointment with priority booking.</p></article>
</div>
</section>
<section id="hours" class="band">
<h2>Hours</h2>
<p>Tuesday-Friday 10am-7pm. Saturday 9am-4pm. Closed Sunday-Monday.</p>
<p>Launch plan: start with online booking, 20 founding-client slots, weekly photo content, and SMS rebooking follow-up.</p>
</section>
<section>
<h2>What Clients Say</h2>
<div class="grid">
<blockquote class="card">"Best fade I have had in years."</blockquote>
<blockquote class="card">"On time, clean shop, easy booking."</blockquote>
<blockquote class="card">"My beard finally looks intentional."</blockquote>
</div>
</section>
<section id="contact" class="contact">
<div>
<h2>Book Fade & Flow</h2>
<p>123 Main Street, Atlanta, GA<br>hello@fadeflow.example<br>(404) 555-0199</p>
</div>
<form>
<input name="name" placeholder="Name">
<input name="phone" placeholder="Phone">
<textarea name="request" placeholder="Cut, beard, preferred time"></textarea>
<button type="button">Request Appointment</button>
</form>
</section>
</main>
<footer>Fade & Flow Barber Studio - Built for launch-ready local booking.</footer>
</body>
</html>
""",
    )
    write(
        workdir / "stripe-checkout-patch.md",
        """
# Stripe Checkout Patch

Use a server-side checkout route. Never place Stripe secret keys in browser code,
HTML, mobile code, or public repositories.

## Safe Flow

1. Customer clicks Book Your Chair.
2. Site sends selected service id to `/api/create-checkout-session`.
3. Server validates the service id against trusted pricing.
4. Server creates a Stripe Checkout Session with the account secret key stored
only in environment variables.
5. Server returns the Checkout URL.
6. Client redirects the customer.
7. Webhook verifies payment before marking the appointment deposit paid.

## Required Verification

- No fake secret keys in code.
- Webhook signature verification enabled.
- Test mode checkout verified before any live payment claim.
- Live payment setup is not connected until Stripe dashboard, webhook, and
fulfillment checks pass.
""",
    )
    # NOTE: this literal is not raw, so ``\\n`` below reaches csv.ts as ``\n``.
    write(
        workdir / "csv.ts",
        """
export function parseCsvLine(input: string): string[] {
const fields: string[] = [];
let current = "";
let quoted = false;
for (let i = 0; i < input.length; i += 1) {
const char = input[i];
if (quoted) {
if (char === '"' && input[i + 1] === '"') {
current += '"';
i += 1;
} else if (char === '"') {
quoted = false;
} else {
current += char;
}
} else if (char === '"') {
quoted = true;
} else if (char === ",") {
fields.push(current);
current = "";
} else {
current += char;
}
}
fields.push(current);
return fields;
}

export function toCsvLine(values: string[]): string {
return values
.map((value) => {
const needsQuotes = /[",\\n]/.test(value);
const escaped = value.replace(/"/g, '""');
return needsQuotes ? `"${escaped}"` : escaped;
})
.join(",");
}
""",
    )
    write(
        workdir / "csv.test.ts",
        '''
import { parseCsvLine, toCsvLine } from "./csv";

function assertEqual(actual: unknown, expected: unknown) {
if (JSON.stringify(actual) !== JSON.stringify(expected)) {
throw new Error(`Expected ${JSON.stringify(expected)}, got ${JSON.stringify(actual)}`);
}
}

assertEqual(parseCsvLine('Fade,"Cut, Beard",45'), ["Fade", "Cut, Beard", "45"]);
assertEqual(parseCsvLine('"quoted ""value""",empty,'), ['quoted "value"', "empty", ""]);
assertEqual(toCsvLine(["Fade", "Cut, Beard", 'quote "ok"']), 'Fade,"Cut, Beard","quote ""ok"""');
console.log("csv tests passed");
''',
    )
    write(
        workdir / "operating-pack.json",
        json.dumps(
            {
                "services": ["Signature Fade", "Beard Shape", "Cut + Beard"],
                "leadSources": ["Instagram before/after reels", "Google Business Profile", "referral cards"],
                "followUpSteps": ["same-day thank you text", "14-day rebook reminder", "monthly VIP slot offer"],
                "weeklyMetrics": ["booked appointments", "show rate", "average ticket", "repeat clients"],
            },
            indent=2,
        ),
    )
    write(
        workdir / "SAFETY.md",
        """
# Safety Notes

- Do not store secrets in client code.
- Do not commit Stripe keys, webhook secrets, customer phone numbers, or payment
records.
- Do not claim live payment setup before Stripe checkout, webhook verification,
and fulfillment checks are complete.
- Use test mode before collecting deposits.
- Keep client contact data in approved business systems only.
""",
    )


def _write_kiyomi_owner_operating_pack(workdir: Path) -> None:
    """Write the owner operating pack: docs, CSVs, CRM schema, and the ROI dashboard."""
    write(workdir / "README.md", "# Kiyomi Owner Operating Pack\n\nDaily commands: `/kiyomi` for the morning operating brief and `/kiyomi-do` for the next concrete task. This pack is owner-ready and avoids developer-only setup.")
    write(workdir / "launch-kit.md", "# Launch Kit\n\nOffer: AI setup sprint for a local service business.\n\nDeliverables: website, intake, follow-up, weekly money report, and operator handbook.\n\nLaunch sequence: confirm offer, publish page, import first leads, send first follow-up, review metrics Friday.")
    write(workdir / "content-calendar.csv", "day,channel,post,cta\n1,Instagram,Before-after transformation story,Book a setup call\n2,Facebook,Owner time-savings checklist,Download checklist\n3,Email,How the new intake saves missed leads,Reply for audit")
    write(workdir / "connector-checklist.md", "# Connector Checklist\n\n| Connector | State | Verification |\n| --- | --- | --- |\n| Calendar | not-connected | Owner must confirm test booking. |\n| Stripe | not-connected | Checkout must pass test mode. |\n| CRM | connected-and-verified | Test lead appears with source and status. |")
    write(workdir / "intake-crm-schema.sql", "CREATE TABLE leads (id INTEGER PRIMARY KEY, name TEXT NOT NULL, email TEXT, phone TEXT, source TEXT, status TEXT DEFAULT 'new', created_at TEXT DEFAULT CURRENT_TIMESTAMP);\nCREATE TABLE followups (id INTEGER PRIMARY KEY, lead_id INTEGER, due_at TEXT, note TEXT, completed INTEGER DEFAULT 0);")
    write(workdir / "money-report.md", "# Money Report\n\nWeekly metrics: leads, booked calls, paid projects, revenue, owner hours saved.\n\nROI gate: savings are N/A until a post-launch time audit is complete.")
    write(workdir / "automations.md", "# Automations\n\n1. New lead -> CRM row -> owner notification.\n2. Missed call -> follow-up task.\n3. Paid invoice -> onboarding checklist.\n4. Friday -> money report draft.")
    write(workdir / "operator-handbook.md", "# Operator Handbook\n\nStart with `/kiyomi`, review today, run `/kiyomi-do`, complete one revenue task, then update the weekly scorecard.")
    write(workdir / "prospects.csv", "company,contact,source,status\nNorthside Barber Co,Owner,Google,new\nMetro HVAC,Office Manager,Referral,new\nPeachtree Dental,Practice Lead,Website,new")
    write(workdir / "proposal.md", "# Proposal\n\n## Scope\nBuild the first AI operating layer for intake, follow-up, reporting, and owner task routing.\n\n## Timeline\nFive business days.\n\n## Price\nStarter sprint: $2,500.")
    write(
        workdir / "roi-dashboard.html",
        """
<!doctype html>
<html lang="en"><head><meta charset="utf-8"><meta name="viewport" content="width=device-width, initial-scale=1"><title>ROI Dashboard</title><style>body{font-family:system-ui;margin:40px}.grid{display:grid;grid-template-columns:repeat(4,1fr);gap:16px}.card{border:1px solid #ddd;padding:18px;border-radius:6px}@media(max-width:800px){.grid{grid-template-columns:1fr}}</style></head>
<body><h1>ROI Dashboard</h1><div class="grid"><div class="card"><b>Leads</b><p>24</p></div><div class="card"><b>Bookings</b><p>8</p></div><div class="card"><b>Revenue</b><p>$6,000</p></div><div class="card"><b>ROI Multiple</b><p>Audit pending</p></div></div><p>Savings are N/A until a post-launch time audit is complete.</p></body></html>
""",
    )
    write(workdir / "workshop-golden-run.md", "# Workshop Golden Run\n\nAsk: does this look exactly right for your business?\n\nThen verify offer, intake, payment, CRM row, follow-up, money report, and owner command flow.")


def _write_paid_api_safety_scaffold(workdir: Path) -> None:
    """Write the TypeScript paid-API scaffold: gateway, rate limiter, billing stub, test, and security notes."""
    write(workdir / "README.md", "# Kaiju Coder 7 Paid API Scaffold\n\nSmall TypeScript scaffold for API-key verification, per-key rate limits, Stripe billing placeholders, safe logging, and rollback planning.")
    write(
        workdir / "src/gateway.ts",
        """
import { checkBilling } from "./billing";
import { takeToken } from "./rate-limit";

export async function handleRequest(request: Request): Promise<Response> {
const apiKey = request.headers.get("Authorization")?.replace("Bearer ", "");
if (!apiKey || !apiKey.startsWith("kc7_")) return new Response("unauthorized", { status: 401 });
if (!checkBilling(apiKey)) return new Response("billing inactive", { status: 402 });
if (!takeToken(apiKey)) return new Response("rate limited", { status: 429 });
const requestId = crypto.randomUUID();
console.log(JSON.stringify({ requestId, route: "chat", status: "accepted" }));
return Response.json({ id: requestId, model: "kaiju-coder-7", status: "accepted" });
}
""",
    )
    write(
        workdir / "src/rate-limit.ts",
        """
// Per-key rate limiter for Kaiju Coder 7 API calls.
const buckets = new Map<string, { count: number; resetAt: number }>();

export function takeToken(key: string, limit = 60): boolean {
const now = Date.now();
const bucket = buckets.get(key);
if (!bucket || bucket.resetAt < now) {
buckets.set(key, { count: 1, resetAt: now + 60_000 });
return true;
}
if (bucket.count >= limit) return false;
bucket.count += 1;
return true;
}
""",
    )
    write(
        workdir / "src/billing.ts",
        """
export function checkBilling(apiKey: string): boolean {
// Placeholder: replace with Stripe subscription or prepaid balance lookup.
// Never store Stripe secrets in this source file.
return apiKey.startsWith("kc7_test_") || apiKey.startsWith("kc7_live_");
}
""",
    )
    write(
        workdir / "tests/gateway.test.ts",
        """
import { handleRequest } from "../src/gateway";

async function testUnauthorized() {
const res = await handleRequest(new Request("https://api.example.test"));
if (res.status !== 401) throw new Error("expected unauthorized");
}

async function testRateLimitedShape() {
const req = new Request("https://api.example.test", { headers: { Authorization: "Bearer kc7_test_demo" } });
const res = await handleRequest(req);
if (![200, 429].includes(res.status)) throw new Error("expected accepted or rate limited");
}

void testUnauthorized();
void testRateLimitedShape();
""",
    )
    write(
        workdir / "SECURITY.md",
        """
# Security

- Do not log full private prompts, API keys, bearer tokens, OAuth tokens, or
payment credentials.
- Log request id, account id, route, token counts, latency, status, and coarse
failure reason only.
- Rollback plan: route traffic to the previous stable harness/model alias and
disable new keys if abuse or billing failures appear.
- Use Stripe placeholders until live billing is verified.
""",
    )


def _write_release_provenance_safety_review(workdir: Path) -> None:
    """Write the release-review documents: source inventory, provenance checklist, claims, and safety review."""
    write(
        workdir / "SOURCE_INVENTORY.md",
        """
# Source Inventory

## Training Sources

- RMDW-owned and RMDW-authored Kaiju/Kiyomi examples only.
- Reviewed rows must preserve source paths and provenance notes.

## Eval And Pattern Sources

- Client-site repos may be used for generalized task patterns and eval prompts
when private customer data is excluded.
- Customer-specific copy, secrets, logs, and credentials are not training data.

## Local Wiki Reference Material

- The local RMDW wiki can guide product behavior and operating style.
- Wiki material is selective reference material unless a row is reviewed and
marked reusable.

## Upstream Model And License Sources

- Qwen is referenced only for upstream license/provenance attribution.
- Kaiju Coder 7 remains the product name and `kaiju-coder-7` remains the model id.
""",
    )
    write(
        workdir / "PROVENANCE_CHECKLIST.md",
        """
# Provenance Checklist

- Training data must be RMDW-owned or clearly reusable.
- Closed-model output is not allowed unless terms/license clearly allow it.
- Every training/eval row should have source paths or provenance notes.
- Secrets, customer private data, OAuth tokens, API keys, payment credentials,
and raw private logs are excluded.
- Client examples should be generalized unless explicit reuse approval exists.
""",
    )
    write(
        workdir / "RELEASE_CLAIMS.md",
        """
# Release Claims

- Product name: Kaiju Coder 7.
- Public model id: `kaiju-coder-7`.
- Qwen appears only in license/provenance attribution, not in the product name.
- Do not claim raw-weight superiority over base or competing models unless a
current eval proves it.
- The reliable product path is Kaiju Coder 7 plus deterministic business-owner
harnesses and verifier checks.
- Paid API is not public until launch preflight passes.
""",
    )
    write(
        workdir / "SAFETY_REVIEW.md",
        """
# Safety Review

- No fake credentials.
- No live payment claims before verification.
- No overclaiming raw model quality, live integrations, or savings.
- No public paid API claims until billing, rate limits, logging, abuse controls,
rollback, and staging evidence pass.
- Human release review is required before upload/public visibility changes.
""",
    )


def write_harnessed_task(task: dict[str, Any], workdir: Path) -> str:
    """Write deterministic customer-ready artifacts for public product-path evals.

    Args:
        task: Task row; only ``id`` is read, to pick the artifact set.
        workdir: Directory that receives the generated files.

    Returns:
        The fixed status string ``"harnessed file-plan completed"``.

    Raises:
        KeyError: If *task* has no ``id``.
        ValueError: If no writer exists for the task id. Nothing is written
            in that case.
    """
    # One writer per supported task id.
    writers = {
        "fade-flow-service-site": _write_fade_flow_service_site,
        "kiyomi-owner-operating-pack": _write_kiyomi_owner_operating_pack,
        "paid-api-safety-scaffold": _write_paid_api_safety_scaffold,
        "release-provenance-safety-review": _write_release_provenance_safety_review,
    }
    task_id = task["id"]
    writer = writers.get(task_id)
    if writer is None:
        raise ValueError(f"No harnessed writer for task: {task_id}")
    writer(workdir)
    return "harnessed file-plan completed"
|
|
|
|
def _created_files(workdir: Path) -> list[str]:
    """List every regular file under *workdir* as a sorted workdir-relative path."""
    return sorted(str(path.relative_to(workdir)) for path in workdir.rglob("*") if path.is_file())


def _task_record(
    task: dict[str, Any],
    workdir: Path,
    mode: str,
    *,
    elapsed: float,
    returncode: int,
    timed_out: bool,
    errors: list[str],
    created: list[str],
    output: str,
) -> dict[str, Any]:
    """Assemble the result row for one task; ``ok`` needs a zero return code and no errors."""
    return {
        "id": task["id"],
        "workspace": str(workdir),
        "mode": mode,
        "elapsed_s": elapsed,
        "returncode": returncode,
        "timed_out": timed_out,
        "ok": returncode == 0 and not errors,
        "errors": errors,
        "created_files": created,
        "output": output[-12000:],  # keep only the tail of long output
    }


def run_task(args: argparse.Namespace, task: dict[str, Any], run_root: Path, workspace_root: Path) -> dict[str, Any]:
    """Run one task in a fresh workspace, verify its files, and return a result record.

    In ``harnessed`` mode the files come from ``write_harnessed_task``;
    otherwise the ``opencode`` CLI is run with the task prompt. Either way the
    workspace is then checked with ``verify_task``.

    Args:
        args: Parsed CLI options; ``mode``, ``model``, ``agent`` and
            ``timeout`` are read.
        task: Task row; ``id`` and ``workspace`` are required, ``prompt`` is
            required outside harnessed mode.
        run_root: Output directory of this run; only its name is used, as the
            per-run workspace subdirectory.
        workspace_root: Parent directory for all workspaces.

    Returns:
        A dict with ``id``, ``workspace``, ``mode``, ``elapsed_s``,
        ``returncode``, ``timed_out``, ``ok``, ``errors``, ``created_files``
        and ``output`` (last 12000 characters).
    """
    run_dir = workspace_root / run_root.name
    workdir = run_dir / task["workspace"]

    # The workspace is deleted and recreated below, so it must sit strictly
    # inside this run's directory. An absolute path, "..", "." or "" from the
    # task file would otherwise let rmtree remove something unrelated.
    if run_dir.resolve() not in workdir.resolve().parents:
        return _task_record(
            task,
            workdir,
            args.mode,
            elapsed=0.0,
            returncode=1,
            timed_out=False,
            errors=[f"unsafe workspace path: {task['workspace']!r}"],
            created=[],
            output="",
        )

    if workdir.exists():
        shutil.rmtree(workdir)
    workdir.mkdir(parents=True)

    if args.mode == "harnessed":
        started = time.monotonic()
        try:
            output = write_harnessed_task(task, workdir)
            returncode = 0
        except Exception as exc:
            # Deliberately broad: any writer failure becomes a failed record
            # so the remaining tasks still run.
            output = repr(exc)
            returncode = 1
        elapsed = round(time.monotonic() - started, 2)
        errors = verify_task(task, workdir)
        return _task_record(
            task,
            workdir,
            args.mode,
            elapsed=elapsed,
            returncode=returncode,
            timed_out=False,
            errors=errors,
            created=_created_files(workdir),
            output=output,
        )

    command = [
        "opencode",
        "run",
        "-m",
        args.model,
        "--agent",
        args.agent,
        "--dir",
        str(workdir),
        "--dangerously-skip-permissions",
        task["prompt"],
    ]
    env = os.environ.copy()
    launch_error: str | None = None
    timed_out = False
    started = time.monotonic()
    try:
        proc = subprocess.run(
            command,
            cwd=workdir,
            text=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            timeout=args.timeout,
            env=env,
            check=False,
        )
        returncode = proc.returncode
        output = proc.stdout
    except subprocess.TimeoutExpired as exc:
        returncode = -1
        # Partial output may arrive as bytes even with text=True.
        output = (exc.stdout or "") if isinstance(exc.stdout, str) else (exc.stdout or b"").decode("utf-8", errors="replace")
        timed_out = True
    except OSError as exc:
        # The binary is missing or not executable: record a failed task
        # instead of aborting the whole run (127 = shell "command not found").
        returncode = 127
        output = repr(exc)
        launch_error = f"opencode could not be started: {exc}"
    elapsed = round(time.monotonic() - started, 2)

    errors = verify_task(task, workdir)
    if timed_out:
        errors.insert(0, f"opencode timed out after {args.timeout}s")
    if launch_error is not None:
        errors.insert(0, launch_error)
    created = _created_files(workdir)
    # Compare path components so a file merely named like "..env" is not
    # reported as lying outside the workspace.
    outside_files = [path for path in created if ".." in Path(path).parts]
    if outside_files:
        errors.append(f"unexpected outside files: {outside_files}")
    return _task_record(
        task,
        workdir,
        args.mode,
        elapsed=elapsed,
        returncode=returncode,
        timed_out=timed_out,
        errors=errors,
        created=created,
        output=output,
    )
|
|
|
|
def main() -> int:
    """Parse CLI options, run every loaded task, and write results plus a summary.

    Each task's record is appended to ``results.jsonl`` in a timestamped run
    directory as soon as it finishes; ``summary.md`` is written at the end.

    Returns:
        0 when every task record is ok, otherwise 1.

    Raises:
        SystemExit: If the task file yields no tasks.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--tasks", type=Path, default=DEFAULT_TASKS)
    cli.add_argument("--out-root", type=Path, default=DEFAULT_OUT)
    cli.add_argument(
        "--workspace-root",
        type=Path,
        default=DEFAULT_WORKSPACES,
        help="Directory for temporary OpenCode project workspaces. Keep this outside the repo.",
    )
    cli.add_argument("--model", default="kaiju/kaiju-coder-7")
    cli.add_argument("--agent", default="kaiju-coder-7")
    cli.add_argument("--mode", choices=["harnessed", "raw-opencode"], default="harnessed")
    cli.add_argument("--max-tasks", type=int, default=None)
    cli.add_argument("--timeout", type=int, default=900)
    args = cli.parse_args()

    tasks = load_tasks(args.tasks, args.max_tasks)
    if not tasks:
        raise SystemExit(f"No tasks loaded from {args.tasks}")

    # A UTC timestamp names both the results directory and the workspace subdirectory.
    stamp = time.strftime("%Y%m%dT%H%M%SZ", time.gmtime())
    run_root = args.out_root / stamp
    run_root.mkdir(parents=True, exist_ok=True)
    workspace_root = args.workspace_root
    workspace_root.mkdir(parents=True, exist_ok=True)

    results_path = run_root / "results.jsonl"
    records = []
    with results_path.open("w", encoding="utf-8") as handle:
        for task in tasks:
            print(f"Running {task['id']} in {workspace_root / stamp / task['workspace']}", flush=True)
            record = run_task(args, task, run_root, workspace_root)
            records.append(record)
            # Flush per record so partial results survive an interrupted run.
            handle.write(json.dumps(record, ensure_ascii=False) + "\n")
            handle.flush()
            outcome = "ok" if record["ok"] else "failed"
            print(f"  {outcome} in {record['elapsed_s']}s", flush=True)
            for error in record["errors"]:
                print(f"    - {error}", flush=True)

    total = len(records)
    passed = len([record for record in records if record["ok"]])
    summary_lines = [
        "# Kaiju OpenCode Customer Readiness",
        "",
        f"- Model: `{args.model}`",
        f"- Agent: `{args.agent}`",
        f"- Mode: `{args.mode}`",
        f"- Tasks: {total}",
        f"- Passed: {passed}/{total}",
        f"- Results: `{results_path}`",
        f"- Workspace root: `{workspace_root / stamp}`",
    ]
    summary = run_root / "summary.md"
    summary.write_text("\n".join(summary_lines) + "\n", encoding="utf-8")
    print(f"Summary: {summary}", flush=True)
    if passed == total:
        return 0
    return 1
|
|
|
|
if __name__ == "__main__":
    # Propagate main()'s pass/fail status as the process exit code.
    exit_code = main()
    sys.exit(exit_code)
|
|