import { chromium } from "@playwright/test";
import { existsSync } from "node:fs";

/*
 * Browser smoke test for the agent page.
 *
 * For every entry in `tasks`, a fresh headless Chromium instance loads the
 * page, confirms the model load, submits the task prompt and then checks the
 * transcript, file listing and event log for the expected evidence.
 *
 * Inputs:
 *   argv[2]          page URL (optional, defaults to the local dev server URL)
 *   CHROMIUM_PATH    browser executable to launch (optional)
 *   MAX_NEW_TOKENS   value typed into the #max-new-tokens field (optional)
 *
 * On success a JSON array with one summary object per task is printed. The
 * first failing task throws, which aborts the run.
 */

// Page under test; the first CLI argument overrides the default URL.
const baseUrl = process.argv[2] || "http://localhost:5173/?model=qwen25coder&device=wasm";

// Browser binary: CHROMIUM_PATH wins, then Google Chrome when it is present
// on disk, and finally the snap-packaged Chromium path.
const googleChromePath = "/usr/bin/google-chrome";
const executablePath =
  process.env.CHROMIUM_PATH || (existsSync(googleChromePath) ? googleChromePath : "/snap/bin/chromium");

// Kept as a string because it is typed verbatim into a form field.
const maxNewTokens = process.env.MAX_NEW_TOKENS || "256";

// Timeouts in milliseconds.
const DEFAULT_TIMEOUT_MS = 20 * 60 * 1000; // page default and wait for the agent to finish
const MODEL_READY_TIMEOUT_MS = 10 * 60 * 1000; // wait for the model-ready flag
const AGENT_START_TIMEOUT_MS = 2 * 60 * 1000; // wait for the agent to start running

// Scenarios to run. `expectedOutput` is searched for in the transcript and
// every `expectedFiles` entry in the file listing (both case-insensitively).
const tasks = [
  {
    id: "simple-calc",
    prompt:
      "Create hello.js containing JavaScript that computes 21 * 2 and prints result: 42, then run node hello.js.",
    expectedOutput: "42",
    expectedFiles: ["hello.js"],
  },
  {
    id: "npm-dependency",
    prompt:
      "Install npm package is-number@7.0.0, create check-package.mjs that imports it and prints dependency check: true, then run it with Node.",
    expectedOutput: "dependency check: true",
    expectedFiles: ["check-package.mjs", "node_modules"],
  },
  {
    id: "multi-file-module",
    prompt:
      "Create src/math.mjs exporting multiply(a,b), create test.mjs importing it and printing multi result: 42 for multiply(6,7), then run node test.mjs.",
    expectedOutput: "multi result: 42",
    expectedFiles: ["test.mjs", "src"],
  },
];

/**
 * Waits until the text content of the page's `#status` element equals `expectedText`.
 *
 * @param {object} page - Playwright page to poll.
 * @param {string} expectedText - Exact status text to wait for.
 * @param {object} [options] - Forwarded to `page.waitForFunction`; when omitted
 *   the page's default timeout applies.
 * @returns {Promise<unknown>} Settles when the status matches or the wait times out.
 */
function waitForStatus(page, expectedText, options) {
  return page.waitForFunction(
    (wanted) => document.querySelector("#status")?.textContent === wanted,
    expectedText,
    options,
  );
}

/**
 * Runs one task in its own browser instance and validates the outcome.
 *
 * @param {{id: string, prompt: string, expectedOutput: string, expectedFiles: string[]}} task
 *   Scenario to execute.
 * @returns {Promise<object>} Summary with the task id, `ok: true`, the token
 *   limit used, the model status text, the elapsed milliseconds since the
 *   prompt was entered, the file listing text, the transcript length and the
 *   last three console warnings.
 * @throws {Error} When the page is not cross-origin isolated, the expected
 *   output or a expected file is missing, or no finished run_command tool
 *   event was logged. Playwright timeouts propagate as well.
 */
async function runTask(task) {
  const browser = await chromium.launch({
    executablePath,
    headless: true,
    args: ["--no-sandbox", "--disable-dev-shm-usage"],
  });

  try {
    const page = await browser.newPage();
    page.setDefaultTimeout(DEFAULT_TIMEOUT_MS);

    // Collect console output and uncaught page errors, each prefixed with its kind.
    const capturedLines = [];
    page.on("console", (entry) => {
      capturedLines.push(`${entry.type()}: ${entry.text()}`);
    });
    page.on("pageerror", (failure) => {
      capturedLines.push(`pageerror: ${failure.stack || failure.message}`);
    });

    await page.goto(baseUrl, { waitUntil: "networkidle" });
    await waitForStatus(page, "Ready");

    const crossOriginIsolated = await page.evaluate(() => globalThis.crossOriginIsolated);
    if (!crossOriginIsolated) {
      throw new Error("Page is not cross-origin isolated.");
    }

    // Configure generation settings, then confirm the model load.
    await page.fill("#max-new-tokens", maxNewTokens);
    await page.fill("#temperature", "0");
    await page.selectOption("#gate-device", "wasm");
    await page.click("#confirm-load-model");
    await page.waitForFunction(() => window.__piWebAgent?.modelReady === true, null, {
      timeout: MODEL_READY_TIMEOUT_MS,
    });

    // Submit the prompt and wait for the status to go "Agent running" -> "Ready".
    const started = Date.now();
    await page.fill("#prompt", task.prompt);
    await page.click("#send");
    await waitForStatus(page, "Agent running", { timeout: AGENT_START_TIMEOUT_MS });
    await waitForStatus(page, "Ready", { timeout: DEFAULT_TIMEOUT_MS });

    // Gather everything the checks below need.
    const transcript = await page.evaluate(() => window.__piWebAgent?.transcript || "");
    const files = await page.textContent("#files");
    const events = await page.textContent("#event-log");
    const modelStatus = await page.textContent("#model-status");

    // All comparisons are case-insensitive; missing panels count as empty text.
    const transcriptLower = transcript.toLowerCase();
    const eventsLower = events?.toLowerCase() || "";
    const filesLower = files?.toLowerCase() || "";

    const hasExpectedOutput = transcriptLower.includes(task.expectedOutput.toLowerCase());
    if (!hasExpectedOutput) {
      throw new Error(`Expected output ${task.expectedOutput}.\n\nTranscript:\n${transcript}\n\nEvents:\n${events}`);
    }

    // Report the first expected file that the listing does not mention.
    const missingFile = task.expectedFiles.find((name) => !filesLower.includes(name.toLowerCase()));
    if (missingFile !== undefined) {
      throw new Error(`Expected ${missingFile} in file listing.\n\nFiles:\n${files}\n\nTranscript:\n${transcript}`);
    }

    const ranCommand = eventsLower.includes("tool: run_command finished");
    if (!ranCommand) {
      throw new Error(`Expected run_command tool execution.\n\nEvents:\n${events}`);
    }

    const warningLines = capturedLines.filter((line) => line.startsWith("warning:"));
    return {
      id: task.id,
      ok: true,
      maxNewTokens,
      modelStatus,
      runMs: Date.now() - started,
      files,
      transcriptChars: transcript.length,
      warnings: warningLines.slice(-3),
    };
  } finally {
    // Always release the browser, whether the task passed or threw.
    await browser.close();
  }
}

// Tasks run one after another, each in its own browser; a failure stops the run.
const results = [];
for (const task of tasks) {
  const summary = await runTask(task);
  results.push(summary);
}

console.log(JSON.stringify(results, null, 2));