MiniCPM5-Pi-Web-Agent-Static / scripts /smoke_complex_tasks.mjs
Mike0021's picture
Use Qwen2.5 Coder planner and verified token caps
53f8186 verified
import { chromium } from "@playwright/test";
import { existsSync } from "node:fs";
// Page under test: the first CLI argument wins, otherwise the local dev-server URL below.
const DEFAULT_BASE_URL = "http://localhost:5173/?model=qwen25coder&device=wasm";
const baseUrl = process.argv[2] ? process.argv[2] : DEFAULT_BASE_URL;

/**
 * Picks the browser binary handed to Playwright.
 * Order of preference: a non-empty CHROMIUM_PATH environment variable, then
 * /usr/bin/google-chrome when that path exists on disk, then /snap/bin/chromium.
 *
 * @returns {string} Path of the browser executable to launch.
 */
function detectChromiumPath() {
  if (process.env.CHROMIUM_PATH) {
    return process.env.CHROMIUM_PATH;
  }
  if (existsSync("/usr/bin/google-chrome")) {
    return "/usr/bin/google-chrome";
  }
  return "/snap/bin/chromium";
}

const executablePath = detectChromiumPath();

// Kept as a string because it is typed verbatim into the page's #max-new-tokens input.
const maxNewTokens = process.env.MAX_NEW_TOKENS ? process.env.MAX_NEW_TOKENS : "256";
/**
 * Builds one smoke-task record.
 *
 * @param {string} id - Short identifier echoed back in the result report.
 * @param {string} prompt - Text typed into the agent's prompt box.
 * @param {string} expectedOutput - Substring that must appear in the transcript (compared case-insensitively).
 * @param {string[]} expectedFiles - Names that must appear in the page's file listing (compared case-insensitively).
 * @returns {{id: string, prompt: string, expectedOutput: string, expectedFiles: string[]}}
 */
const defineTask = (id, prompt, expectedOutput, expectedFiles) => ({
  id,
  prompt,
  expectedOutput,
  expectedFiles,
});

// Scenarios executed in order, each in its own browser session.
const tasks = [
  // Single file, single command.
  defineTask(
    "simple-calc",
    "Create hello.js containing JavaScript that computes 21 * 2 and prints result: 42, then run node hello.js.",
    "42",
    ["hello.js"],
  ),
  // Requires an npm install before the script can run.
  defineTask(
    "npm-dependency",
    "Install npm package is-number@7.0.0, create check-package.mjs that imports it and prints dependency check: true, then run it with Node.",
    "dependency check: true",
    ["check-package.mjs", "node_modules"],
  ),
  // Two files, one importing the other from a subdirectory.
  defineTask(
    "multi-file-module",
    "Create src/math.mjs exporting multiply(a,b), create test.mjs importing it and printing multi result: 42 for multiply(6,7), then run node test.mjs.",
    "multi result: 42",
    ["test.mjs", "src"],
  ),
];
// Default Playwright timeout for the page, also the cap on a single agent run (20 minutes).
const PAGE_TIMEOUT_MS = 1200000;
// Cap on waiting for window.__piWebAgent.modelReady after confirming the model load (10 minutes).
const MODEL_LOAD_TIMEOUT_MS = 600000;
// Cap on waiting for the status to switch to "Agent running" after clicking send (2 minutes).
const AGENT_START_TIMEOUT_MS = 120000;
// Number of trailing browser console lines printed to stderr when a task fails.
const FAILURE_CONSOLE_TAIL = 20;

/**
 * Resolves once the page's `#status` element text equals `expected` exactly.
 *
 * @param {object} page - Playwright page hosting the agent UI.
 * @param {string} expected - Exact status text to wait for.
 * @param {number} timeout - Maximum wait in milliseconds.
 * @returns {Promise<unknown>} Settles when the status matches; rejects when the wait times out.
 */
function waitForStatus(page, expected, timeout) {
  return page.waitForFunction(
    (wanted) => document.querySelector("#status")?.textContent === wanted,
    expected,
    { timeout },
  );
}

/**
 * Validates what the page showed after a run against the task's expectations.
 * Checks run in this order: expected output in the transcript, every expected
 * file in the listing, then evidence of a finished run_command tool call in the
 * event log. All comparisons are case-insensitive substring matches.
 *
 * @param {{expectedOutput: string, expectedFiles: string[]}} task - Task expectations.
 * @param {{transcript: string, files: string|null, events: string|null}} observed - Text read from the page.
 * @throws {Error} On the first expectation that is not met.
 */
function assertTaskOutcome(task, { transcript, files, events }) {
  const lowerTranscript = transcript.toLowerCase();
  // textContent may be null when the element is missing; treat that as empty text.
  const lowerEvents = events?.toLowerCase() || "";
  const lowerFiles = files?.toLowerCase() || "";

  if (!lowerTranscript.includes(task.expectedOutput.toLowerCase())) {
    throw new Error(`Expected output ${task.expectedOutput}.\n\nTranscript:\n${transcript}\n\nEvents:\n${events}`);
  }
  for (const file of task.expectedFiles) {
    if (!lowerFiles.includes(file.toLowerCase())) {
      throw new Error(`Expected ${file} in file listing.\n\nFiles:\n${files}\n\nTranscript:\n${transcript}`);
    }
  }
  if (!lowerEvents.includes("tool: run_command finished")) {
    throw new Error(`Expected run_command tool execution.\n\nEvents:\n${events}`);
  }
}

/**
 * Runs one smoke task end-to-end in a freshly launched headless browser.
 *
 * Steps: open `baseUrl`, wait for the "Ready" status, require cross-origin
 * isolation, configure generation settings, load the model on the wasm device,
 * submit the task prompt, wait for the agent to start and to return to "Ready",
 * then validate the transcript, file listing and event log.
 *
 * On any failure the most recent captured browser console lines (including
 * uncaught page errors) are written to stderr before the original error is
 * rethrown, so they are no longer lost. The browser is always closed.
 *
 * @param {{id: string, prompt: string, expectedOutput: string, expectedFiles: string[]}} task - Task to execute.
 * @returns {Promise<object>} Summary with id, ok, maxNewTokens, modelStatus, runMs, files,
 *   transcriptChars and the last three console warnings.
 * @throws {Error} When the page is not cross-origin isolated, a wait times out,
 *   or an expectation is not met.
 */
async function runTask(task) {
  const browser = await chromium.launch({
    executablePath,
    headless: true,
    args: ["--no-sandbox", "--disable-dev-shm-usage"],
  });
  // Declared outside the try block so the catch handler can report what was captured.
  const consoleLines = [];
  try {
    const page = await browser.newPage();
    page.setDefaultTimeout(PAGE_TIMEOUT_MS);
    page.on("console", (message) => {
      consoleLines.push(`${message.type()}: ${message.text()}`);
    });
    page.on("pageerror", (error) => {
      consoleLines.push(`pageerror: ${error.stack || error.message}`);
    });

    await page.goto(baseUrl, { waitUntil: "networkidle" });
    await waitForStatus(page, "Ready", PAGE_TIMEOUT_MS);

    const isolated = await page.evaluate(() => globalThis.crossOriginIsolated);
    if (!isolated) throw new Error("Page is not cross-origin isolated.");

    // Temperature 0 plus a fixed token cap are set before the model is loaded.
    await page.fill("#max-new-tokens", maxNewTokens);
    await page.fill("#temperature", "0");
    await page.selectOption("#gate-device", "wasm");
    await page.click("#confirm-load-model");
    await page.waitForFunction(() => window.__piWebAgent?.modelReady === true, null, {
      timeout: MODEL_LOAD_TIMEOUT_MS,
    });

    // runMs covers prompt entry through validation; model loading is excluded.
    const started = Date.now();
    await page.fill("#prompt", task.prompt);
    await page.click("#send");
    // Wait for the run to start first: the status is already "Ready" before
    // sending, so waiting for "Ready" alone would return immediately.
    await waitForStatus(page, "Agent running", AGENT_START_TIMEOUT_MS);
    await waitForStatus(page, "Ready", PAGE_TIMEOUT_MS);

    const transcript = await page.evaluate(() => window.__piWebAgent?.transcript || "");
    const files = await page.textContent("#files");
    const events = await page.textContent("#event-log");
    const modelStatus = await page.textContent("#model-status");

    assertTaskOutcome(task, { transcript, files, events });

    return {
      id: task.id,
      ok: true,
      maxNewTokens,
      modelStatus,
      runMs: Date.now() - started,
      files,
      transcriptChars: transcript.length,
      warnings: consoleLines.filter((line) => line.startsWith("warning:")).slice(-3),
    };
  } catch (error) {
    // Report captured browser output on stderr so stdout stays reserved for the JSON report.
    const tail = consoleLines.slice(-FAILURE_CONSOLE_TAIL);
    if (tail.length > 0) {
      console.error(`[${task.id}] last ${tail.length} browser console line(s):\n${tail.join("\n")}`);
    }
    throw error;
  } finally {
    await browser.close();
  }
}
// Run the tasks strictly one after another; the first failing task rejects the
// top-level await, so nothing is printed in that case.
const results = [];
for (let index = 0; index < tasks.length; index += 1) {
  const outcome = await runTask(tasks[index]);
  results.push(outcome);
}

// Emit the collected per-task summaries as pretty-printed JSON on stdout.
const report = JSON.stringify(results, null, 2);
console.log(report);