File size: 4,183 Bytes
53f8186
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import { chromium } from "@playwright/test";
import { existsSync } from "node:fs";

// Page under test: the first CLI argument when given, otherwise the local dev
// server URL with its model/device query parameters.
const [, , urlArgument] = process.argv;
const baseUrl = urlArgument ? urlArgument : "http://localhost:5173/?model=qwen25coder&device=wasm";

// Browser binary: CHROMIUM_PATH wins when set; otherwise use Google Chrome if it
// exists at its usual location, falling back to the snap Chromium path.
const executablePath = (() => {
  const overridePath = process.env.CHROMIUM_PATH;
  if (overridePath) {
    return overridePath;
  }
  const chromePath = "/usr/bin/google-chrome";
  return existsSync(chromePath) ? chromePath : "/snap/bin/chromium";
})();

// Kept as a string because it is typed verbatim into a form field.
const maxNewTokens = process.env.MAX_NEW_TOKENS ? process.env.MAX_NEW_TOKENS : "256";

// Agent scenarios to run. Each row is [id, prompt, expectedOutput, expectedFiles]:
//   id             - label reported in the results
//   prompt         - text typed into the agent's prompt box
//   expectedOutput - substring that must appear in the transcript (case-insensitive)
//   expectedFiles  - names that must appear in the file listing (case-insensitive)
const tasks = [
  [
    "simple-calc",
    "Create hello.js containing JavaScript that computes 21 * 2 and prints result: 42, then run node hello.js.",
    "42",
    ["hello.js"],
  ],
  [
    "npm-dependency",
    "Install npm package is-number@7.0.0, create check-package.mjs that imports it and prints dependency check: true, then run it with Node.",
    "dependency check: true",
    ["check-package.mjs", "node_modules"],
  ],
  [
    "multi-file-module",
    "Create src/math.mjs exporting multiply(a,b), create test.mjs importing it and printing multi result: 42 for multiply(6,7), then run node test.mjs.",
    "multi result: 42",
    ["test.mjs", "src"],
  ],
].map(([id, prompt, expectedOutput, expectedFiles]) => ({
  id,
  prompt,
  expectedOutput,
  expectedFiles,
}));

/**
 * Runs one agent task in a freshly launched headless browser and verifies it.
 *
 * Flow: open `baseUrl`, wait for the "Ready" status, require cross-origin
 * isolation, fill in the generation settings and load the model, submit the
 * task prompt, wait for the status to pass through "Agent running" back to
 * "Ready", then check the transcript, file listing and event log.
 *
 * @param {{id: string, prompt: string, expectedOutput: string, expectedFiles: string[]}} task
 *   Task description taken from the `tasks` table.
 * @returns {Promise<object>} Summary with `id`, `ok`, `maxNewTokens`,
 *   `modelStatus`, `runMs`, `files`, `transcriptChars` and `warnings`.
 * @throws {Error} When the page is not cross-origin isolated, the expected
 *   output or a file is missing, or no finished run_command event was logged.
 *   Playwright errors (including timeouts) propagate unchanged.
 */
async function runTask(task) {
  const launchOptions = {
    executablePath,
    headless: true,
    args: ["--no-sandbox", "--disable-dev-shm-usage"],
  };
  const browser = await chromium.launch(launchOptions);

  try {
    const page = await browser.newPage();
    page.setDefaultTimeout(1200000);

    // Capture console output and uncaught page errors as "<type>: <text>" lines.
    const capturedLines = [];
    page.on("console", (entry) => {
      capturedLines.push(`${entry.type()}: ${entry.text()}`);
    });
    page.on("pageerror", (failure) => {
      capturedLines.push(`pageerror: ${failure.stack || failure.message}`);
    });

    // Resolves once the #status element's text is exactly `label`.
    const statusBecomes = (label, waitOptions = {}) =>
      page.waitForFunction(
        (expected) => document.querySelector("#status")?.textContent === expected,
        label,
        waitOptions,
      );

    await page.goto(baseUrl, { waitUntil: "networkidle" });
    await statusBecomes("Ready");

    const crossOriginIsolated = await page.evaluate(() => globalThis.crossOriginIsolated);
    if (!crossOriginIsolated) {
      throw new Error("Page is not cross-origin isolated.");
    }

    // Configure generation settings, then load the model and wait until it reports ready.
    await page.fill("#max-new-tokens", maxNewTokens);
    await page.fill("#temperature", "0");
    await page.selectOption("#gate-device", "wasm");
    await page.click("#confirm-load-model");
    await page.waitForFunction(
      () => window.__piWebAgent?.modelReady === true,
      null,
      { timeout: 600000 },
    );

    // The clock starts just before the prompt is submitted.
    const started = Date.now();
    await page.fill("#prompt", task.prompt);
    await page.click("#send");
    await statusBecomes("Agent running", { timeout: 120000 });
    await statusBecomes("Ready", { timeout: 1200000 });

    // Gather everything the checks below inspect.
    const transcript = await page.evaluate(() => window.__piWebAgent?.transcript || "");
    const fileListing = await page.textContent("#files");
    const eventLog = await page.textContent("#event-log");
    const modelStatus = await page.textContent("#model-status");

    // All comparisons are case-insensitive; absent text counts as empty.
    const transcriptFolded = transcript.toLowerCase();
    const eventsFolded = (eventLog ?? "").toLowerCase();
    const filesFolded = (fileListing ?? "").toLowerCase();

    const outputSeen = transcriptFolded.includes(task.expectedOutput.toLowerCase());
    if (!outputSeen) {
      throw new Error(`Expected output ${task.expectedOutput}.\n\nTranscript:\n${transcript}\n\nEvents:\n${eventLog}`);
    }

    const missingFile = task.expectedFiles.find((name) => !filesFolded.includes(name.toLowerCase()));
    if (missingFile !== undefined) {
      throw new Error(`Expected ${missingFile} in file listing.\n\nFiles:\n${fileListing}\n\nTranscript:\n${transcript}`);
    }

    const commandFinished = eventsFolded.includes("tool: run_command finished");
    if (!commandFinished) {
      throw new Error(`Expected run_command tool execution.\n\nEvents:\n${eventLog}`);
    }

    // Only the three most recent console warnings are reported.
    const isWarning = (line) => line.startsWith("warning:");
    return {
      id: task.id,
      ok: true,
      maxNewTokens,
      modelStatus,
      runMs: Date.now() - started,
      files: fileListing,
      transcriptChars: transcript.length,
      warnings: capturedLines.filter(isWarning).slice(-3),
    };
  } finally {
    // Always shut the browser down, whether the task passed or threw.
    await browser.close();
  }
}

// Run the tasks one after another (each launches its own browser) and print a
// JSON report for all of them.
//
// A failing task is recorded as `{ id, ok: false, error }` rather than aborting
// the script: letting the exception escape would skip the report entirely and
// throw away the results of tasks that had already completed. The process still
// exits with a non-zero status when any task failed.
const results = [];
for (const task of tasks) {
  try {
    results.push(await runTask(task));
  } catch (error) {
    results.push({
      id: task.id,
      ok: false,
      error: error instanceof Error ? error.message : String(error),
    });
    // Preserve the failing exit status that the uncaught exception used to give.
    process.exitCode = 1;
  }
}

console.log(JSON.stringify(results, null, 2));