| // References for model evaluation metrics: | |
| // - Chatbot Arena: https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH | |
| // - Evalica: https://github.com/dustalov/evalica/blob/master/Chatbot-Arena.ipynb | |
| import "dotenv/config"; | |
| import { mkdirSync, rmSync } from "node:fs"; | |
| import { spawn, execFile, execFileSync } from "node:child_process"; | |
| import { promisify } from "node:util"; | |
| import { tmpdir } from "node:os"; | |
| import { join } from "node:path"; | |
| import { randomUUID } from "node:crypto"; | |
| import { URL } from "node:url"; | |
| import express from "express"; | |
| import cookieSession from "cookie-session"; | |
| import OpenAI from "openai"; | |
| import { Octokit } from "@octokit/rest"; | |
| import { Gitlab } from "@gitbeaker/rest"; | |
| import { uploadFile, listFiles, downloadFile } from "@huggingface/hub"; | |
| import whichSync from "which"; | |
| const execFileAsync = promisify(execFile); | |
| // --------------------------------------------------------------------------- | |
| // Environment & constants | |
| // --------------------------------------------------------------------------- | |
| const openaiClient = new OpenAI({ | |
| apiKey: process.env.OPENROUTER_API_KEY, | |
| baseURL: "https://openrouter.ai/api/v1", | |
| }); | |
| const CLI_DATA_REPO = "SWE-Arena/cli_data"; | |
| const LEADERBOARD_REPO = "SWE-Arena/leaderboard_data"; | |
| const VOTE_REPO = "SWE-Arena/vote_data"; | |
| const CONVERSATION_REPO = "SWE-Arena/conversation_data"; | |
| const LEADERBOARD_FILE = "agent_arena"; | |
| const AGENT_TIMEOUT = 600_000; // 10 minutes per agent (ms) | |
| const AGENT_TIMEOUT_LABEL = `${AGENT_TIMEOUT / 60_000}min`; | |
| const LEADERBOARD_UPDATE_TIME_FRAME_DAYS = 365; | |
| let leaderboardCache = null; // in-memory cache, populated at startup | |
| const SHOW_HINT_STRING = true; | |
| const HINT_STRING = "Once signed in, your votes will be recorded securely."; | |
| const SYSTEM_PREFIX = | |
| "You are an expert software engineer. " + | |
| "The user will give you a task — follow their instructions precisely and completely. " + | |
| "Do exactly what is asked: no more, no less. " + | |
| "If the task involves writing or modifying code, produce clean, correct, and working code. " + | |
| "If the task involves debugging, identify and fix the root cause. " + | |
| "If the task involves explaining, be clear and concise. " + | |
| "WORKSPACE: Your current working directory is a fresh, isolated sandbox created exclusively for this task. " + | |
| "It starts empty (or contains the cloned repository if a URL was provided). " + | |
| "You are free to create any files, subdirectories, or build artifacts you need within it. " + | |
| "For temporary files, prefer a subdirectory here (e.g., './tmp/') rather than system temp directories. " + | |
| "CRITICAL CONSTRAINT: You MUST operate entirely within the current working directory. " + | |
| "ALL file operations (read, write, create, modify, execute) must be within this directory. " + | |
| "Do NOT access any files or directories outside your current working directory. " + | |
| "Use relative paths only (e.g., './file.py', 'subdir/file.txt'), never absolute paths like '/tmp/', '/home/', etc. " + | |
| "If you attempt to access files outside the working directory, the operation will fail."; | |
| const MAX_AGENT_RETRIES = 3; // max retries per agent before moving to the next one | |
| // --------------------------------------------------------------------------- | |
| // Agent definitions — loaded from HF dataset SWE-Arena/cli_data at startup. | |
| // Each {id}.json declares CLI binary and two "styles" that drive generic | |
| // buildAgentCommand() / runFollowup(). | |
| // | |
| // promptStyle: | |
| // "flag" → [bin, "-p", <prompt>, ...initArgs] | |
| // "exec" → [bin, "exec", ...initArgs, <prompt>] | |
| // "none" → [bin, ...initArgs, <prompt>] | |
| // | |
| // followupStyle: | |
| // "continue" → [bin, "-p", <followup>, ...followupArgs] (e.g. --continue) | |
| // "resume" → [bin, "-p", <followup>, "--resume", <session-id>, ...followupArgs] (flag-style, e.g. Claude Code) | |
| // → [bin, "exec", "--resume", <session-id>, "-p", <followup>, ...followupArgs] (exec-style, e.g. Codex) | |
| // → falls back to [bin, "exec", ...followupArgs, "resume", "--last", <followup>] if no session-id | |
| // "replay" → rebuild full conversation, then use promptStyle | |
| // "none" → [bin, ...followupArgs, <followup>] | |
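| //
| // Illustrative sketch of one such {id}.json (hypothetical values, not a real
| // entry in SWE-Arena/cli_data) using only the fields this file reads
| // (bin, state, promptStyle, followupStyle, initArgs, followupArgs); the
| // agent name comes from the filename itself:
| //   {
| //     "bin": "mycli",
| //     "state": "active",
| //     "promptStyle": "flag",
| //     "followupStyle": "resume",
| //     "initArgs": [],      // extra CLI flags appended per promptStyle
| //     "followupArgs": []   // extra flags for follow-up invocations
| //   }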
| // --------------------------------------------------------------------------- | |
| let agents = []; | |
| let agentById = {}; | |
| let agentByName = {}; | |
| async function loadAgentsFromHf() { | |
| const token = process.env.HF_TOKEN; | |
| const credentials = token ? { accessToken: token } : undefined; | |
| const repo = { type: "dataset", name: CLI_DATA_REPO }; | |
| const loaded = []; | |
| for await (const file of listFiles({ repo, credentials })) { | |
| if (!file.path.endsWith(".json")) continue; | |
|     // Skip nested paths; agent definitions live at the repo root as {name}.json files
| if (file.path.includes("/")) continue; | |
| const resp = await downloadFile({ repo, path: file.path, credentials }); | |
| if (!resp) continue; | |
| const data = JSON.parse(await resp.text()); | |
| const name = file.path.replace(/\.json$/, ""); | |
| loaded.push({ id: data.bin, name, ...data }); | |
| } | |
| agents = loaded; | |
| agentById = Object.fromEntries(agents.map((a) => [a.id, a])); | |
| agentByName = Object.fromEntries(agents.map((a) => [a.name, a])); | |
| console.log(`Loaded ${agents.length} agent(s) from ${CLI_DATA_REPO}: ${agents.map((a) => a.name).join(", ")}`); | |
| } | |
| // --------------------------------------------------------------------------- | |
| // CLI availability | |
| // --------------------------------------------------------------------------- | |
| function availableAgents() { | |
| return agents.filter((a) => { | |
| if (a.state !== "active") return false; | |
| try { | |
| whichSync.sync(a.bin); | |
| return true; | |
| } catch { | |
| return false; | |
| } | |
| }); | |
| } | |
| // --------------------------------------------------------------------------- | |
| // URL parsing helpers | |
| // --------------------------------------------------------------------------- | |
| function parseUrlPath(url) { | |
| try { | |
| const parsed = new URL(url); | |
| const segments = parsed.pathname.split("/").filter(Boolean); | |
| return { hostname: parsed.hostname || "", segments }; | |
| } catch { | |
| return { hostname: null, segments: [] }; | |
| } | |
| } | |
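| // For reference: parseUrlPath("https://github.com/octocat/hello/issues/42")
| //   => { hostname: "github.com", segments: ["octocat", "hello", "issues", "42"] }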
| // --------------------------------------------------------------------------- | |
| // GitHub | |
| // --------------------------------------------------------------------------- | |
| const octokit = process.env.GITHUB_TOKEN | |
| ? new Octokit({ auth: process.env.GITHUB_TOKEN }) | |
| : new Octokit(); | |
| function classifyGithubUrl(segments) { | |
| if (segments.length < 2) return null; | |
| let repo = segments[1]; | |
| if (repo.endsWith(".git")) repo = repo.slice(0, -4); | |
| const base = { owner: segments[0], repo }; | |
| if (segments.length === 2) return { ...base, resource: null }; | |
| const res = segments[2]; | |
| if (res === "issues" && segments.length >= 4) | |
| return { ...base, resource: "issues", id: segments[3] }; | |
| if (res === "pull" && segments.length >= 4) | |
| return { ...base, resource: "pull", id: segments[3] }; | |
| if (res === "commit" && segments.length >= 4) | |
| return { ...base, resource: "commit", sha: segments[3] }; | |
| if (res === "blob" && segments.length >= 4) | |
| return { | |
| ...base, | |
| resource: "blob", | |
| branch: segments[3], | |
| path: segments.slice(4).join("/"), | |
| }; | |
| if (res === "tree" && segments.length >= 4) | |
| return { | |
| ...base, | |
| resource: "tree", | |
| branch: segments[3], | |
| path: segments.slice(4).join("/"), | |
| }; | |
| if (res === "discussions" && segments.length >= 4) | |
| return { ...base, resource: "discussions", id: segments[3] }; | |
| if (res === "releases" && segments.length >= 5 && segments[3] === "tag") | |
| return { ...base, resource: "releases", tag: segments[4] }; | |
| if (res === "compare" && segments.length >= 4) | |
| return { ...base, resource: "compare", spec: segments[3] }; | |
| if (res === "actions" && segments.length >= 5 && segments[3] === "runs") | |
| return { ...base, resource: "actions", run_id: segments[4] }; | |
| if (res === "wiki") | |
| return { | |
| ...base, | |
| resource: "wiki", | |
| page: segments.length >= 4 ? segments[3] : null, | |
| }; | |
| return { ...base, resource: "unknown" }; | |
| } | |
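| // For reference: classifyGithubUrl(["octocat", "hello", "pull", "7"])
| //   => { owner: "octocat", repo: "hello", resource: "pull", id: "7" }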
| async function fmtGithubRepo(owner, repo) { | |
| const { data } = await octokit.repos.get({ owner, repo }); | |
| const parts = [`Repository: ${data.full_name}`]; | |
| if (data.description) parts.push(`Description: ${data.description}`); | |
| try { | |
| const readme = await octokit.repos.getReadme({ owner, repo }); | |
| const content = Buffer.from(readme.data.content, "base64").toString( | |
| "utf-8" | |
| ); | |
| parts.push(`README (first 2000 chars):\n${content.slice(0, 2000)}`); | |
| } catch {} | |
| return parts.join("\n\n"); | |
| } | |
| async function fmtGithubIssue(owner, repo, issueId) { | |
| const { data: issue } = await octokit.issues.get({ | |
| owner, | |
| repo, | |
| issue_number: Number(issueId), | |
| }); | |
| const parts = [ | |
| `Issue #${issue.number}: ${issue.title}`, | |
| `State: ${issue.state}`, | |
| `Body:\n${issue.body || "(empty)"}`, | |
| ]; | |
| const { data: comments } = await octokit.issues.listComments({ | |
| owner, | |
| repo, | |
| issue_number: Number(issueId), | |
| per_page: 10, | |
| }); | |
| if (comments.length) { | |
| const texts = comments.map( | |
| (c) => ` Comment by ${c.user.login}:\n ${c.body}` | |
| ); | |
| parts.push("Comments (first 10):\n" + texts.join("\n---\n")); | |
| } | |
| return parts.join("\n\n"); | |
| } | |
| async function fmtGithubPr(owner, repo, prId) { | |
| const { data: pr } = await octokit.pulls.get({ | |
| owner, | |
| repo, | |
| pull_number: Number(prId), | |
| }); | |
| const parts = [ | |
| `Pull Request #${pr.number}: ${pr.title}`, | |
| `State: ${pr.state} Merged: ${pr.merged}`, | |
| `Body:\n${pr.body || "(empty)"}`, | |
| ]; | |
| const { data: files } = await octokit.pulls.listFiles({ | |
| owner, | |
| repo, | |
| pull_number: Number(prId), | |
| }); | |
| const diffParts = files.map((f) => { | |
| const header = `--- ${f.filename} (${f.status}, +${f.additions}/-${f.deletions})`; | |
| const patch = f.patch || "(binary or too large)"; | |
| return `${header}\n${patch}`; | |
| }); | |
| if (diffParts.length) { | |
| let diffText = diffParts.join("\n\n"); | |
| if (diffText.length > 5000) | |
| diffText = diffText.slice(0, 5000) + "\n... (diff truncated)"; | |
| parts.push(`Diff:\n${diffText}`); | |
| } | |
| return parts.join("\n\n"); | |
| } | |
| async function fmtGithubCommit(owner, repo, sha) { | |
| const { data: commit } = await octokit.repos.getCommit({ owner, repo, ref: sha }); | |
| const parts = [ | |
| `Commit: ${commit.sha}`, | |
| `Message: ${commit.commit.message}`, | |
| `Author: ${commit.commit.author.name}`, | |
| `Stats: +${commit.stats.additions}/-${commit.stats.deletions}`, | |
| ]; | |
| const fileParts = (commit.files || []).map( | |
| (f) => ` ${f.filename} (${f.status}): ${f.patch || "(binary)"}` | |
| ); | |
| if (fileParts.length) { | |
| let patchText = fileParts.join("\n"); | |
| if (patchText.length > 5000) | |
| patchText = patchText.slice(0, 5000) + "\n... (patch truncated)"; | |
| parts.push(`Files changed:\n${patchText}`); | |
| } | |
| return parts.join("\n\n"); | |
| } | |
| async function fmtGithubBlob(owner, repo, branch, path) { | |
| const { data } = await octokit.repos.getContent({ | |
| owner, | |
| repo, | |
| path, | |
| ref: branch, | |
| }); | |
| if (Array.isArray(data)) { | |
| const listing = data.map((c) => ` ${c.path} (${c.type})`).join("\n"); | |
| return `Directory listing at ${branch}/${path}:\n${listing}`; | |
| } | |
| let content = Buffer.from(data.content, "base64").toString("utf-8"); | |
| if (content.length > 5000) | |
| content = content.slice(0, 5000) + "\n... (content truncated)"; | |
| return `File: ${path} (branch: ${branch})\n\n${content}`; | |
| } | |
| async function fmtGithubTree(owner, repo, branch, path) { | |
| const { data } = await octokit.repos.getContent({ | |
| owner, | |
| repo, | |
| path: path || "", | |
| ref: branch, | |
| }); | |
| const items = Array.isArray(data) ? data : [data]; | |
| const listing = items | |
| .map((c) => ` ${c.path} (${c.type}, ${c.size} bytes)`) | |
| .join("\n"); | |
| return `Tree at ${branch}/${path || "(root)"}:\n${listing}`; | |
| } | |
| async function fmtGithubRelease(owner, repo, tag) { | |
| const { data: release } = await octokit.repos.getReleaseByTag({ | |
| owner, | |
| repo, | |
| tag, | |
| }); | |
| return [ | |
| `Release: ${release.name || release.tag_name}`, | |
| `Tag: ${release.tag_name}`, | |
| `Body:\n${release.body || "(empty)"}`, | |
| ].join("\n\n"); | |
| } | |
| async function fmtGithubCompare(owner, repo, spec) { | |
| let base, head; | |
| if (spec.includes("...")) [base, head] = spec.split("...", 2); | |
| else if (spec.includes("..")) [base, head] = spec.split("..", 2); | |
| else return null; | |
| const { data } = await octokit.repos.compareCommits({ | |
| owner, | |
| repo, | |
| base, | |
| head, | |
| }); | |
| const parts = [ | |
| `Comparison: ${base}...${head}`, | |
| `Status: ${data.status}`, | |
| `Ahead by: ${data.ahead_by}, Behind by: ${data.behind_by}`, | |
| `Total commits: ${data.total_commits}`, | |
| ]; | |
| const commitSummaries = (data.commits || []) | |
| .slice(0, 20) | |
| .map((c) => ` ${c.sha.slice(0, 8)}: ${c.commit.message.split("\n")[0]}`); | |
| if (commitSummaries.length) | |
| parts.push("Commits:\n" + commitSummaries.join("\n")); | |
| const fileSummaries = (data.files || []) | |
| .slice(0, 30) | |
| .map( | |
| (f) => | |
| ` ${f.filename} (${f.status}, +${f.additions}/-${f.deletions})` | |
| ); | |
| if (fileSummaries.length) | |
| parts.push("Files changed:\n" + fileSummaries.join("\n")); | |
| return parts.join("\n\n"); | |
| } | |
| async function fmtGithubActions(owner, repo, runId) { | |
| const { data: run } = await octokit.actions.getWorkflowRun({ | |
| owner, | |
| repo, | |
| run_id: Number(runId), | |
| }); | |
| const parts = [ | |
| `Workflow Run: ${run.name} #${run.run_number}`, | |
| `Status: ${run.status} Conclusion: ${run.conclusion}`, | |
| `SHA: ${run.head_sha}`, | |
| ]; | |
| try { | |
| const { data: jobsData } = await octokit.actions.listJobsForWorkflowRun({ | |
| owner, | |
| repo, | |
| run_id: Number(runId), | |
| }); | |
| for (const job of jobsData.jobs) { | |
| if (job.conclusion === "failure") { | |
| parts.push(`Failed job: ${job.name}`); | |
| for (const step of job.steps || []) { | |
| if (step.conclusion === "failure") | |
| parts.push(` Failed step: ${step.name}`); | |
| } | |
| } | |
| } | |
| } catch {} | |
| return parts.join("\n\n"); | |
| } | |
| function fmtGithubWiki(owner, repo, page) { | |
| if (page) | |
| return `Wiki page: ${page} (from ${owner}/${repo}/wiki)\nNote: Wiki content cannot be fetched via API.`; | |
| return `Wiki: ${owner}/${repo}/wiki\nNote: Wiki content cannot be fetched via API.`; | |
| } | |
| async function fetchGithubContent(url) { | |
| if (!process.env.GITHUB_TOKEN) { | |
| console.log("GITHUB_TOKEN not set."); | |
| return null; | |
| } | |
| const { hostname, segments } = parseUrlPath(url); | |
| if (!hostname || !hostname.includes("github.com")) return null; | |
| const info = classifyGithubUrl(segments); | |
| if (!info) return null; | |
| try { | |
| const { owner, repo, resource } = info; | |
| if (resource === null) return await fmtGithubRepo(owner, repo); | |
| if (resource === "issues") return await fmtGithubIssue(owner, repo, info.id); | |
| if (resource === "pull") return await fmtGithubPr(owner, repo, info.id); | |
| if (resource === "commit") return await fmtGithubCommit(owner, repo, info.sha); | |
| if (resource === "blob") | |
| return await fmtGithubBlob(owner, repo, info.branch, info.path); | |
| if (resource === "tree") | |
| return await fmtGithubTree(owner, repo, info.branch, info.path); | |
| if (resource === "releases") | |
| return await fmtGithubRelease(owner, repo, info.tag); | |
| if (resource === "compare") | |
| return await fmtGithubCompare(owner, repo, info.spec); | |
| if (resource === "actions") | |
| return await fmtGithubActions(owner, repo, info.run_id); | |
| if (resource === "wiki") return fmtGithubWiki(owner, repo, info.page); | |
| return null; | |
| } catch (err) { | |
| console.error(`GitHub API error: ${err.message}`); | |
| return null; | |
| } | |
| } | |
| // --------------------------------------------------------------------------- | |
| // GitLab | |
| // --------------------------------------------------------------------------- | |
| const gitlab = process.env.GITLAB_TOKEN | |
| ? new Gitlab({ token: process.env.GITLAB_TOKEN }) | |
| : null; | |
| function classifyGitlabUrl(segments) { | |
| let dashIdx = segments.indexOf("-"); | |
| if (dashIdx === -1) { | |
| if (segments.length >= 2) | |
| return { projectPath: segments.join("/"), resource: null }; | |
| return null; | |
| } | |
| const projectPath = segments.slice(0, dashIdx).join("/"); | |
| const resSegments = segments.slice(dashIdx + 1); | |
| if (!projectPath || !resSegments.length) | |
| return { projectPath, resource: null }; | |
| const res = resSegments[0]; | |
| if (res === "issues" && resSegments.length >= 2) | |
| return { projectPath, resource: "issues", id: resSegments[1] }; | |
| if (res === "merge_requests" && resSegments.length >= 2) | |
| return { projectPath, resource: "merge_requests", id: resSegments[1] }; | |
| if ((res === "commit" || res === "commits") && resSegments.length >= 2) | |
| return { projectPath, resource: "commit", sha: resSegments[1] }; | |
| if (res === "blob" && resSegments.length >= 2) | |
| return { | |
| projectPath, | |
| resource: "blob", | |
| branch: resSegments[1], | |
| path: resSegments.slice(2).join("/"), | |
| }; | |
| if (res === "tree" && resSegments.length >= 2) | |
| return { | |
| projectPath, | |
| resource: "tree", | |
| branch: resSegments[1], | |
| path: resSegments.slice(2).join("/"), | |
| }; | |
| if (res === "releases" && resSegments.length >= 2) | |
| return { projectPath, resource: "releases", tag: resSegments[1] }; | |
| if (res === "compare" && resSegments.length >= 2) | |
| return { projectPath, resource: "compare", spec: resSegments[1] }; | |
| if (res === "pipelines" && resSegments.length >= 2) | |
| return { projectPath, resource: "pipelines", id: resSegments[1] }; | |
| if (res === "wikis") | |
| return { | |
| projectPath, | |
| resource: "wikis", | |
| page: resSegments.length >= 2 ? resSegments[1] : null, | |
| }; | |
| return { projectPath, resource: "unknown" }; | |
| } | |
| async function fetchGitlabContent(url) { | |
| if (!gitlab) { | |
| console.log("GITLAB_TOKEN not set."); | |
| return null; | |
| } | |
| const { hostname, segments } = parseUrlPath(url); | |
| if (!hostname || !hostname.includes("gitlab.com")) return null; | |
| const info = classifyGitlabUrl(segments); | |
| if (!info) return null; | |
| try { | |
| const project = await gitlab.Projects.show(info.projectPath); | |
| const { resource } = info; | |
| if (resource === null) { | |
| const parts = [`Repository: ${project.path_with_namespace}`]; | |
| if (project.description) | |
| parts.push(`Description: ${project.description}`); | |
| try { | |
| const readme = await gitlab.RepositoryFiles.show( | |
| project.id, | |
| "README.md", | |
| project.default_branch | |
| ); | |
| const content = Buffer.from(readme.content, "base64").toString("utf-8"); | |
| parts.push(`README (first 2000 chars):\n${content.slice(0, 2000)}`); | |
| } catch {} | |
| return parts.join("\n\n"); | |
| } | |
| if (resource === "issues") { | |
| const issue = await gitlab.Issues.show(project.id, Number(info.id)); | |
| const parts = [ | |
| `Issue #${issue.iid}: ${issue.title}`, | |
| `State: ${issue.state}`, | |
| `Body:\n${issue.description || "(empty)"}`, | |
| ]; | |
| const notes = await gitlab.IssueNotes.all(project.id, Number(info.id), { | |
| perPage: 10, | |
| }); | |
| const noteTexts = notes.map( | |
| (n) => ` Comment by ${n.author.username}: ${n.body}` | |
| ); | |
| if (noteTexts.length) | |
| parts.push("Comments (first 10):\n" + noteTexts.join("\n---\n")); | |
| return parts.join("\n\n"); | |
| } | |
| if (resource === "merge_requests") { | |
| const mr = await gitlab.MergeRequests.show(project.id, Number(info.id)); | |
| const parts = [ | |
| `Merge Request !${mr.iid}: ${mr.title}`, | |
| `State: ${mr.state}`, | |
| `Body:\n${mr.description || "(empty)"}`, | |
| ]; | |
| try { | |
| const changes = await gitlab.MergeRequests.allDiffs( | |
| project.id, | |
| Number(info.id) | |
| ); | |
| const diffParts = changes | |
| .slice(0, 30) | |
| .map( | |
| (c) => | |
| ` ${c.new_path || "?"}: ${(c.diff || "").slice(0, 500)}` | |
| ); | |
| if (diffParts.length) { | |
| let diffText = diffParts.join("\n"); | |
| if (diffText.length > 5000) | |
| diffText = diffText.slice(0, 5000) + "\n... (diff truncated)"; | |
| parts.push(`Changes:\n${diffText}`); | |
| } | |
| } catch {} | |
| return parts.join("\n\n"); | |
| } | |
| if (resource === "commit") { | |
| const commit = await gitlab.Commits.show(project.id, info.sha); | |
| const parts = [ | |
| `Commit: ${commit.id}`, | |
| `Title: ${commit.title}`, | |
| `Message: ${commit.message}`, | |
| `Author: ${commit.author_name}`, | |
| ]; | |
| try { | |
| const diffs = await gitlab.Commits.showDiff(project.id, info.sha); | |
| const diffParts = diffs | |
| .slice(0, 30) | |
| .map( | |
| (d) => | |
| ` ${d.new_path || "?"}: ${(d.diff || "").slice(0, 500)}` | |
| ); | |
| if (diffParts.length) { | |
| let diffText = diffParts.join("\n"); | |
| if (diffText.length > 5000) | |
| diffText = diffText.slice(0, 5000) + "\n... (diff truncated)"; | |
| parts.push(`Diff:\n${diffText}`); | |
| } | |
| } catch {} | |
| return parts.join("\n\n"); | |
| } | |
| if (resource === "blob") { | |
| const file = await gitlab.RepositoryFiles.show( | |
| project.id, | |
| info.path, | |
| info.branch | |
| ); | |
| let content = Buffer.from(file.content, "base64").toString("utf-8"); | |
| if (content.length > 5000) | |
| content = content.slice(0, 5000) + "\n... (content truncated)"; | |
| return `File: ${info.path} (branch: ${info.branch})\n\n${content}`; | |
| } | |
| if (resource === "tree") { | |
| const items = await gitlab.Repositories.allRepositoryTrees(project.id, { | |
| path: info.path || "", | |
| ref: info.branch, | |
| perPage: 100, | |
| }); | |
| const listing = items | |
| .map((item) => ` ${item.path} (${item.type})`) | |
| .join("\n"); | |
| return `Tree at ${info.branch}/${info.path || "(root)"}:\n${listing}`; | |
| } | |
| if (resource === "releases") { | |
| const release = await gitlab.ProjectReleases.show( | |
| project.id, | |
| info.tag | |
| ); | |
| return [ | |
| `Release: ${release.name || release.tag_name}`, | |
| `Tag: ${release.tag_name}`, | |
| `Description:\n${release.description || "(empty)"}`, | |
| ].join("\n\n"); | |
| } | |
| if (resource === "compare") { | |
| let base, head; | |
| if (info.spec.includes("...")) [base, head] = info.spec.split("...", 2); | |
| else if (info.spec.includes("..")) | |
| [base, head] = info.spec.split("..", 2); | |
| else return null; | |
| const result = await gitlab.Repositories.compare(project.id, base, head); | |
| const parts = [`Comparison: ${base}...${head}`]; | |
| const commits = (result.commits || []) | |
| .slice(0, 20) | |
| .map((c) => ` ${c.short_id || "?"}: ${c.title || ""}`); | |
| if (commits.length) parts.push("Commits:\n" + commits.join("\n")); | |
| const diffs = (result.diffs || []) | |
| .slice(0, 30) | |
| .map( | |
| (d) => | |
| ` ${d.new_path || "?"}: ${(d.diff || "").slice(0, 500)}` | |
| ); | |
| if (diffs.length) { | |
| let diffText = diffs.join("\n"); | |
| if (diffText.length > 5000) | |
| diffText = diffText.slice(0, 5000) + "\n... (diff truncated)"; | |
| parts.push(`Diffs:\n${diffText}`); | |
| } | |
| return parts.join("\n\n"); | |
| } | |
| if (resource === "pipelines") { | |
| const pipeline = await gitlab.Pipelines.show( | |
| project.id, | |
| Number(info.id) | |
| ); | |
| const parts = [ | |
| `Pipeline #${pipeline.id}`, | |
| `Status: ${pipeline.status}`, | |
| `Ref: ${pipeline.ref}`, | |
| `SHA: ${pipeline.sha}`, | |
| ]; | |
| try { | |
| const jobs = await gitlab.PipelineJobs.all(project.id, pipeline.id, { | |
| perPage: 20, | |
| }); | |
| const failed = jobs.filter((j) => j.status === "failed"); | |
| if (failed.length) { | |
| parts.push("Failed jobs:"); | |
| for (const j of failed) | |
| parts.push(` ${j.name}: ${j.status} (stage: ${j.stage})`); | |
| } | |
| } catch {} | |
| return parts.join("\n\n"); | |
| } | |
| if (resource === "wikis") { | |
| if (info.page) { | |
| try { | |
| const page = await gitlab.Wikis.show(project.id, info.page); | |
| return `Wiki page: ${page.title}\n\n${page.content}`; | |
| } catch { | |
| return `Wiki page: ${info.page}\nNote: Could not fetch wiki page content.`; | |
| } | |
| } | |
| try { | |
| const pages = await gitlab.Wikis.all(project.id, { perPage: 20 }); | |
| const listing = pages.map((p) => ` ${p.slug}: ${p.title}`).join("\n"); | |
| return `Wiki pages:\n${listing}`; | |
| } catch { | |
| return "Wiki: Could not fetch wiki pages."; | |
| } | |
| } | |
| return null; | |
| } catch (err) { | |
| console.error(`GitLab API error: ${err.message}`); | |
| return null; | |
| } | |
| } | |
| // --------------------------------------------------------------------------- | |
| // HuggingFace | |
| // --------------------------------------------------------------------------- | |
| function classifyHuggingfaceUrl(segments) { | |
| if (!segments.length) return null; | |
| const segs = [...segments]; | |
| let repoType = null; | |
| if (segs[0] === "datasets" || segs[0] === "spaces") { | |
| repoType = segs[0] === "datasets" ? "dataset" : "space"; | |
| segs.splice(0, 1); | |
| } | |
| if (segs.length < 2) return null; | |
| const repoId = `${segs[0]}/${segs[1]}`; | |
| const base = { repoId, repoType }; | |
| if (segs.length === 2) return { ...base, resource: null }; | |
| const res = segs[2]; | |
| if (res === "blob" && segs.length >= 4) | |
| return { | |
| ...base, | |
| resource: "blob", | |
| revision: segs[3], | |
| path: segs.slice(4).join("/"), | |
| }; | |
| if (res === "resolve" && segs.length >= 4) | |
| return { | |
| ...base, | |
| resource: "resolve", | |
| revision: segs[3], | |
| path: segs.slice(4).join("/"), | |
| }; | |
| if (res === "tree" && segs.length >= 4) | |
| return { | |
| ...base, | |
| resource: "tree", | |
| revision: segs[3], | |
| path: segs.slice(4).join("/"), | |
| }; | |
| if (res === "commit" && segs.length >= 4) | |
| return { ...base, resource: "commit", sha: segs[3] }; | |
| if (res === "discussions" && segs.length >= 4) | |
| return { ...base, resource: "discussions", num: segs[3] }; | |
| return { ...base, resource: "unknown" }; | |
| } | |
| async function fetchHuggingfaceContent(url) { | |
| const token = process.env.HF_TOKEN; | |
| if (!token) { | |
| console.log("HF_TOKEN not set."); | |
| return null; | |
| } | |
| const { hostname, segments } = parseUrlPath(url); | |
| if (!hostname || !hostname.includes("huggingface.co")) return null; | |
| const info = classifyHuggingfaceUrl(segments); | |
| if (!info) return null; | |
| try { | |
| const credentials = { accessToken: token }; | |
| const repo = { type: info.repoType || "model", name: info.repoId }; | |
| if (info.resource === null) { | |
| const parts = [`Repository: ${info.repoId}`]; | |
| try { | |
| const resp = await downloadFile({ repo, path: "README.md", credentials }); | |
| if (resp) { | |
| const content = await resp.text(); | |
| parts.push( | |
| `README (first 2000 chars):\n${content.slice(0, 2000)}` | |
| ); | |
| } | |
| } catch {} | |
| return parts.join("\n\n"); | |
| } | |
| if (info.resource === "blob" || info.resource === "resolve") { | |
| try { | |
| const resp = await downloadFile({ | |
| repo, | |
| path: info.path, | |
| revision: info.revision, | |
| credentials, | |
| }); | |
| if (resp) { | |
| let content = await resp.text(); | |
| if (content.length > 5000) | |
| content = content.slice(0, 5000) + "\n... (content truncated)"; | |
| return `File: ${info.path} (revision: ${info.revision})\n\n${content}`; | |
| } | |
| } catch { | |
| return `File: ${info.path} (revision: ${info.revision})\n(binary or unreadable file)`; | |
| } | |
| } | |
| if (info.resource === "tree") { | |
| const items = []; | |
| for await (const entry of listFiles({ | |
| repo, | |
| path: info.path || undefined, | |
| revision: info.revision, | |
| credentials, | |
| })) { | |
| items.push(` ${entry.path} (${entry.type})`); | |
| if (items.length >= 100) { | |
| items.push(" ... (truncated)"); | |
| break; | |
| } | |
| } | |
| return `Tree at ${info.revision}/${info.path || "(root)"}:\n${items.join("\n")}`; | |
| } | |
| return null; | |
| } catch (err) { | |
| console.error(`Hugging Face API error: ${err.message}`); | |
| return null; | |
| } | |
| } | |
| // --------------------------------------------------------------------------- | |
| // URL router | |
| // --------------------------------------------------------------------------- | |
| async function fetchUrlContent(url) { | |
| if (!url || !url.trim()) return ""; | |
| url = url.trim(); | |
| try { | |
| const { hostname } = parseUrlPath(url); | |
| if (hostname && hostname.includes("github.com")) | |
| return await fetchGithubContent(url); | |
| if (hostname && hostname.includes("gitlab.com")) | |
| return await fetchGitlabContent(url); | |
| if (hostname && hostname.includes("huggingface.co")) | |
| return await fetchHuggingfaceContent(url); | |
| } catch (err) { | |
| console.error(`Error fetching URL content: ${err.message}`); | |
| } | |
| return ""; | |
| } | |
| // --------------------------------------------------------------------------- | |
| // Agent execution via CLI | |
| // --------------------------------------------------------------------------- | |
| function buildAgentCommand(agent, prompt) { | |
| switch (agent.promptStyle) { | |
| case "flag": | |
| return [agent.bin, ["-p", prompt, ...agent.initArgs]]; | |
| case "exec": | |
| return [agent.bin, ["exec", ...agent.initArgs, prompt]]; | |
| case "none": | |
| return [agent.bin, [...agent.initArgs, prompt]]; | |
| default: | |
| throw new Error(`Unknown promptStyle "${agent.promptStyle}" for ${agent.id}`); | |
| } | |
| } | |
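| // For reference, with a hypothetical agent { bin: "mycli", promptStyle: "flag",
| // initArgs: [] }, buildAgentCommand(agent, "fix the failing test") returns
| //   ["mycli", ["-p", "fix the failing test"]]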
| // Extract human-readable text from agent output (some CLIs return JSON/JSONL) | |
| function parseAgentOutput(raw) { | |
| if (!raw || typeof raw !== "string") return raw || ""; | |
| const trimmed = raw.trim(); | |
| // Try JSONL first (one JSON object per line — e.g. Grok CLI chat format, Claude Code JSON format) | |
| const lines = trimmed.split("\n").filter((l) => l.trim()); | |
| const hasJsonLines = lines.length > 0 && lines.every((l) => { | |
| const t = l.trim(); | |
| return t.startsWith("{") || t.startsWith("["); | |
| }); | |
| if (hasJsonLines && lines.length > 1) { | |
| // Claude Code JSON format: find the last type="result" line — it has the final text | |
| for (let i = lines.length - 1; i >= 0; i--) { | |
| try { | |
| const obj = JSON.parse(lines[i].trim()); | |
| if (obj.type === "result" && typeof obj.result === "string") { | |
| return obj.result; | |
| } | |
| } catch { /* skip */ } | |
| } | |
| // Claude Code format: type="assistant" with message.content array | |
| const claudeMsgs = []; | |
| for (const line of lines) { | |
| try { | |
| const obj = JSON.parse(line.trim()); | |
| if (obj.type === "assistant" && obj.message?.content) { | |
| const content = obj.message.content; | |
| if (Array.isArray(content)) { | |
| const texts = content.filter((c) => c.type === "text").map((c) => c.text); | |
| if (texts.length) claudeMsgs.push(texts.join("")); | |
| } else if (typeof content === "string") { | |
| claudeMsgs.push(content); | |
| } | |
| } | |
| } catch { /* skip */ } | |
| } | |
| if (claudeMsgs.length) return claudeMsgs.join("\n\n"); | |
| // Generic: role="assistant" | |
| const assistantMsgs = []; | |
| for (const line of lines) { | |
| try { | |
| const obj = JSON.parse(line.trim()); | |
| if (obj.role === "assistant" && obj.content) { | |
| assistantMsgs.push(obj.content); | |
| } | |
| } catch { /* skip unparseable lines */ } | |
| } | |
| if (assistantMsgs.length) return assistantMsgs.join("\n\n"); | |
| // No assistant messages — try extracting any content field | |
| const allContent = []; | |
| for (const line of lines) { | |
| try { | |
| const obj = JSON.parse(line.trim()); | |
| if (obj.content) allContent.push(obj.content); | |
| } catch { /* skip */ } | |
| } | |
| if (allContent.length) return allContent.join("\n\n"); | |
| // JSONL detected but no meaningful content extracted (e.g. only | |
| // system/hook lines during streaming) — return empty rather than | |
| // dumping raw JSON noise. | |
| return ""; | |
| } | |
| // Try single JSON object | |
| if (trimmed.startsWith("{") || trimmed.startsWith("[")) { | |
| try { | |
| const obj = JSON.parse(trimmed); | |
| if (obj.type === "result" && typeof obj.result === "string") return obj.result; | |
| const text = | |
| obj.result || obj.response || obj.content || obj.message || | |
| obj.text || obj.output || obj.answer || | |
| obj.choices?.[0]?.message?.content || | |
| obj.choices?.[0]?.text; | |
| if (typeof text === "string") return text; | |
| if (Array.isArray(obj)) { | |
| const msgs = obj.map((m) => m.content || m.text || "").filter(Boolean); | |
| if (msgs.length) return msgs.join("\n\n"); | |
| } | |
| } catch { /* not valid JSON, fall through */ } | |
| } | |
| return raw; | |
| } | |
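| // Example of the JSONL path above (shapes taken from the handling logic here,
| // not from any particular CLI's actual output): given the two lines
| //   {"type":"assistant","message":{"content":[{"type":"text","text":"Working..."}]}}
| //   {"type":"result","result":"Done. Fixed in src/app.js."}
| // parseAgentOutput returns "Done. Fixed in src/app.js.", because the last
| // type="result" line takes precedence over the assistant-message fallback.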
| // Extract session_id from Claude Code JSONL output so followups can use --resume <id> | |
| function extractSessionId(raw) { | |
| if (!raw || typeof raw !== "string") return null; | |
| const lines = raw.trim().split("\n"); | |
| // Scan from the end — session_id appears on every line, last is most reliable | |
| for (let i = lines.length - 1; i >= 0; i--) { | |
| try { | |
| const obj = JSON.parse(lines[i].trim()); | |
| if (obj.session_id && typeof obj.session_id === "string") return obj.session_id; | |
| } catch { /* skip */ } | |
| } | |
| return null; | |
| } | |
| // Streaming agent runner — returns a live state object + promise | |
| function spawnAgent(agent, prompt, agentDir) { | |
| const [bin, args] = buildAgentCommand(agent, prompt); | |
| const state = { stdout: "", stderr: "", done: false, ok: false }; | |
| const proc = spawn(bin, args, { cwd: agentDir, env: { ...process.env }, stdio: ["ignore", "pipe", "pipe"] }); | |
| proc.stdout.setEncoding("utf-8"); | |
| proc.stderr.setEncoding("utf-8"); | |
| proc.stdout.on("data", (chunk) => { state.stdout += chunk; }); | |
| proc.stderr.on("data", (chunk) => { state.stderr += chunk; }); | |
| const timer = setTimeout(() => { | |
| proc.kill(); | |
| state.stderr += `\n[Timeout after ${AGENT_TIMEOUT_LABEL}]`; | |
| }, AGENT_TIMEOUT); | |
| state.promise = new Promise((resolve) => { | |
| proc.on("close", (code) => { | |
| clearTimeout(timer); | |
| state.done = true; | |
| state.ok = code === 0; | |
| resolve(state); | |
| }); | |
| proc.on("error", (err) => { | |
| clearTimeout(timer); | |
| state.done = true; | |
| state.ok = false; | |
| state.stderr += err.message; | |
| resolve(state); | |
| }); | |
| }); | |
| return state; | |
| } | |
| // Blocking agent runner — used for followups (shorter, less need for streaming) | |
| async function runAgent(agent, prompt, agentDir) { | |
| const [bin, args] = buildAgentCommand(agent, prompt); | |
| try { | |
| const { stdout, stderr } = await execFileAsync(bin, args, { | |
| cwd: agentDir, | |
| timeout: AGENT_TIMEOUT, | |
| encoding: "utf-8", | |
| maxBuffer: 10 * 1024 * 1024, | |
| }); | |
| return { ok: true, stdout, stderr }; | |
| } catch (err) { | |
| const partialOut = err.stdout || ""; | |
| const partialErr = err.stderr || ""; | |
| const prefix = err.killed ? `[Timeout after ${AGENT_TIMEOUT_LABEL}]\n` : ""; | |
| return { | |
| ok: false, | |
| stdout: partialOut, | |
| stderr: prefix + (partialErr || err.message), | |
| }; | |
| } | |
| } | |
| function rebuildPrompt(rounds, followup) { | |
| const parts = []; | |
| for (const r of rounds) { | |
| parts.push(`User: ${r.prompt}`); | |
| parts.push(`Agent: ${r.stdout}`); | |
| } | |
| parts.push(`User: ${followup}`); | |
| return parts.join("\n\n"); | |
| } | |
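| // For reference: rounds = [{ prompt: "add a unit test", stdout: "Added test_foo." }]
| // and followup "now run it" produce
| //   "User: add a unit test\n\nAgent: Added test_foo.\n\nUser: now run it"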
| async function runFollowup(agent, followup, agentDir, rounds, sessionId) { | |
| let bin = agent.bin, args; | |
| switch (agent.followupStyle) { | |
| case "continue": | |
| args = ["-p", followup, ...agent.followupArgs]; | |
| break; | |
| case "resume": | |
| // Use --resume <session-id> so each agent binds to its own session, | |
| // avoiding conflicts when two instances of the same CLI run simultaneously. | |
| if (sessionId) { | |
| if (agent.promptStyle === "exec") { | |
| // Codex-style: codex exec --resume <session-id> -p <followup> ...args | |
| args = ["exec", "--resume", sessionId, "-p", followup, ...agent.followupArgs]; | |
| } else { | |
| // Claude-style: claude -p <followup> --resume <session-id> ...args | |
| args = ["-p", followup, "--resume", sessionId, ...agent.followupArgs]; | |
| } | |
| } else { | |
| // No session ID captured — fall back to Codex-style exec resume | |
| args = ["exec", ...agent.followupArgs, "resume", "--last", followup]; | |
| } | |
| break; | |
| case "replay": { | |
| const full = rebuildPrompt(rounds, followup); | |
| args = ["-p", full, ...agent.followupArgs]; | |
| break; | |
| } | |
| case "none": | |
| args = [...agent.followupArgs, followup]; | |
| break; | |
| default: | |
| throw new Error(`Unknown followupStyle "${agent.followupStyle}" for ${agent.id}`); | |
| } | |
| try { | |
| const { stdout, stderr } = await execFileAsync(bin, args, { | |
| cwd: agentDir, | |
| timeout: AGENT_TIMEOUT, | |
| encoding: "utf-8", | |
| maxBuffer: 10 * 1024 * 1024, | |
| }); | |
| return { ok: true, stdout, stderr }; | |
| } catch (err) { | |
| const partialOut = err.stdout || ""; | |
| const partialErr = err.stderr || ""; | |
| const prefix = err.killed ? `[Timeout after ${AGENT_TIMEOUT_LABEL}]\n` : ""; | |
| return { | |
| ok: false, | |
| stdout: partialOut, | |
| stderr: prefix + (partialErr || err.message), | |
| }; | |
| } | |
| } | |
| // --------------------------------------------------------------------------- | |
| // First-round retry — tries every available agent until one succeeds | |
| // --------------------------------------------------------------------------- | |
| async function tryAgentWithRetry(battle, side, fullPrompt, repoUrl) { | |
| const available = availableAgents(); | |
| // Fisher-Yates shuffle for unbiased randomisation | |
| const shuffled = [...available]; | |
| for (let i = shuffled.length - 1; i > 0; i--) { | |
| const j = Math.floor(Math.random() * (i + 1)); | |
| [shuffled[i], shuffled[j]] = [shuffled[j], shuffled[i]]; | |
| } | |
| for (let i = 0; i < shuffled.length; i++) { | |
| const agent = shuffled[i]; | |
| for (let attempt = 0; attempt < MAX_AGENT_RETRIES; attempt++) { | |
| const dir = join(tmpdir(), `swe-arena-${randomUUID()}`); | |
| mkdirSync(dir); | |
| try { | |
| if (repoUrl && repoUrl.trim()) { | |
| cloneRepo(repoUrl, dir); | |
| } else { | |
| execFileSync("git", ["init"], { cwd: dir, stdio: "pipe" }); | |
|         // Create an initial empty commit so HEAD always exists;
|         // otherwise `git diff HEAD` fails on an unborn branch.
| execFileSync( | |
| "git", | |
| ["-c", "user.name=arena", "-c", "user.email=arena@localhost", | |
| "commit", "--allow-empty", "-m", "init"], | |
| { cwd: dir, stdio: "pipe" } | |
| ); | |
| } | |
| } catch (err) { | |
| console.error(`Git setup failed for ${agent.name} on ${side} (attempt ${attempt + 1}/${MAX_AGENT_RETRIES}): ${err.message}`); | |
| rmSync(dir, { recursive: true, force: true }); | |
| break; // git setup failed — no point retrying this agent | |
| } | |
| const state = spawnAgent(agent, fullPrompt, dir); | |
| // Clean up previous attempt's directory | |
| const prevDir = battle[`${side}Dir`]; | |
| if (prevDir && prevDir !== dir) { | |
| rmSync(prevDir, { recursive: true, force: true }); | |
| } | |
| // Update battle so polling picks up live output from this attempt | |
| battle[side] = agent.name; | |
| battle[`${side}Agent`] = agent; | |
| battle[`${side}Dir`] = dir; | |
| battle[`${side}State`] = state; | |
| await state.promise; | |
| if (state.ok) { | |
| const diff = captureDiff(dir); | |
| battle[`${side}Diff`] = diff; | |
| battle[`${side}SessionId`] = extractSessionId(state.stdout); | |
| battle[`${side}Rounds`] = [{ | |
| prompt: fullPrompt, | |
| stdout: state.stdout || state.stderr || "", | |
| stderr: state.stderr || "", | |
| diff: diff || "", | |
| }]; | |
| return; | |
| } | |
| console.log(`Agent ${agent.name} failed on ${side} (attempt ${attempt + 1}/${MAX_AGENT_RETRIES}), retrying in a fresh directory...\n stderr: ${state.stderr.slice(0, 1000).replace(/\n/g, " ")}\n stdout: ${state.stdout.slice(0, 1000).replace(/\n/g, " ")}`); | |
| } | |
| console.log(`Agent ${agent.name} exhausted ${MAX_AGENT_RETRIES} retries on ${side}, trying next agent...`); | |
| } | |
| // Every available agent was tried and failed | |
| console.error(`All ${shuffled.length} available agents failed for ${side} side`); | |
| const lastDir = battle[`${side}Dir`]; | |
| const lastState = battle[`${side}State`]; | |
| battle[`${side}Diff`] = lastDir ? captureDiff(lastDir) : ""; | |
|   battle[`${side}Rounds`] = [{
|     prompt: fullPrompt,
|     // lastState can be undefined if no agent ever got spawned (e.g. git setup
|     // failed for every agent), so guard with optional chaining.
|     stdout: lastState?.stdout || lastState?.stderr || "",
|     stderr: lastState?.stderr || "",
| diff: battle[`${side}Diff`] || "", | |
| }]; | |
| } | |
| // --------------------------------------------------------------------------- | |
| // Prompt construction | |
| // --------------------------------------------------------------------------- | |
| function buildPrompt(userPrompt, repoContext = "") { | |
| const parts = [SYSTEM_PREFIX]; | |
| if (repoContext) parts.push(`Repository context:\n${repoContext}`); | |
| parts.push(userPrompt); | |
| return parts.join("\n\n"); | |
| } | |
| function stripContext(prompt) { | |
| // Find the last section which is the user query | |
| // The prompt format is: SYSTEM_PREFIX + \n\n + [repo context + \n\n] + user query | |
| // We strip SYSTEM_PREFIX and optional repo context | |
| let rest = prompt; | |
| if (rest.startsWith(SYSTEM_PREFIX)) { | |
| rest = rest.slice(SYSTEM_PREFIX.length); | |
| if (rest.startsWith("\n\n")) rest = rest.slice(2); | |
| } | |
| if (rest.startsWith("Repository context:\n")) { | |
| const idx = rest.indexOf("\n\n", "Repository context:\n".length); | |
| if (idx >= 0) rest = rest.slice(idx + 2); | |
| } | |
| return rest; | |
| } | |
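| // Round-trip example: stripContext(buildPrompt("refactor utils.py", "Repository: a/b"))
| // returns "refactor utils.py". Note that if the repository context itself contains a
| // blank line, the slice above stops at the first "\n\n" inside it, so the returned
| // string may still carry a trailing piece of context.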
| // --------------------------------------------------------------------------- | |
| // Git operations (clone, checkout, diff) | |
| // --------------------------------------------------------------------------- | |
| function cloneRepo(url, agentDir) { | |
| const { hostname, segments } = parseUrlPath(url); | |
| if (!hostname) return false; | |
| let parsedInfo = null; | |
| let cloneUrl = null; | |
| if (hostname.includes("github.com")) { | |
| parsedInfo = classifyGithubUrl(segments); | |
| if (!parsedInfo) return false; | |
| cloneUrl = `https://github.com/${parsedInfo.owner}/${parsedInfo.repo}.git`; | |
| } else if (hostname.includes("gitlab.com")) { | |
| parsedInfo = classifyGitlabUrl(segments); | |
| if (!parsedInfo) return false; | |
| cloneUrl = `https://gitlab.com/${parsedInfo.projectPath}.git`; | |
| } else if (hostname.includes("huggingface.co")) { | |
| parsedInfo = classifyHuggingfaceUrl(segments); | |
| if (!parsedInfo) return false; | |
| const prefix = parsedInfo.repoType ? `${parsedInfo.repoType}s/` : ""; | |
| cloneUrl = `https://huggingface.co/${prefix}${parsedInfo.repoId}`; | |
| } else { | |
| return false; | |
| } | |
| try { | |
| execFileSync("git", ["clone", "--depth=1", cloneUrl, "."], { | |
| cwd: agentDir, | |
| timeout: 120_000, | |
| stdio: "pipe", | |
| }); | |
| checkoutRef(parsedInfo, agentDir); | |
| return true; | |
| } catch { | |
| return false; | |
| } | |
| } | |
| function checkoutRef(parsedInfo, agentDir) { | |
| const resource = parsedInfo.resource; | |
| const run = (args) => { | |
| try { | |
| execFileSync("git", args, { cwd: agentDir, timeout: 60_000, stdio: "pipe" }); | |
| } catch {} | |
| }; | |
| try { | |
| if (resource === "pull" && parsedInfo.id) { | |
| run(["fetch", "origin", `pull/${parsedInfo.id}/head:pr`]); | |
| run(["checkout", "pr"]); | |
| } else if (resource === "merge_requests" && parsedInfo.id) { | |
| run(["fetch", "origin", `merge-requests/${parsedInfo.id}/head:mr`]); | |
| run(["checkout", "mr"]); | |
| } else if (resource === "commit" && parsedInfo.sha) { | |
| run(["fetch", "--depth=1", "origin", parsedInfo.sha]); | |
| run(["checkout", parsedInfo.sha]); | |
| } else if ( | |
| (resource === "blob" || resource === "tree") && | |
| parsedInfo.branch | |
| ) { | |
| run(["checkout", parsedInfo.branch]); | |
| } else if ( | |
| (resource === "blob" || resource === "resolve" || resource === "tree") && | |
| parsedInfo.revision | |
| ) { | |
| run(["checkout", parsedInfo.revision]); | |
| } | |
| } catch {} // best effort | |
| } | |
| function captureDiff(agentDir) { | |
| try { | |
| execFileSync("git", ["add", "-A"], { | |
| cwd: agentDir, | |
| stdio: "pipe", | |
| }); | |
| // Exclude CLI-specific config/state files so only the agent's | |
| // actual work appears in the diff. | |
| const result = execFileSync( | |
| "git", | |
| [ | |
| "diff", "HEAD", "--", | |
| ".", | |
| // Claude Code | |
| ":(exclude).claude", | |
| ":(exclude)CLAUDE.md", | |
| // Gemini CLI | |
| ":(exclude).gemini", | |
| // OpenAI Codex | |
| ":(exclude).codex", | |
| ":(exclude)codex.json", | |
| // Grok CLI | |
| ":(exclude).grok", | |
| // opencode per-instance dirs | |
| ":(exclude).xdg_data", | |
| ":(exclude).tmp", | |
| // Common IDE / tool artifacts | |
| ":(exclude).vscode", | |
| ":(exclude)settings.json", | |
| ], | |
| { | |
| cwd: agentDir, | |
| encoding: "utf-8", | |
| maxBuffer: 10 * 1024 * 1024, | |
| } | |
| ); | |
| return result.slice(0, 100_000); | |
| } catch (err) { | |
| console.error(`captureDiff failed: ${err.message}`); | |
| return ""; | |
| } | |
| } | |
| // --------------------------------------------------------------------------- | |
| // HF data I/O | |
| // --------------------------------------------------------------------------- | |
| async function saveContentToHf(data, repoName, fileName, token) { | |
| const json = JSON.stringify(data, null, 2); | |
| const content = new Blob([json]); | |
| if (!token) token = process.env.HF_TOKEN; | |
| if (!token) throw new Error("No HF token available for upload."); | |
| await uploadFile({ | |
| repo: { type: "dataset", name: repoName }, | |
| file: { content, path: `${fileName}.json` }, | |
| credentials: { accessToken: token }, | |
| }); | |
| } | |
| function isFileWithinTimeFrame(filePath, days) { | |
| try { | |
| const timestampStr = filePath.split("/").pop().replace(".json", ""); | |
| // Format: YYYYMMDD_HHMMSS | |
| const m = timestampStr.match( | |
| /(\d{4})(\d{2})(\d{2})_(\d{2})(\d{2})(\d{2})/ | |
| ); | |
| if (!m) return false; | |
| const fileDate = new Date( | |
| `${m[1]}-${m[2]}-${m[3]}T${m[4]}:${m[5]}:${m[6]}` | |
| ); | |
| const diffDays = (Date.now() - fileDate.getTime()) / (1000 * 60 * 60 * 24); | |
| return diffDays <= days; | |
| } catch { | |
| return false; | |
| } | |
| } | |
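| // e.g. "agent_arena/20250101_120000.json" is parsed as local time 2025-01-01 12:00:00
| // and kept only if that moment falls within the last `days` days.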
| async function loadContentFromHf(repoName, filePrefix) { | |
| const data = []; | |
| const token = process.env.HF_TOKEN; | |
| const credentials = token ? { accessToken: token } : undefined; | |
| const repo = { type: "dataset", name: repoName }; | |
| try { | |
| let fileCount = 0; | |
| for await (const file of listFiles({ repo, recursive: true, credentials })) { | |
| fileCount++; | |
| if (!file.path.startsWith(`${filePrefix}/`)) continue; | |
| if (!file.path.endsWith(".json")) continue; | |
| if ( | |
| !isFileWithinTimeFrame(file.path, LEADERBOARD_UPDATE_TIME_FRAME_DAYS) | |
| ) { | |
| console.log(` Skipped (outside time frame): ${file.path}`); | |
| continue; | |
| } | |
| const resp = await downloadFile({ repo, path: file.path, credentials }); | |
| if (resp) { | |
| const entry = JSON.parse(await resp.text()); | |
| entry.timestamp = file.path.split("/").pop().replace(".json", ""); | |
| data.push(entry); | |
| } | |
| } | |
| return data; | |
| } catch (err) { | |
| console.error(`Error loading data from HF: ${err.message}`); | |
| throw err; | |
| } | |
| } | |
| // --------------------------------------------------------------------------- | |
| // Leaderboard computation (custom JS — no evalica) | |
| // --------------------------------------------------------------------------- | |
| function round2(n) { | |
| return Math.round(n * 100) / 100; | |
| } | |
| const WINNER_MAP = { | |
| left: "X", | |
| right: "Y", | |
| tie: "draw", | |
| both_bad: "draw", | |
| }; | |
| function computeElo(votes) { | |
| const K = 32; | |
| const INITIAL = 1000; | |
| const scores = {}; | |
| for (const v of votes) { | |
| scores[v.left] ??= INITIAL; | |
| scores[v.right] ??= INITIAL; | |
| const w = WINNER_MAP[v.winner]; | |
| if (w === "draw") continue; // tieWeight = 0 | |
| const rA = scores[v.left]; | |
| const rB = scores[v.right]; | |
| const eA = 1 / (1 + 10 ** ((rB - rA) / 400)); | |
| const eB = 1 - eA; | |
| const sA = w === "X" ? 1 : 0; | |
| const sB = w === "Y" ? 1 : 0; | |
| scores[v.left] += K * (sA - eA); | |
| scores[v.right] += K * (sB - eB); | |
| } | |
| return scores; | |
| } | |
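| // Worked example: with both agents at the initial 1000, eA = 1/(1 + 10^0) = 0.5, so a
| // left win moves left up by K*(1 - 0.5) = 16 to 1016 and right down to 984.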
| function computeAvgWinRate(votes) { | |
| const wins = {}; | |
| const losses = {}; | |
| for (const v of votes) { | |
| wins[v.left] ??= 0; | |
| wins[v.right] ??= 0; | |
| losses[v.left] ??= 0; | |
| losses[v.right] ??= 0; | |
| const w = WINNER_MAP[v.winner]; | |
| if (w === "draw") continue; | |
| if (w === "X") { | |
| wins[v.left]++; | |
| losses[v.right]++; | |
| } else { | |
| wins[v.right]++; | |
| losses[v.left]++; | |
| } | |
| } | |
| const result = {}; | |
| for (const name of Object.keys(wins)) { | |
| const total = wins[name] + losses[name]; | |
| result[name] = total > 0 ? wins[name] / total : 0; | |
| } | |
| return result; | |
| } | |
| function computeBradleyTerry(votes, iterations = 100) { | |
| // Collect agents and win counts | |
| const agentSet = new Set(); | |
| for (const v of votes) { | |
| agentSet.add(v.left); | |
| agentSet.add(v.right); | |
| } | |
| const agentList = [...agentSet]; | |
| const n = agentList.length; | |
| const idx = Object.fromEntries(agentList.map((a, i) => [a, i])); | |
| // Win matrix | |
| const W = Array.from({ length: n }, () => new Float64Array(n)); | |
| for (const v of votes) { | |
| const w = WINNER_MAP[v.winner]; | |
| if (w === "draw") continue; | |
| const i = idx[v.left]; | |
| const j = idx[v.right]; | |
| if (w === "X") W[i][j]++; | |
| else W[j][i]++; | |
| } | |
| // Iterative MLE | |
| const p = new Float64Array(n).fill(1 / n); | |
| for (let iter = 0; iter < iterations; iter++) { | |
| const pNew = new Float64Array(n); | |
| for (let i = 0; i < n; i++) { | |
| let num = 0; | |
| let den = 0; | |
| for (let j = 0; j < n; j++) { | |
| if (i === j) continue; | |
| num += W[i][j]; | |
| const totalGames = W[i][j] + W[j][i]; | |
| if (totalGames > 0) den += totalGames / (p[i] + p[j]); | |
| } | |
| pNew[i] = den > 0 ? num / den : 0; | |
| } | |
| // Normalize | |
| const sum = pNew.reduce((a, b) => a + b, 0); | |
| if (sum > 0) for (let i = 0; i < n; i++) pNew[i] /= sum; | |
| for (let i = 0; i < n; i++) p[i] = pNew[i]; | |
| } | |
| const result = {}; | |
| for (let i = 0; i < n; i++) result[agentList[i]] = p[i]; | |
| return result; | |
| } | |
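| // Each iteration above is the standard minorization-maximization update
| //   p_i ← (Σ_j W[i][j]) / (Σ_j (W[i][j] + W[j][i]) / (p_i + p_j))
| // followed by normalization so the strengths sum to 1.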
| function computePageRank(votes, damping = 0.85, iterations = 100) { | |
| const agentSet = new Set(); | |
| for (const v of votes) { | |
| agentSet.add(v.left); | |
| agentSet.add(v.right); | |
| } | |
| const agentList = [...agentSet]; | |
| const n = agentList.length; | |
| const idx = Object.fromEntries(agentList.map((a, i) => [a, i])); | |
| // Adjacency: edge from loser to winner | |
| const outLinks = Array.from({ length: n }, () => new Float64Array(n)); | |
| const outDegree = new Float64Array(n); | |
| for (const v of votes) { | |
| const w = WINNER_MAP[v.winner]; | |
| if (w === "draw") continue; | |
| const winner = w === "X" ? idx[v.left] : idx[v.right]; | |
| const loser = w === "X" ? idx[v.right] : idx[v.left]; | |
| outLinks[loser][winner]++; | |
| outDegree[loser]++; | |
| } | |
| let pr = new Float64Array(n).fill(1 / n); | |
| for (let iter = 0; iter < iterations; iter++) { | |
| const prNew = new Float64Array(n).fill((1 - damping) / n); | |
| for (let j = 0; j < n; j++) { | |
| if (outDegree[j] === 0) { | |
| // Dangling node: distribute evenly | |
| for (let i = 0; i < n; i++) prNew[i] += damping * pr[j] / n; | |
| } else { | |
| for (let i = 0; i < n; i++) { | |
| if (outLinks[j][i] > 0) { | |
| prNew[i] += damping * pr[j] * (outLinks[j][i] / outDegree[j]); | |
| } | |
| } | |
| } | |
| } | |
| pr = prNew; | |
| } | |
| const result = {}; | |
| for (let i = 0; i < n; i++) result[agentList[i]] = pr[i]; | |
| return result; | |
| } | |
| function computeEigen(votes, iterations = 100) { | |
| const agentSet = new Set(); | |
| for (const v of votes) { | |
| agentSet.add(v.left); | |
| agentSet.add(v.right); | |
| } | |
| const agentList = [...agentSet]; | |
| const n = agentList.length; | |
| const idx = Object.fromEntries(agentList.map((a, i) => [a, i])); | |
| // Adjacency matrix: wins | |
| const A = Array.from({ length: n }, () => new Float64Array(n)); | |
| for (const v of votes) { | |
| const w = WINNER_MAP[v.winner]; | |
| if (w === "draw") continue; | |
| const i = idx[v.left]; | |
| const j = idx[v.right]; | |
| if (w === "X") A[i][j]++; | |
| else A[j][i]++; | |
| } | |
| // Power iteration for dominant eigenvector | |
| let vec = new Float64Array(n).fill(1 / Math.sqrt(n)); | |
| for (let iter = 0; iter < iterations; iter++) { | |
| const newVec = new Float64Array(n); | |
| for (let i = 0; i < n; i++) { | |
| for (let j = 0; j < n; j++) { | |
| newVec[i] += A[i][j] * vec[j]; | |
| } | |
| } | |
| // Normalize | |
| const norm = Math.sqrt(newVec.reduce((s, v) => s + v * v, 0)); | |
| if (norm > 0) for (let i = 0; i < n; i++) newVec[i] /= norm; | |
| vec = newVec; | |
| } | |
| const result = {}; | |
| for (let i = 0; i < n; i++) result[agentList[i]] = vec[i]; | |
| return result; | |
| } | |
| function computeNewman(votes) { | |
| // Simplified Newman modularity on win-graph | |
| const agentSet = new Set(); | |
| for (const v of votes) { | |
| agentSet.add(v.left); | |
| agentSet.add(v.right); | |
| } | |
| const agentList = [...agentSet]; | |
| const n = agentList.length; | |
| const idx = Object.fromEntries(agentList.map((a, i) => [a, i])); | |
| const A = Array.from({ length: n }, () => new Float64Array(n)); | |
| let totalEdges = 0; | |
| const degree = new Float64Array(n); | |
| for (const v of votes) { | |
| const w = WINNER_MAP[v.winner]; | |
| if (w === "draw") continue; | |
| const i = idx[v.left]; | |
| const j = idx[v.right]; | |
|     // The win-graph is treated as undirected, so the same symmetric pair is
|     // incremented regardless of which side won.
|     A[i][j]++;
|     A[j][i]++;
| degree[i]++; | |
| degree[j]++; | |
| totalEdges++; | |
| } | |
| if (totalEdges === 0) { | |
| const result = {}; | |
| for (const a of agentList) result[a] = 0; | |
| return result; | |
| } | |
| // Each node in its own community -> modularity contribution | |
| const result = {}; | |
| for (let i = 0; i < n; i++) { | |
| const qi = | |
| (A[i][i] || 0) / (2 * totalEdges) - | |
| (degree[i] / (2 * totalEdges)) ** 2; | |
| result[agentList[i]] = qi; | |
| } | |
| return result; | |
| } | |
| function computeCeiMcs(votes, conversations) { | |
| const convMap = new Map(); | |
| for (const c of conversations) { | |
| convMap.set(`${c.timestamp}|${c.left}|${c.right}`, c); | |
| } | |
| const stats = {}; | |
| for (const vote of votes) { | |
| const conv = convMap.get( | |
| `${vote.timestamp}|${vote.left}|${vote.right}` | |
| ); | |
| for (const m of [vote.left, vote.right]) { | |
| stats[m] ??= { ceiSum: 0, ceiMax: 0, selfMatches: 0, selfDraws: 0 }; | |
| } | |
| if (vote.left === vote.right) { | |
| stats[vote.left].selfMatches++; | |
| if (vote.winner === "tie" || vote.winner === "both_bad") { | |
| stats[vote.left].selfDraws++; | |
| } | |
| continue; | |
| } | |
| let leftScore, rightScore; | |
| switch (vote.winner) { | |
| case "left": | |
| leftScore = 1; | |
| rightScore = -1; | |
| break; | |
| case "right": | |
| leftScore = -1; | |
| rightScore = 1; | |
| break; | |
| case "tie": | |
| leftScore = 0.3; | |
| rightScore = 0.3; | |
| break; | |
| case "both_bad": | |
| leftScore = -0.3; | |
| rightScore = -0.3; | |
| break; | |
| default: | |
| continue; | |
| } | |
| // CEI: use conversation rounds if available, default to 1 | |
| const leftRounds = conv?.left_rounds?.length || 1; | |
| const rightRounds = conv?.right_rounds?.length || 1; | |
| stats[vote.left].ceiMax += 1 / leftRounds; | |
| stats[vote.right].ceiMax += 1 / rightRounds; | |
| stats[vote.left].ceiSum += leftScore / leftRounds; | |
| stats[vote.right].ceiSum += rightScore / rightRounds; | |
| } | |
| const cei = {}; | |
| const mcs = {}; | |
| for (const [agent, s] of Object.entries(stats)) { | |
| cei[agent] = s.ceiMax > 0 ? round2(s.ceiSum / s.ceiMax) : null; | |
| mcs[agent] = s.selfMatches > 0 ? round2(s.selfDraws / s.selfMatches) : null; | |
| } | |
| return { cei, mcs }; | |
| } | |
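| // Leaderboard assembly. Strategy: serve the in-memory cache when possible, otherwise try the | |
| // precomputed JSON in LEADERBOARD_REPO, and only as a last resort recompute every metric from the | |
| // raw vote/conversation datasets. A fresh vote always forces a recompute so it is reflected immediately. | |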
| async function getLeaderboardData({ voteEntry = null, convEntry = null, useCache = true } = {}) { | |
| // Return in-memory cache if available and no new vote to incorporate | |
| if (useCache && leaderboardCache && !voteEntry) return leaderboardCache; | |
| const token = process.env.HF_TOKEN; | |
| const credentials = token ? { accessToken: token } : undefined; | |
| if (useCache && !leaderboardCache) { | |
| try { | |
| const resp = await downloadFile({ | |
| repo: { type: "dataset", name: LEADERBOARD_REPO }, | |
| path: `${LEADERBOARD_FILE}.json`, | |
| credentials, | |
| }); | |
| if (resp) { | |
| const parsed = JSON.parse(await resp.text()); | |
| if (Array.isArray(parsed) && parsed.length > 0) { | |
| leaderboardCache = parsed; | |
| return leaderboardCache; | |
| } | |
| console.log("Leaderboard cache is empty, falling back to vote_data..."); | |
| } | |
| } catch { | |
| console.log("No cached leaderboard found, computing from votes..."); | |
| } | |
| } | |
| let votes = []; | |
| try { | |
| votes = await loadContentFromHf(VOTE_REPO, LEADERBOARD_FILE); | |
| console.log(`Loaded ${votes.length} vote(s) from ${VOTE_REPO}`); | |
| } catch (err) { | |
| console.error(`Failed to load votes: ${err.message}`); | |
| } | |
| if (voteEntry) votes.push(voteEntry); | |
| if (votes.length === 0) return []; | |
| let conversations = []; | |
| try { | |
| conversations = await loadContentFromHf(CONVERSATION_REPO, LEADERBOARD_FILE); | |
| console.log(`Loaded ${conversations.length} conversation(s) from ${CONVERSATION_REPO}`); | |
| } catch (err) { | |
| console.error(`Failed to load conversations (non-fatal): ${err.message}`); | |
| } | |
| if (convEntry) conversations.push(convEntry); | |
| const eloScores = computeElo(votes); | |
| const winRates = computeAvgWinRate(votes); | |
| const btScores = computeBradleyTerry(votes); | |
| const pagerankScr = computePageRank(votes); | |
| const eigenScores = computeEigen(votes); | |
| const newmanScores = computeNewman(votes); | |
| const { cei, mcs } = computeCeiMcs(votes, conversations); | |
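| // Assemble one leaderboard row per agent; the final Rank field (assigned below) is ordered by Elo score. | |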
| const agentNames = Object.keys(eloScores); | |
| const rows = agentNames.map((name) => ({ | |
| Agent: name, | |
| Website: (agentByName[name] || agentById[name])?.website || "", | |
| Provider: (agentByName[name] || agentById[name])?.provider || "", | |
| "Elo Score": round2(eloScores[name] ?? 0), | |
| "Win Rate": round2(winRates[name] ?? 0), | |
| "Conversation Efficiency Index": cei[name] ?? null, | |
| "Conversation Consistency Index": mcs[name] ?? null, | |
| "Bradley-Terry Coefficient": round2(btScores[name] ?? 0), | |
| "Eigenvector Centrality Value": round2(eigenScores[name] ?? 0), | |
| "Newman Modularity Score": round2(newmanScores[name] ?? 0), | |
| "PageRank Score": round2(pagerankScr[name] ?? 0), | |
| })); | |
| rows.sort((a, b) => b["Elo Score"] - a["Elo Score"]); | |
| rows.forEach((row, i) => { | |
| row.Rank = i + 1; | |
| }); | |
| leaderboardCache = rows; | |
| if (voteEntry && token) { | |
| saveContentToHf(rows, LEADERBOARD_REPO, LEADERBOARD_FILE, token).catch( | |
| (err) => console.error(`Failed to save leaderboard cache: ${err.message}`) | |
| ); | |
| } | |
| return rows; | |
| } | |
| // --------------------------------------------------------------------------- | |
| // Guardrail | |
| // --------------------------------------------------------------------------- | |
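| // Lightweight topical filter: ask a classifier model whether the prompt is about software | |
| // engineering. On any API error we fail open and let the prompt through rather than blocking users. | |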
| async function guardrailCheckSeRelevance(userInput) { | |
| try { | |
| const response = await openaiClient.chat.completions.create({ | |
| model: "openai/gpt-oss-safeguard-20b", | |
| messages: [ | |
| { | |
| role: "system", | |
| content: | |
| "You are a classifier that decides if a user's question is relevant to software engineering. " + | |
| "If the question is about software engineering concepts, tools, processes, or code, respond with 'Yes'. " + | |
| "Otherwise, respond with 'No'.", | |
| }, | |
| { role: "user", content: userInput }, | |
| ], | |
| }); | |
| const classification = response.choices[0].message.content | |
| .trim() | |
| .toLowerCase(); | |
| return classification.startsWith("yes"); | |
| } catch (err) { | |
| console.error(`Guardrail check failed: ${err.message}`); | |
| return true; // fail open | |
| } | |
| } | |
| // --------------------------------------------------------------------------- | |
| // Express app | |
| // --------------------------------------------------------------------------- | |
| const app = express(); | |
| app.set("trust proxy", true); | |
| app.use(express.json({ limit: "10mb" })); | |
| app.use(express.static("public")); | |
| app.use( | |
| cookieSession({ | |
| name: "session", | |
| keys: [process.env.SESSION_SECRET || randomUUID()], | |
| maxAge: 24 * 60 * 60 * 1000, | |
| }) | |
| ); | |
| // In-memory battle state: battleId -> battle object | |
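| // Battles live only in this process's memory; they are deleted after a vote and lost on restart. | |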
| const battles = new Map(); | |
| // --------------------------------------------------------------------------- | |
| // Auth routes (HF OAuth) | |
| // --------------------------------------------------------------------------- | |
| function getRedirectUri(req) { | |
| // On HF Spaces the SPACE_HOST env var gives the canonical public hostname. | |
| // Using it avoids http/https mismatches caused by reverse-proxy headers. | |
| if (process.env.SPACE_HOST) { | |
| return `https://${process.env.SPACE_HOST}/auth/callback`; | |
| } | |
| return `${req.protocol}://${req.get("host")}/auth/callback`; | |
| } | |
| app.get("/auth/login", (req, res) => { | |
| const clientId = process.env.OAUTH_CLIENT_ID; | |
| if (!clientId) return res.status(500).json({ error: "OAuth not configured" }); | |
| const redirectUri = getRedirectUri(req); | |
| // Store the OAuth state in the session so the callback can verify it (CSRF protection). | |
| const state = randomUUID(); | |
| req.session.oauthState = state; | |
| const params = new URLSearchParams({ | |
| client_id: clientId, | |
| redirect_uri: redirectUri, | |
| response_type: "code", | |
| scope: process.env.OAUTH_SCOPES || "openid profile", | |
| state, | |
| }); | |
| res.redirect(`https://huggingface.co/oauth/authorize?${params}`); | |
| }); | |
| app.get("/auth/callback", async (req, res) => { | |
| const { code, state } = req.query; | |
| if (!code) { | |
| console.error("OAuth callback: no code parameter received"); | |
| return res.redirect("/"); | |
| } | |
| if (!state || state !== req.session.oauthState) { | |
| console.error("OAuth callback: state mismatch, rejecting request"); | |
| return res.redirect("/"); | |
| } | |
| req.session.oauthState = null; | |
| try { | |
| const redirectUri = getRedirectUri(req); | |
| const tokenResp = await fetch("https://huggingface.co/oauth/token", { | |
| method: "POST", | |
| headers: { "Content-Type": "application/x-www-form-urlencoded" }, | |
| body: new URLSearchParams({ | |
| grant_type: "authorization_code", | |
| code, | |
| redirect_uri: redirectUri, | |
| client_id: process.env.OAUTH_CLIENT_ID, | |
| client_secret: process.env.OAUTH_CLIENT_SECRET, | |
| }), | |
| }); | |
| const data = await tokenResp.json(); | |
| if (!tokenResp.ok || !data.access_token) { | |
| console.error(`OAuth token exchange failed (${tokenResp.status}):`, data); | |
| return res.redirect("/"); | |
| } | |
| req.session.hfToken = data.access_token; | |
| res.redirect("/"); | |
| } catch (err) { | |
| console.error(`OAuth callback error: ${err.message}`); | |
| res.redirect("/"); | |
| } | |
| }); | |
| app.get("/auth/status", (req, res) => { | |
| const token = process.env.HF_TOKEN; | |
| res.json({ | |
| authenticated: !!token, | |
| hint: SHOW_HINT_STRING ? HINT_STRING : "", | |
| }); | |
| }); | |
| // --------------------------------------------------------------------------- | |
| // API routes | |
| // --------------------------------------------------------------------------- | |
| app.get("/api/config", (_req, res) => { | |
| res.json({ | |
| agentTimeoutMin: AGENT_TIMEOUT / 60_000, | |
| agentCount: availableAgents().length, | |
| oauthClientId: process.env.OAUTH_CLIENT_ID || "", | |
| }); | |
| }); | |
| app.get("/api/leaderboard", async (req, res) => { | |
| try { | |
| const data = await getLeaderboardData({ useCache: true }); | |
| res.json(data); | |
| } catch (err) { | |
| console.error(`Leaderboard error: ${err.message}\n${err.stack}`); | |
| res.status(500).json({ error: err.message }); | |
| } | |
| }); | |
| app.post("/api/battle/start", async (req, res) => { | |
| const { prompt, repoUrl } = req.body; | |
| if (!prompt || !prompt.trim()) { | |
| return res.status(400).json({ error: "Prompt is required." }); | |
| } | |
| // Guardrail (skip if URL provided) | |
| if (!repoUrl) { | |
| const isRelevant = await guardrailCheckSeRelevance(prompt); | |
| if (!isRelevant) { | |
| return res.status(400).json({ | |
| error: | |
| "Oops! Try asking something about software engineering. Thanks!", | |
| }); | |
| } | |
| } | |
| const available = availableAgents(); | |
| if (available.length < 1) { | |
| return res | |
| .status(500) | |
| .json({ error: "Not enough agents available for a battle." }); | |
| } | |
| try { | |
| // Fetch context & build prompt | |
| const repoContext = await fetchUrlContent(repoUrl || ""); | |
| const fullPrompt = buildPrompt(prompt, repoContext); | |
| const battleId = randomUUID(); | |
| battles.set(battleId, { | |
| id: battleId, | |
| left: "", | |
| right: "", | |
| leftAgent: null, | |
| rightAgent: null, | |
| url: repoUrl || "", | |
| leftDir: null, | |
| rightDir: null, | |
| fullPrompt, | |
| leftState: { stdout: "", stderr: "", done: false, ok: false }, | |
| rightState: { stdout: "", stderr: "", done: false, ok: false }, | |
| leftDiff: null, | |
| rightDiff: null, | |
| leftSessionId: null, | |
| rightSessionId: null, | |
| leftRounds: [], | |
| rightRounds: [], | |
| }); | |
| const battle = battles.get(battleId); | |
| // Both sides pick a random agent from the shuffled pool independently. | |
| // If an agent fails, tryAgentWithRetry re-selects another agent automatically. | |
| tryAgentWithRetry(battle, "left", fullPrompt, repoUrl).catch((err) => { | |
| console.error(`Left agent retry error: ${err.message}`); | |
| }); | |
| tryAgentWithRetry(battle, "right", fullPrompt, repoUrl).catch((err) => { | |
| console.error(`Right agent retry error: ${err.message}`); | |
| }); | |
| // Return immediately — frontend polls /api/battle/status | |
| res.json({ battleId }); | |
| } catch (err) { | |
| console.error(`Battle start error: ${err.message}`); | |
| res.status(500).json({ error: err.message }); | |
| } | |
| }); | |
| // Post-process agent output: strip identity headers, trailing metadata, etc. | |
| function postProcessOutput(output, agent) { | |
| if (!agent) return output; | |
| let result = output; | |
| if (agent.outputStartMarker) { | |
| const idx = result.indexOf(agent.outputStartMarker); | |
| if (idx !== -1) result = result.slice(idx + agent.outputStartMarker.length); | |
| } | |
| if (agent.outputEndMarker) { | |
| const idx = result.indexOf(agent.outputEndMarker); | |
| if (idx !== -1) result = result.slice(0, idx); | |
| } | |
| return result.trim(); | |
| } | |
| // Poll for live agent output | |
| app.get("/api/battle/status/:id", (req, res) => { | |
| const battle = battles.get(req.params.id); | |
| if (!battle) { | |
| return res.status(404).json({ error: "Battle not found (session expired)." }); | |
| } | |
| const { leftState, rightState } = battle; | |
| const formatOutput = (state, agent) => { | |
| let out = parseAgentOutput(state.stdout); | |
| if (state.done && !state.ok) { | |
| const prefix = out ? out + "\n\n" : ""; | |
| out = `${prefix}**Agent error:** ${state.stderr}`; | |
| } else if (state.done && state.stderr) { | |
| // Agent exited 0 but stderr has warnings/errors — append them | |
| out = `${out}\n\n**Agent warnings:** ${state.stderr}`; | |
| } | |
| // Apply post-processing (strip identity headers, trailing metadata) | |
| return postProcessOutput(out, agent); | |
| }; | |
| // Capture a live diff while the agent is still running so the UI can show | |
| // incremental file changes without waiting for the agent to finish. | |
| const leftDiff = leftState.done | |
| ? battle.leftDiff | |
| : (battle.leftDir ? captureDiff(battle.leftDir) : null); | |
| const rightDiff = rightState.done | |
| ? battle.rightDiff | |
| : (battle.rightDir ? captureDiff(battle.rightDir) : null); | |
| res.json({ | |
| leftStatus: leftState.done ? "done" : "running", | |
| rightStatus: rightState.done ? "done" : "running", | |
| leftOutput: formatOutput(leftState, battle.leftAgent), | |
| rightOutput: formatOutput(rightState, battle.rightAgent), | |
| leftDiff, | |
| rightDiff, | |
| }); | |
| }); | |
| app.post("/api/battle/followup", async (req, res) => { | |
| const { battleId, side, prompt } = req.body; | |
| const battle = battles.get(battleId); | |
| if (!battle) | |
| return res.status(404).json({ error: "Battle not found (session expired)." }); | |
| if (!prompt || !prompt.trim()) | |
| return res.status(400).json({ error: "Prompt is required." }); | |
| if (side !== "left" && side !== "right") | |
| return res.status(400).json({ error: 'Side must be "left" or "right".' }); | |
| const state = side === "left" ? battle.leftState : battle.rightState; | |
| if (!state.done) | |
| return res.status(400).json({ error: "Agent is still running. Please wait for it to finish." }); | |
| const agent = side === "left" ? battle.leftAgent : battle.rightAgent; | |
| const agentDir = side === "left" ? battle.leftDir : battle.rightDir; | |
| const rounds = side === "left" ? battle.leftRounds : battle.rightRounds; | |
| const sessionId = side === "left" ? battle.leftSessionId : battle.rightSessionId; | |
| try { | |
| const result = await runFollowup(agent, prompt, agentDir, rounds, sessionId); | |
| const diff = captureDiff(agentDir); | |
| rounds.push({ | |
| prompt, | |
| stdout: result.stdout || result.stderr || "", | |
| stderr: result.stderr || "", | |
| diff, | |
| }); | |
| res.json({ | |
| output: result.ok | |
| ? parseAgentOutput(result.stdout) | |
| : `**Agent error:** ${result.stderr}`, | |
| diff, | |
| ok: result.ok, | |
| }); | |
| } catch (err) { | |
| console.error(`Followup error: ${err.message}`); | |
| res.status(500).json({ error: err.message }); | |
| } | |
| }); | |
| app.post("/api/battle/vote", async (req, res) => { | |
| const { battleId, winner } = req.body; | |
| const battle = battles.get(battleId); | |
| if (!battle) | |
| return res.status(404).json({ error: "Battle not found (session expired)." }); | |
| const validWinners = ["left", "right", "tie", "both_bad"]; | |
| if (!validWinners.includes(winner)) | |
| return res.status(400).json({ error: "Invalid winner value." }); | |
| const token = process.env.HF_TOKEN; | |
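| // The timestamp doubles as the per-vote file name, e.g. "2025-01-15T10:30:00.123Z" -> "20250115_103000". | |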
| const timestamp = new Date() | |
| .toISOString() | |
| .replace(/[-:T]/g, (c) => (c === "T" ? "_" : "")) | |
| .replace(/\.\d+Z$/, ""); | |
| const fileName = `${LEADERBOARD_FILE}/${timestamp}`; | |
| const voteEntry = { | |
| left: battle.left, | |
| right: battle.right, | |
| winner, | |
| timestamp, | |
| }; | |
| // Strip context from first round prompts before saving | |
| const leftRoundsClean = battle.leftRounds.map((r, i) => ({ | |
| ...r, | |
| prompt: i === 0 ? stripContext(r.prompt) : r.prompt, | |
| })); | |
| const rightRoundsClean = battle.rightRounds.map((r, i) => ({ | |
| ...r, | |
| prompt: i === 0 ? stripContext(r.prompt) : r.prompt, | |
| })); | |
| const convData = { | |
| left: battle.left, | |
| right: battle.right, | |
| url: battle.url, | |
| left_rounds: leftRoundsClean, | |
| right_rounds: rightRoundsClean, | |
| winner, | |
| timestamp, | |
| }; | |
| // Persist vote and conversation records to HF before recomputing the leaderboard. | |
| try { | |
| await Promise.all([ | |
| saveContentToHf(voteEntry, VOTE_REPO, fileName, token), | |
| saveContentToHf(convData, CONVERSATION_REPO, fileName, token), | |
| ]); | |
| } catch (err) { | |
| console.error(`HF upload error: ${err.message}`); | |
| } | |
| // Clean up (dirs may be null if a side hadn't started yet when user voted early) | |
| if (battle.leftDir) rmSync(battle.leftDir, { recursive: true, force: true }); | |
| if (battle.rightDir) rmSync(battle.rightDir, { recursive: true, force: true }); | |
| battles.delete(battleId); | |
| // Recompute leaderboard | |
| try { | |
| const leaderboard = await getLeaderboardData({ | |
| voteEntry, | |
| convEntry: convData, | |
| useCache: false, | |
| }); | |
| res.json({ leaderboard, agentA: battle.left, agentB: battle.right }); | |
| } catch (err) { | |
| console.error(`Leaderboard recompute error: ${err.message}`); | |
| res.json({ leaderboard: [], agentA: battle.left, agentB: battle.right }); | |
| } | |
| }); | |
| // --------------------------------------------------------------------------- | |
| // Agent submission | |
| // --------------------------------------------------------------------------- | |
| const VALID_PROMPT_STYLES = ["flag", "exec", "none"]; | |
| const VALID_FOLLOWUP_STYLES = ["continue", "resume", "replay", "none"]; | |
| /** | |
| * Parse a CLI-args value supplied by the user. | |
| * Accepts three forms: | |
| * - A JSON array string: '["--flag", "value"]' | |
| * - A space-separated string: '--flag value' | |
| * - An actual JS array (when the client sends JSON body with array field) | |
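| * e.g. parseArgString('--output-format json') -> ["--output-format", "json"] | |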
| */ | |
| function parseArgString(val) { | |
| if (Array.isArray(val)) return val.map(String).filter(Boolean); | |
| if (!val || typeof val !== "string" || !val.trim()) return []; | |
| const s = val.trim(); | |
| if (s.startsWith("[")) { | |
| try { return JSON.parse(s).map(String).filter(Boolean); } catch { /* fall through */ } | |
| } | |
| // Simple whitespace split — covers the most common form e.g. "--output-format json" | |
| return s.split(/\s+/).filter(Boolean); | |
| } | |
| app.post("/api/agent/submit", async (req, res) => { | |
| const { displayName, organization, website, bin, promptStyle, initArgs, followupStyle, followupArgs, outputStartMarker, outputEndMarker } = req.body; | |
| // ---- required field validation ---- | |
| if (!displayName || !String(displayName).trim()) | |
| return res.status(400).json({ error: "Agent display name is required." }); | |
| if (!organization || !String(organization).trim()) | |
| return res.status(400).json({ error: "Organization / provider name is required." }); | |
| if (!website || !String(website).trim()) | |
| return res.status(400).json({ error: "Website / OSS repository URL is required." }); | |
| if (!bin || !String(bin).trim()) | |
| return res.status(400).json({ error: "CLI binary name (bin) is required." }); | |
| if (!VALID_PROMPT_STYLES.includes(promptStyle)) | |
| return res.status(400).json({ error: `promptStyle must be one of: ${VALID_PROMPT_STYLES.join(", ")}.` }); | |
| if (!VALID_FOLLOWUP_STYLES.includes(followupStyle)) | |
| return res.status(400).json({ error: `followupStyle must be one of: ${VALID_FOLLOWUP_STYLES.join(", ")}.` }); | |
| const name = String(displayName).trim(); | |
| const org = String(organization).trim(); | |
| const binStr = String(bin).trim(); | |
| // Prevent path traversal via slashes in the file stem components | |
| if (/[/\\]/.test(name) || /[/\\]/.test(org)) | |
| return res.status(400).json({ error: "Display name and organization must not contain slashes." }); | |
| const fileName = `${org}: ${name}`; // e.g. "Anthropic: Claude Code" | |
| let parsedInitArgs, parsedFollowupArgs; | |
| try { | |
| parsedInitArgs = parseArgString(initArgs); | |
| parsedFollowupArgs = parseArgString(followupArgs); | |
| } catch (e) { | |
| return res.status(400).json({ error: `Invalid args format: ${e.message}` }); | |
| } | |
| const token = process.env.HF_TOKEN; | |
| if (!token) | |
| return res.status(500).json({ error: "Server is not configured with HF_TOKEN for uploads." }); | |
| // ---- duplicate check ---- | |
| try { | |
| const repo = { type: "dataset", name: CLI_DATA_REPO }; | |
| const credentials = { accessToken: token }; | |
| const existing = new Set(); | |
| for await (const file of listFiles({ repo, credentials })) { | |
| if (file.path.endsWith(".json") && !file.path.includes("/")) | |
| existing.add(file.path.replace(/\.json$/, "")); | |
| } | |
| if (existing.has(fileName)) | |
| return res.status(409).json({ error: `An agent named "${fileName}" already exists in the dataset.` }); | |
| } catch (err) { | |
| return res.status(500).json({ error: `Could not check for duplicates: ${err.message}` }); | |
| } | |
| // ---- build the record matching the cli_data schema ---- | |
| const websiteStr = typeof website === "string" ? website.trim() : ""; | |
| const record = { | |
| ...(websiteStr ? { website: websiteStr } : {}), | |
| provider: org, | |
| bin: binStr, | |
| promptStyle, | |
| initArgs: parsedInitArgs, | |
| followupStyle, | |
| followupArgs: parsedFollowupArgs, | |
| outputStartMarker: typeof outputStartMarker === "string" ? outputStartMarker : "", | |
| outputEndMarker: typeof outputEndMarker === "string" ? outputEndMarker : "", | |
| state: "active", | |
| }; | |
| // ---- upload to HF ---- | |
| try { | |
| const json = JSON.stringify(record, null, 4); | |
| const content = new Blob([json]); | |
| await uploadFile({ | |
| repo: { type: "dataset", name: CLI_DATA_REPO }, | |
| file: { content, path: `${fileName}.json` }, | |
| credentials: { accessToken: token }, | |
| }); | |
| } catch (err) { | |
| return res.status(500).json({ error: `Upload failed: ${err.message}` }); | |
| } | |
| res.json({ | |
| message: `Agent "${fileName}" successfully submitted! It will appear in the Arena after maintainers review and activate it.`, | |
| }); | |
| }); | |
| // --------------------------------------------------------------------------- | |
| // Start server | |
| // --------------------------------------------------------------------------- | |
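| // Last-resort handlers: log unexpected errors and keep the process alive instead of crashing the Space. | |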
| process.on("uncaughtException", (err) => { | |
| console.error("Uncaught exception:", err); | |
| }); | |
| process.on("unhandledRejection", (reason) => { | |
| console.error("Unhandled rejection:", reason); | |
| }); | |
| const PORT = process.env.PORT || 7860; | |
| (async () => { | |
| // Load agent CLI metadata from HF before accepting requests | |
| try { | |
| await loadAgentsFromHf(); | |
| } catch (err) { | |
| console.error(`Failed to load agents from HF: ${err.message}`); | |
| process.exit(1); | |
| } | |
| const available = availableAgents(); | |
| console.log( | |
| `Available agents: ${available.map((a) => a.name).join(", ") || "(none)"}` | |
| ); | |
| // Preload leaderboard | |
| try { | |
| const data = await getLeaderboardData({ useCache: true }); | |
| console.log(`Leaderboard preloaded: ${data.length} entries.`); | |
| } catch (err) { | |
| console.error(`Failed to preload leaderboard: ${err.message}`); | |
| } | |
| const server = app.listen(PORT, () => { | |
| console.log(`SWE-Agent-Arena running on http://localhost:${PORT}`); | |
| }); | |
| server.on("error", (err) => { | |
| console.error("Server error:", err); | |
| }); | |
| })(); | |