burtenshaw committed on
Commit ·
bca003e
1
Parent(s): 368bcac
feat: embed agent traces in article
Browse files
app/src/components/TraceEmbed.astro
ADDED
|
@@ -0,0 +1,494 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
/** Props accepted by the TraceEmbed component. */
interface Props {
  /** Trace file to embed; matched by path suffix against the bundled `.jsonl` traces. */
  src: string;
  /** Optional HTML description rendered in a caption below the iframe (inserted via `set:html`). */
  desc?: string;
  /** Iframe height in pixels; the component falls back to 640 when omitted. */
  height?: number;
  /** Link target for "Open raw trace"; when omitted a URL is derived from the file name. */
  sourceUrl?: string;
  /** Caption shown above the iframe and reused as the iframe's accessible title. */
  title?: string;
  /** Adds the `html-embed--wide` class to the figure; treated as `true` when omitted. */
  wide?: boolean;
}
|
| 10 |
+
|
| 11 |
+
/** One collapsible entry in the rendered trace. */
type TraceCard = {
  /** CSS class name(s) appended to `trace-event`, e.g. "user" or "tool-result success". */
  kind: string;
  /** Short text shown in the pill at the start of the summary row. */
  label: string;
  /** Full body shown inside the expanded `<pre>`; a compacted copy is used as the preview. */
  text: string;
  /** Secondary detail for the summary row (a timestamp or a tool call id). */
  meta?: string;
  /** Status copied from a tool result record, when it carries one. */
  status?: string;
};
|
| 18 |
+
|
| 19 |
+
// Component inputs. `height` falls back to 640 and `wide` to true when the caller omits them.
const { src, title, desc, sourceUrl, height = 640, wide = true } = Astro.props as Props;

// Eagerly import every `.jsonl` file under content/assets/traces as a raw string,
// keyed by its module path, so a trace can be looked up by file name.
const traces = (import.meta as any).glob("../content/assets/traces/**/*.jsonl", {
  eager: true,
  import: "default",
  query: "?raw",
}) as Record<string, string>;
|
| 33 |
+
|
| 34 |
+
/**
 * Find the raw text of a bundled trace by path suffix.
 *
 * Leading slashes are stripped from `requested`; a key matches when it ends with
 * `/<requested>` or with `/<requested minus a leading "traces/">`.
 *
 * @param requested File name or relative path of the trace.
 * @returns The raw file contents of the first matching entry, or `null` when none matches.
 */
function resolveTrace(requested: string): string | null {
  const needle = requested.replace(/^\/*/, "");
  const suffixes = ["/" + needle, "/" + needle.replace(/^traces\//, "")];
  const match = Object.entries(traces).find(([key]) =>
    suffixes.some((suffix) => key.endsWith(suffix)),
  );
  return match ? match[1] : null;
}
|
| 43 |
+
|
| 44 |
+
/**
 * Escape a value for safe interpolation into HTML text or a quoted attribute.
 *
 * `null` and `undefined` become the empty string; every other value is stringified
 * first. The ampersand is replaced before the other characters so the entities
 * produced by later replacements are not escaped a second time.
 *
 * @param value Any value to render.
 * @returns The stringified value with `&`, `<`, `>`, `"` and `'` replaced by entities.
 */
function escapeHtml(value: unknown): string {
  return String(value ?? "")
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}
|
| 52 |
+
|
| 53 |
+
/**
 * Collapse whitespace and shorten a string for single-line previews.
 *
 * Runs of whitespace become one space and the ends are trimmed. Text longer than
 * `max` is cut and suffixed with "..." so that the result is never longer than `max`.
 *
 * @param value Text to shorten.
 * @param max Maximum length of the returned string (default 180).
 * @returns The normalized text, truncated when necessary.
 */
function compact(value: string, max = 180): string {
  const text = value.replace(/\s+/g, " ").trim();
  if (text.length <= max) return text;

  const ellipsis = "...";
  // Too small a budget to fit any text plus the ellipsis: fall back to a hard cut.
  if (max <= ellipsis.length) return text.slice(0, Math.max(0, max));
  return text.slice(0, max - ellipsis.length) + ellipsis;
}
|
| 57 |
+
|
| 58 |
+
/**
 * Flatten a message `content` value into plain text.
 *
 * A string is returned unchanged. For an array, each object part contributes the
 * first non-nullish value among `text`, `input_text` and `output_text`; empty
 * contributions and non-object parts are dropped, and the rest are joined with a
 * blank line. Any other input yields the empty string.
 *
 * @param content A string, an array of content parts, or anything else.
 * @returns The extracted text.
 */
function contentText(content: any): string {
  if (typeof content === "string") return content;
  if (!Array.isArray(content)) return "";

  const pieces: any[] = [];
  for (const part of content) {
    if (!part || typeof part !== "object") continue;
    const piece = part.text ?? part.input_text ?? part.output_text ?? "";
    if (piece) pieces.push(piece);
  }
  return pieces.join("\n\n");
}
|
| 69 |
+
|
| 70 |
+
/**
 * Normalize tool-call arguments into an object.
 *
 * - Falsy input yields `{}`.
 * - An object is returned as-is.
 * - Anything else is stringified and parsed as JSON. Only a parsed object (or array)
 *   is returned; a JSON primitive such as `null` or `42`, or text that is not valid
 *   JSON, is wrapped as `{ raw: <original text> }` so callers can always read
 *   properties from the result.
 *
 * @param args Raw `arguments` value from a tool-call record.
 * @returns An object that is safe to index.
 */
function parseArgs(args: unknown): Record<string, any> {
  if (!args) return {};
  if (typeof args === "object") return args as Record<string, any>;

  const raw = String(args);
  try {
    const parsed: unknown = JSON.parse(raw);
    if (parsed !== null && typeof parsed === "object") {
      return parsed as Record<string, any>;
    }
  } catch {
    // Not JSON: fall through and keep the original text.
  }
  return { raw };
}
|
| 79 |
+
|
| 80 |
+
/**
 * Parse a JSONL trace into session metadata and a flat list of display cards.
 *
 * Each non-blank line is parsed as JSON; lines that fail to parse, or that parse to
 * something other than an object, are skipped. Records are handled by `type`:
 * - `session_meta`: its `payload` becomes the returned `meta`.
 * - `event_msg` with a `user_message` payload: a "User" card.
 * - `response_item`: `message`, `reasoning`, `function_call` and
 *   `function_call_output` payloads each produce a card; other payloads are ignored.
 *
 * Once a user message has been seen as an `event_msg`, `response_item` messages with
 * role "user" are skipped.
 *
 * @param raw Contents of a `.jsonl` trace file.
 * @returns `{ meta, cards }` in file order.
 */
function parseTrace(raw: string) {
  const cards: TraceCard[] = [];
  let meta: Record<string, any> = {};
  let sawEventUser = false;

  // Card text is later handed to string-only helpers, so coerce other shapes here.
  const asText = (value: unknown): string => {
    if (typeof value === "string") return value;
    if (value == null) return "";
    return typeof value === "object" ? JSON.stringify(value, null, 2) : String(value);
  };

  for (const line of raw.split(/\r?\n/)) {
    if (!line.trim()) continue;

    let item: any;
    try {
      item = JSON.parse(line);
    } catch {
      continue;
    }
    // Valid JSON that is not an object (`null`, a number, a string) is not a record.
    if (!item || typeof item !== "object") continue;

    if (item.type === "session_meta") {
      meta = item.payload ?? {};
      continue;
    }

    const payload = item.payload ?? {};

    if (item.type === "event_msg") {
      if (payload.type === "user_message" && payload.message) {
        sawEventUser = true;
        cards.push({
          kind: "user",
          label: "User",
          meta: item.timestamp,
          text: asText(payload.message),
        });
      }
      continue;
    }

    if (item.type !== "response_item") continue;

    if (payload.type === "message") {
      // An empty or non-string role would break the capitalisation below.
      const role =
        typeof payload.role === "string" && payload.role !== "" ? payload.role : "message";
      if (role === "user" && sawEventUser) continue;
      const text = contentText(payload.content);
      if (!text.trim()) continue;
      const isDeveloper = role === "developer";
      cards.push({
        kind: isDeveloper ? "system" : role,
        label: isDeveloper ? "System" : role.charAt(0).toUpperCase() + role.slice(1),
        meta: item.timestamp,
        text,
      });
      continue;
    }

    if (payload.type === "reasoning") {
      const text = contentText(payload.summary);
      if (!text.trim()) continue;
      cards.push({
        kind: "thinking",
        label: "Thinking",
        meta: item.timestamp,
        text,
      });
      continue;
    }

    if (payload.type === "function_call") {
      // Defensive: arguments holding the JSON text `null` must not crash the lookup below.
      const args = parseArgs(payload.arguments) ?? {};
      const command = args.command;
      let body: string;
      if (!command) {
        body = JSON.stringify(args, null, 2);
      } else if (Array.isArray(command)) {
        // An argv-style command reads better space-separated than comma-joined.
        body = command.map(String).join(" ");
      } else {
        body = asText(command);
      }
      cards.push({
        kind: "tool-call",
        label: `Tool Call · ${payload.name ?? "tool"}`,
        meta: payload.call_id,
        text: body,
      });
      continue;
    }

    if (payload.type === "function_call_output") {
      cards.push({
        kind: payload.status === "success" ? "tool-result success" : "tool-result",
        label: "Tool Result",
        meta: payload.call_id,
        text: asText(payload.output),
        status: payload.status,
      });
    }
  }

  return { meta, cards };
}
|
| 169 |
+
|
| 170 |
+
// Load the requested trace; when it cannot be resolved, fall back to an empty trace.
const rawTrace = resolveTrace(src);
const trace = rawTrace ? parseTrace(rawTrace) : { meta: {}, cards: [] };

// Last path segment of `src`: shown as the session name when the trace has no id,
// and used to build the default link.
const filename = src.slice(src.lastIndexOf("/") + 1);

// Link behind "Open raw trace": the explicit `sourceUrl`, else the file in the dataset repo.
const externalHref =
  sourceUrl ??
  `https://huggingface.co/datasets/evalstate/all-defects/blob/main/${encodeURIComponent(filename)}`;

// Cards at or after this index (the last four) start expanded.
const openedCutoff = Math.max(0, trace.cards.length - 4);
|
| 177 |
+
|
| 178 |
+
// Render one card as a <details> element. The first two cards and the last four
// start open; every interpolated value is passed through escapeHtml.
const renderCard = (card: TraceCard, index: number): string => {
  const open = index < 2 || index >= openedCutoff ? " open" : "";
  const meta = card.meta ? `<span>${escapeHtml(card.meta)}</span>` : "";
  const status = card.status ? `<span>${escapeHtml(card.status)}</span>` : "";
  return `
<details class="trace-event ${escapeHtml(card.kind)}"${open}>
<summary>
<span class="trace-pill">${escapeHtml(card.label)}</span>
<span class="trace-preview">${escapeHtml(compact(card.text))}</span>
<span class="trace-meta">${meta}${status}</span>
</summary>
<pre>${escapeHtml(card.text)}</pre>
</details>
`;
};

// Markup for the event list, or a "not found" notice when the trace could not be resolved.
const cardHtml = rawTrace
  ? trace.cards.map((card, index) => renderCard(card, index)).join("")
  : `<div class="trace-missing">Trace not found: <code>${escapeHtml(src)}</code></div>`;
|
| 197 |
+
|
| 198 |
+
// Stylesheet for the document rendered inside the iframe. Colours are custom
// properties so the `data-theme="dark"` override only has to swap the palette.
const traceStyles = `:root {
color-scheme: light;
--trace-page: #ffffff;
--trace-surface: #f8fafc;
--trace-card: #ffffff;
--trace-border: #e5e7eb;
--trace-text: #1f2937;
--trace-muted: #64748b;
--trace-code: #334155;
--trace-system: #d97706;
--trace-user: #2563eb;
--trace-assistant: #ea580c;
--trace-tool: #475569;
--trace-success: #10b981;
}
:root[data-theme="dark"] {
color-scheme: dark;
--trace-page: #0f1115;
--trace-surface: #111827;
--trace-card: #172033;
--trace-border: #334155;
--trace-text: #e5e7eb;
--trace-muted: #9ca3af;
--trace-code: #d1d5db;
--trace-system: #f59e0b;
--trace-user: #60a5fa;
--trace-assistant: #fb923c;
--trace-tool: #cbd5e1;
--trace-success: #34d399;
}
* { box-sizing: border-box; }
body {
margin: 0;
background: var(--trace-page);
color: var(--trace-text);
font-family: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
font-size: 14px;
line-height: 1.55;
}
.trace-shell {
min-height: 100vh;
background: var(--trace-page);
}
.trace-topbar {
position: sticky;
top: 0;
z-index: 2;
display: flex;
align-items: center;
justify-content: space-between;
gap: 16px;
padding: 10px 14px;
border-bottom: 1px solid var(--trace-border);
background: color-mix(in srgb, var(--trace-surface) 94%, transparent);
}
.trace-brand {
display: flex;
min-width: 0;
align-items: center;
gap: 8px;
font-weight: 650;
}
.trace-brand svg {
width: 16px;
height: 16px;
flex: none;
color: var(--trace-muted);
}
.trace-session {
min-width: 0;
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
color: var(--trace-muted);
font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace;
font-size: 11px;
}
.trace-link {
flex: none;
color: var(--trace-user);
font-size: 12px;
text-decoration: none;
}
.trace-link:hover { text-decoration: underline; }
.trace-meta-grid {
display: grid;
grid-template-columns: repeat(3, minmax(0, 1fr));
gap: 8px;
padding: 12px 14px;
border-bottom: 1px solid var(--trace-border);
background: var(--trace-page);
}
.trace-kv {
min-width: 0;
border: 1px solid var(--trace-border);
border-radius: 8px;
background: var(--trace-card);
padding: 8px 10px;
}
.trace-kv span {
display: block;
color: var(--trace-muted);
font-size: 11px;
text-transform: uppercase;
}
.trace-kv strong {
display: block;
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace;
font-size: 12px;
font-weight: 600;
}
.trace-events {
padding: 14px;
}
.trace-event {
overflow: hidden;
border: 1px solid var(--trace-border);
border-radius: 8px;
background: var(--trace-card);
box-shadow: 0 1px 2px rgba(15, 23, 42, 0.04);
}
.trace-event + .trace-event {
margin-top: 10px;
}
.trace-event summary {
display: grid;
grid-template-columns: max-content minmax(0, 1fr) max-content;
align-items: center;
gap: 8px;
min-height: 34px;
padding: 6px 8px;
cursor: pointer;
list-style: none;
}
.trace-event summary::-webkit-details-marker {
display: none;
}
.trace-event summary:hover {
background: color-mix(in srgb, var(--trace-surface) 78%, transparent);
}
.trace-pill {
border: 1px solid var(--trace-border);
border-radius: 6px;
background: var(--trace-surface);
padding: 2px 7px;
color: var(--trace-muted);
font-size: 12px;
font-weight: 650;
white-space: nowrap;
}
.system .trace-pill { color: var(--trace-system); }
.user .trace-pill { color: var(--trace-user); }
.assistant .trace-pill { color: var(--trace-assistant); }
.tool-call .trace-pill,
.tool-result .trace-pill { color: var(--trace-tool); }
.tool-result.success .trace-meta::before {
content: "";
display: inline-block;
width: 7px;
height: 7px;
margin-right: 5px;
border-radius: 50%;
background: var(--trace-success);
}
.trace-preview {
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
color: var(--trace-muted);
font-size: 13px;
}
.trace-meta {
display: flex;
align-items: center;
gap: 6px;
color: var(--trace-muted);
font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace;
font-size: 11px;
}
.trace-event pre {
overflow: auto;
max-height: 520px;
margin: 0;
border-top: 1px solid var(--trace-border);
padding: 12px;
background: color-mix(in srgb, var(--trace-surface) 84%, var(--trace-card));
color: var(--trace-code);
white-space: pre-wrap;
word-break: break-word;
font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace;
font-size: 12px;
line-height: 1.45;
}
.trace-missing {
padding: 18px;
color: #b91c1c;
}
@media (max-width: 680px) {
.trace-topbar {
align-items: flex-start;
flex-direction: column;
gap: 4px;
}
.trace-meta-grid {
grid-template-columns: 1fr;
}
.trace-event summary {
grid-template-columns: 1fr;
}
.trace-meta {
display: none;
}
}`;

// Script run inside the iframe: copy the parent document's `data-theme` attribute
// onto the iframe's root element, and keep it in sync via a MutationObserver.
// Failures (for example when the parent is not accessible) are deliberately ignored.
const themeSyncScript = `(() => {
const applyTheme = () => {
try {
const theme = parent.document.documentElement.getAttribute("data-theme");
if (theme) document.documentElement.setAttribute("data-theme", theme);
else document.documentElement.removeAttribute("data-theme");
} catch {}
};
applyTheme();
try {
new MutationObserver(applyTheme).observe(parent.document.documentElement, {
attributes: true,
attributeFilter: ["data-theme"],
});
} catch {}
})();`;

// Complete HTML document handed to the iframe through `srcdoc`: a top bar with the
// session id (or file name) and raw-trace link, a three-cell metadata grid, then the cards.
const iframeDoc = `<!doctype html>
<html>
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<style>
${traceStyles}
</style>
</head>
<body>
<div class="trace-shell">
<div class="trace-topbar">
<div class="trace-brand">
<svg viewBox="0 0 12 12" aria-hidden="true"><path d="M10.28 5.1a2.5 2.5 0 0 0-.21-2.05 2.52 2.52 0 0 0-2.71-1.21 2.53 2.53 0 0 0-4.29.9 2.5 2.5 0 0 0-1.66 1.21 2.52 2.52 0 0 0 .3 2.96 2.5 2.5 0 0 0 .22 2.04 2.52 2.52 0 0 0 2.72 1.21 2.5 2.5 0 0 0 1.87.84 2.52 2.52 0 0 0 2.4-1.75 2.5 2.5 0 0 0 2-2.73 2.52 2.52 0 0 0-.64-1.43Z" fill="currentColor"/></svg>
<span>Codex trace</span>
<span class="trace-session">${escapeHtml(trace.meta.id ?? filename)}</span>
</div>
<a class="trace-link" href="${escapeHtml(externalHref)}" target="_blank" rel="noopener noreferrer">Open raw trace</a>
</div>
<div class="trace-meta-grid">
<div class="trace-kv"><span>Model</span><strong>${escapeHtml(trace.meta.model_spec ?? "unknown")}</strong></div>
<div class="trace-kv"><span>Started</span><strong>${escapeHtml(trace.meta.timestamp ?? "unknown")}</strong></div>
<div class="trace-kv"><span>Working Directory</span><strong>${escapeHtml(trace.meta.cwd ?? "unknown")}</strong></div>
</div>
<div class="trace-events">${cardHtml}</div>
</div>
<script>
${themeSyncScript}
</script>
</body>
</html>`;
|
| 459 |
+
---
|
| 460 |
+
|
| 461 |
+
{/* Figure wrapper: optional title caption, the srcdoc iframe, optional HTML description. */}
<figure class={["html-embed", "trace-embed", wide && "html-embed--wide"].filter(Boolean).join(" ")}>
  {title ? <figcaption class="html-embed__title">{title}</figcaption> : null}
  <div class="html-embed__card trace-embed__card">
    {/* No sandbox attribute is set: the embedded script reads the parent document's data-theme. */}
    <iframe
      class="trace-embed__iframe"
      title={title ?? "Agent trace"}
      srcdoc={iframeDoc}
      style={`height:${height}px`}
      loading="lazy"
    ></iframe>
  </div>
  {desc ? <figcaption class="html-embed__desc" set:html={desc} /> : null}
</figure>
|
| 474 |
+
|
| 475 |
+
<style is:global>
  /* Let the iframe fill the card edge to edge and clip it to the card's shape. */
  .trace-embed__card {
    padding: 0;
    overflow: hidden;
  }

  /* Borderless, full-width iframe; --surface-bg is expected to be defined by the host page. */
  .trace-embed__iframe {
    display: block;
    width: 100%;
    min-height: 460px;
    border: 0;
    background: var(--surface-bg);
  }

  /* Use a taller minimum height on narrow viewports. */
  @media (max-width: 768px) {
    .trace-embed__iframe { min-height: 520px; }
  }
</style>
|
app/src/content/assets/traces/all-defects-750-batch-prs-45267-to-45189-20260429T113834Z.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
app/src/content/assets/traces/all-defects-750-batch-prs-45699-to-45549-20260429T090102Z.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
app/src/content/chapters/slopfarmer/content.mdx
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import HtmlEmbed from '../../../components/HtmlEmbed.astro';
|
|
|
|
| 2 |
|
| 3 |
## The shape of the problem
|
| 4 |
|
|
@@ -64,7 +65,16 @@ We found and built several experimental tools for this to work. They each approa
|
|
| 64 |
|
| 65 |
The experiment ran on a fork at [evalstate/transformers](https://github.com/evalstate/transformers). The process was straightforward: take clusters of related PRs, merge them into worktrees, and have an agent assess whether the combined result was valid. Each merged PR includes a comment with the full agent trace showing the reasoning.
|
| 66 |
|
| 67 |
-
Some clusters merged cleanly. The agent identified that multiple PRs fixed the same underlying bug and combined the best parts of each. Other clusters were rejected because the agent determined the fix had already been merged upstream. For instance, three separate contributors tried to add a feature that was already in `main`. The raw traces are published as
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
The combined PR containing the merged results is at [evalstate/transformers#42](https://github.com/evalstate/transformers/pull/42).
|
| 70 |
|
|
|
|
| 1 |
import HtmlEmbed from '../../../components/HtmlEmbed.astro';
|
| 2 |
+
import TraceEmbed from '../../../components/TraceEmbed.astro';
|
| 3 |
|
| 4 |
## The shape of the problem
|
| 5 |
|
|
|
|
| 65 |
|
| 66 |
The experiment ran on a fork at [evalstate/transformers](https://github.com/evalstate/transformers). The process was straightforward: take clusters of related PRs, merge them into worktrees, and have an agent assess whether the combined result was valid. Each merged PR includes a comment with the full agent trace showing the reasoning.
|
| 67 |
|
| 68 |
+
Some clusters merged cleanly. The agent identified that multiple PRs fixed the same underlying bug and combined the best parts of each. Other clusters were rejected because the agent determined the fix had already been merged upstream. For instance, three separate contributors tried to add a feature that was already in `main`. The raw traces are published as datasets at [evalstate/all-defects](https://huggingface.co/datasets/evalstate/all-defects) and [evalstate/transformers-merge-experiments](https://huggingface.co/datasets/evalstate/transformers-merge-experiments).
|
| 69 |
+
|
| 70 |
+
The traces are useful because they show the protocol, not just the outcome. In one batch, the agent merged six defect PRs in sequence and reran validation after each integration. In another, it handled a more typical mixed batch: some PRs were merged or patched, one was aborted because the codebase had moved on, and one was reset after validation failed. Below you can explore the first of these traces, the batch of six validated merges.
|
| 71 |
+
|
| 72 |
+
<TraceEmbed
|
| 73 |
+
src="all-defects-750-batch-prs-45699-to-45549-20260429T090102Z.jsonl"
|
| 74 |
+
title="Trace: six validated defect merges"
|
| 75 |
+
desc="A cumulative all-defects batch where six defect PRs merged cleanly and baseline plus per-merge validation passed."
|
| 76 |
+
sourceUrl="https://huggingface.co/datasets/evalstate/all-defects/blob/main/all-defects-750-batch-prs-45699-to-45549-20260429T090102Z.jsonl"
|
| 77 |
+
/>
|
| 78 |
|
| 79 |
The combined PR containing the merged results is at [evalstate/transformers#42](https://github.com/evalstate/transformers/pull/42).
|
| 80 |
|