Zayne Rea Sprague committed on
Commit ·
6b7050a
1
Parent(s): 45f09e7
bump
Browse files
backend/api/rlm_eval_datasets.py
CHANGED
|
@@ -36,12 +36,20 @@ def _build_hierarchy(rows: list[dict]) -> dict:
|
|
| 36 |
|
| 37 |
ex = examples[ei]
|
| 38 |
|
| 39 |
-
# Parse code blocks
|
| 40 |
code_blocks = []
|
| 41 |
cbj = row.get("code_blocks_json", "")
|
| 42 |
if cbj and cbj != "[]":
|
| 43 |
try:
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
except (json.JSONDecodeError, TypeError):
|
| 46 |
code_blocks = []
|
| 47 |
|
|
|
|
| 36 |
|
| 37 |
ex = examples[ei]
|
| 38 |
|
| 39 |
+
# Parse code blocks, flattening result.stdout -> stdout
|
| 40 |
code_blocks = []
|
| 41 |
cbj = row.get("code_blocks_json", "")
|
| 42 |
if cbj and cbj != "[]":
|
| 43 |
try:
|
| 44 |
+
raw_blocks = json.loads(cbj) if isinstance(cbj, str) else cbj
|
| 45 |
+
for cb in raw_blocks:
|
| 46 |
+
block = {"code": cb.get("code", "")}
|
| 47 |
+
result = cb.get("result", {})
|
| 48 |
+
if isinstance(result, dict) and result.get("stdout"):
|
| 49 |
+
block["stdout"] = result["stdout"]
|
| 50 |
+
elif cb.get("stdout"):
|
| 51 |
+
block["stdout"] = cb["stdout"]
|
| 52 |
+
code_blocks.append(block)
|
| 53 |
except (json.JSONDecodeError, TypeError):
|
| 54 |
code_blocks = []
|
| 55 |
|
frontend/src/harbor/components/ChatBubble.tsx
CHANGED
|
@@ -5,11 +5,9 @@ import type { RawStep, RawToolCall, AtifStep } from "../types";
|
|
| 5 |
|
| 6 |
interface RawBubbleProps {
|
| 7 |
step: RawStep;
|
| 8 |
-
// Map of tool_call_id → tool response content for pairing
|
| 9 |
-
toolResponses?: Map<string, string>;
|
| 10 |
}
|
| 11 |
|
| 12 |
-
export function RawBubble({ step
|
| 13 |
const [expanded, setExpanded] = useState(false);
|
| 14 |
|
| 15 |
if (step.role === "system") {
|
|
@@ -71,13 +69,9 @@ export function RawBubble({ step, toolResponses }: RawBubbleProps) {
|
|
| 71 |
)}
|
| 72 |
|
| 73 |
{step.tool_calls && step.tool_calls.length > 0 && (
|
| 74 |
-
<div className="mt-2 space-y-
|
| 75 |
{step.tool_calls.map((tc, i) => (
|
| 76 |
-
<
|
| 77 |
-
key={i}
|
| 78 |
-
toolCall={tc}
|
| 79 |
-
response={toolResponses?.get(tc.id)}
|
| 80 |
-
/>
|
| 81 |
))}
|
| 82 |
</div>
|
| 83 |
)}
|
|
@@ -87,13 +81,11 @@ export function RawBubble({ step, toolResponses }: RawBubbleProps) {
|
|
| 87 |
}
|
| 88 |
|
| 89 |
if (step.role === "tool") {
|
| 90 |
-
// Tool responses are shown inline with their tool_call above when possible
|
| 91 |
-
// Only render standalone if not paired
|
| 92 |
return (
|
| 93 |
<div className="flex justify-start mb-3">
|
| 94 |
<div className="max-w-[85%] rounded-lg px-3 py-2 bg-gray-800 border border-gray-700">
|
| 95 |
<div className="text-xs font-medium text-gray-400 mb-1">
|
| 96 |
-
|
| 97 |
</div>
|
| 98 |
<ContentBlock
|
| 99 |
content={step.content}
|
|
@@ -118,6 +110,7 @@ export function RawBubble({ step, toolResponses }: RawBubbleProps) {
|
|
| 118 |
}
|
| 119 |
|
| 120 |
// ---- ATIF step bubble ----
|
|
|
|
| 121 |
|
| 122 |
interface AtifBubbleProps {
|
| 123 |
step: AtifStep;
|
|
@@ -155,11 +148,11 @@ export function AtifBubble({ step }: AtifBubbleProps) {
|
|
| 155 |
|
| 156 |
if (step.source === "agent") {
|
| 157 |
return (
|
| 158 |
-
<
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
<div className="rounded-lg px-4 py-3 bg-violet-900/20 border border-violet-800/30">
|
| 163 |
<div className="text-xs font-medium text-violet-400 mb-1">Reasoning</div>
|
| 164 |
<ContentBlock
|
| 165 |
content={step.reasoning}
|
|
@@ -168,45 +161,39 @@ export function AtifBubble({ step }: AtifBubbleProps) {
|
|
| 168 |
maxPreview={300}
|
| 169 |
/>
|
| 170 |
</div>
|
| 171 |
-
|
|
|
|
| 172 |
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
<
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
|
|
|
|
|
|
| 184 |
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
{tc.function}
|
| 194 |
-
</div>
|
| 195 |
-
{tc.command && (
|
| 196 |
-
<pre className="code-block text-amber-200 mt-1 whitespace-pre-wrap break-all">
|
| 197 |
-
{tc.command}
|
| 198 |
-
</pre>
|
| 199 |
-
)}
|
| 200 |
-
</div>
|
| 201 |
-
))}
|
| 202 |
-
</div>
|
| 203 |
-
)}
|
| 204 |
</div>
|
|
|
|
| 205 |
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
|
|
|
| 210 |
<ContentBlock
|
| 211 |
content={step.observation}
|
| 212 |
expanded={expanded}
|
|
@@ -215,23 +202,23 @@ export function AtifBubble({ step }: AtifBubbleProps) {
|
|
| 215 |
mono
|
| 216 |
/>
|
| 217 |
</div>
|
| 218 |
-
|
|
|
|
| 219 |
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
</div>
|
| 235 |
);
|
| 236 |
}
|
| 237 |
|
|
@@ -307,15 +294,14 @@ function ContentBlock({
|
|
| 307 |
);
|
| 308 |
}
|
| 309 |
|
| 310 |
-
|
|
|
|
| 311 |
toolCall,
|
| 312 |
-
response,
|
| 313 |
}: {
|
| 314 |
-
toolCall:
|
| 315 |
-
response?: string;
|
| 316 |
}) {
|
| 317 |
const [expanded, setExpanded] = useState(false);
|
| 318 |
-
const cmd = toolCall.command ||
|
| 319 |
|
| 320 |
return (
|
| 321 |
<div className="rounded bg-amber-900/30 border border-amber-800/30 px-3 py-2">
|
|
@@ -323,12 +309,14 @@ function ToolCallBlock({
|
|
| 323 |
<span className="text-xs text-amber-400 font-medium">
|
| 324 |
{toolCall.function}
|
| 325 |
</span>
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
|
|
|
|
|
|
| 332 |
</div>
|
| 333 |
|
| 334 |
{cmd && (
|
|
@@ -336,15 +324,6 @@ function ToolCallBlock({
|
|
| 336 |
{expanded ? cmd : cmd.length > 300 ? cmd.slice(0, 300) + "..." : cmd}
|
| 337 |
</pre>
|
| 338 |
)}
|
| 339 |
-
|
| 340 |
-
{expanded && response && (
|
| 341 |
-
<div className="mt-2 rounded bg-gray-800/80 px-2 py-1.5 max-h-64 overflow-y-auto">
|
| 342 |
-
<div className="text-xs text-gray-400 mb-1">Output:</div>
|
| 343 |
-
<pre className="code-block text-gray-300 whitespace-pre-wrap break-all">
|
| 344 |
-
{response}
|
| 345 |
-
</pre>
|
| 346 |
-
</div>
|
| 347 |
-
)}
|
| 348 |
</div>
|
| 349 |
);
|
| 350 |
}
|
|
|
|
| 5 |
|
| 6 |
interface RawBubbleProps {
|
| 7 |
step: RawStep;
|
|
|
|
|
|
|
| 8 |
}
|
| 9 |
|
| 10 |
+
export function RawBubble({ step }: RawBubbleProps) {
|
| 11 |
const [expanded, setExpanded] = useState(false);
|
| 12 |
|
| 13 |
if (step.role === "system") {
|
|
|
|
| 69 |
)}
|
| 70 |
|
| 71 |
{step.tool_calls && step.tool_calls.length > 0 && (
|
| 72 |
+
<div className="mt-2 space-y-1">
|
| 73 |
{step.tool_calls.map((tc, i) => (
|
| 74 |
+
<ToolCallCommand key={i} toolCall={tc} />
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
))}
|
| 76 |
</div>
|
| 77 |
)}
|
|
|
|
| 81 |
}
|
| 82 |
|
| 83 |
if (step.role === "tool") {
|
|
|
|
|
|
|
| 84 |
return (
|
| 85 |
<div className="flex justify-start mb-3">
|
| 86 |
<div className="max-w-[85%] rounded-lg px-3 py-2 bg-gray-800 border border-gray-700">
|
| 87 |
<div className="text-xs font-medium text-gray-400 mb-1">
|
| 88 |
+
Environment
|
| 89 |
</div>
|
| 90 |
<ContentBlock
|
| 91 |
content={step.content}
|
|
|
|
| 110 |
}
|
| 111 |
|
| 112 |
// ---- ATIF step bubble ----
|
| 113 |
+
// Returns an array of elements: agent action (right) + environment response (left)
|
| 114 |
|
| 115 |
interface AtifBubbleProps {
|
| 116 |
step: AtifStep;
|
|
|
|
| 148 |
|
| 149 |
if (step.source === "agent") {
|
| 150 |
return (
|
| 151 |
+
<>
|
| 152 |
+
{/* Reasoning (right, violet) */}
|
| 153 |
+
{step.reasoning && (
|
| 154 |
+
<div className="flex justify-end mb-3">
|
| 155 |
+
<div className="max-w-[85%] rounded-lg px-4 py-3 bg-violet-900/20 border border-violet-800/30">
|
| 156 |
<div className="text-xs font-medium text-violet-400 mb-1">Reasoning</div>
|
| 157 |
<ContentBlock
|
| 158 |
content={step.reasoning}
|
|
|
|
| 161 |
maxPreview={300}
|
| 162 |
/>
|
| 163 |
</div>
|
| 164 |
+
</div>
|
| 165 |
+
)}
|
| 166 |
|
| 167 |
+
{/* Agent message + tool calls (right, blue) */}
|
| 168 |
+
{(step.message || (step.tool_calls && step.tool_calls.length > 0)) && (
|
| 169 |
+
<div className="flex justify-end mb-3">
|
| 170 |
+
<div className="max-w-[85%] rounded-lg px-4 py-3 bg-blue-900/30 border border-blue-800/50">
|
| 171 |
+
<div className="text-xs font-medium text-blue-400 mb-1">Agent</div>
|
| 172 |
+
{step.message && (
|
| 173 |
+
<ContentBlock
|
| 174 |
+
content={step.message}
|
| 175 |
+
expanded={expanded}
|
| 176 |
+
onToggle={() => setExpanded(!expanded)}
|
| 177 |
+
maxPreview={400}
|
| 178 |
+
/>
|
| 179 |
+
)}
|
| 180 |
|
| 181 |
+
{step.tool_calls && step.tool_calls.length > 0 && (
|
| 182 |
+
<div className="mt-2 space-y-1">
|
| 183 |
+
{step.tool_calls.map((tc, i) => (
|
| 184 |
+
<ToolCallCommand key={i} toolCall={tc} />
|
| 185 |
+
))}
|
| 186 |
+
</div>
|
| 187 |
+
)}
|
| 188 |
+
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
</div>
|
| 190 |
+
)}
|
| 191 |
|
| 192 |
+
{/* Observation / Environment response (LEFT, gray) */}
|
| 193 |
+
{step.observation && (
|
| 194 |
+
<div className="flex justify-start mb-3">
|
| 195 |
+
<div className="max-w-[85%] rounded-lg px-3 py-2 bg-gray-800 border border-gray-700">
|
| 196 |
+
<div className="text-xs font-medium text-gray-400 mb-1">Environment</div>
|
| 197 |
<ContentBlock
|
| 198 |
content={step.observation}
|
| 199 |
expanded={expanded}
|
|
|
|
| 202 |
mono
|
| 203 |
/>
|
| 204 |
</div>
|
| 205 |
+
</div>
|
| 206 |
+
)}
|
| 207 |
|
| 208 |
+
{/* Step metrics */}
|
| 209 |
+
{step.metrics && Object.keys(step.metrics).length > 0 && (
|
| 210 |
+
<div className="flex gap-2 flex-wrap mb-3 justify-end">
|
| 211 |
+
{Object.entries(step.metrics).map(([k, v]) => (
|
| 212 |
+
<span
|
| 213 |
+
key={k}
|
| 214 |
+
className="px-1.5 py-0.5 rounded text-xs bg-gray-800 text-gray-500"
|
| 215 |
+
>
|
| 216 |
+
{k}: {typeof v === "number" ? v.toFixed(2) : String(v)}
|
| 217 |
+
</span>
|
| 218 |
+
))}
|
| 219 |
+
</div>
|
| 220 |
+
)}
|
| 221 |
+
</>
|
|
|
|
| 222 |
);
|
| 223 |
}
|
| 224 |
|
|
|
|
| 294 |
);
|
| 295 |
}
|
| 296 |
|
| 297 |
+
/** Shows just the command portion of a tool call (no hidden expand for response). */
|
| 298 |
+
function ToolCallCommand({
|
| 299 |
toolCall,
|
|
|
|
| 300 |
}: {
|
| 301 |
+
toolCall: { function: string; command?: string; arguments?: Record<string, unknown> };
|
|
|
|
| 302 |
}) {
|
| 303 |
const [expanded, setExpanded] = useState(false);
|
| 304 |
+
const cmd = toolCall.command || "";
|
| 305 |
|
| 306 |
return (
|
| 307 |
<div className="rounded bg-amber-900/30 border border-amber-800/30 px-3 py-2">
|
|
|
|
| 309 |
<span className="text-xs text-amber-400 font-medium">
|
| 310 |
{toolCall.function}
|
| 311 |
</span>
|
| 312 |
+
{cmd.length > 300 && (
|
| 313 |
+
<button
|
| 314 |
+
onClick={() => setExpanded(!expanded)}
|
| 315 |
+
className="text-xs text-gray-500 hover:text-gray-300"
|
| 316 |
+
>
|
| 317 |
+
{expanded ? "[-]" : "[+]"}
|
| 318 |
+
</button>
|
| 319 |
+
)}
|
| 320 |
</div>
|
| 321 |
|
| 322 |
{cmd && (
|
|
|
|
| 324 |
{expanded ? cmd : cmd.length > 300 ? cmd.slice(0, 300) + "..." : cmd}
|
| 325 |
</pre>
|
| 326 |
)}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 327 |
</div>
|
| 328 |
);
|
| 329 |
}
|
frontend/src/harbor/components/TrajectoryView.tsx
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import { useRef, useEffect } from "react";
|
| 2 |
-
import type { DatasetInfo, InstanceDetail, TrajectoryMode
|
| 3 |
import { RawBubble, AtifBubble } from "./ChatBubble";
|
| 4 |
import { StepDetail } from "./StepDetail";
|
| 5 |
|
|
@@ -19,42 +19,6 @@ export function TrajectoryView({ dataset, detail, mode, isSingle }: Props) {
|
|
| 19 |
}
|
| 20 |
}, [detail.instance_id, mode]);
|
| 21 |
|
| 22 |
-
// Build tool_call_id → response map for raw mode
|
| 23 |
-
const toolResponseMap = new Map<string, string>();
|
| 24 |
-
if (mode === "raw") {
|
| 25 |
-
for (const step of detail.raw_steps) {
|
| 26 |
-
if (step.role === "tool" && step.tool_call_id) {
|
| 27 |
-
toolResponseMap.set(step.tool_call_id, step.content);
|
| 28 |
-
}
|
| 29 |
-
}
|
| 30 |
-
}
|
| 31 |
-
|
| 32 |
-
// For raw mode, group assistant messages with their tool responses
|
| 33 |
-
// to avoid showing tool responses twice
|
| 34 |
-
const pairedToolCallIds = new Set<string>();
|
| 35 |
-
if (mode === "raw") {
|
| 36 |
-
for (const step of detail.raw_steps) {
|
| 37 |
-
if (step.role === "assistant" && step.tool_calls) {
|
| 38 |
-
for (const tc of step.tool_calls) {
|
| 39 |
-
if (toolResponseMap.has(tc.id)) {
|
| 40 |
-
pairedToolCallIds.add(tc.id);
|
| 41 |
-
}
|
| 42 |
-
}
|
| 43 |
-
}
|
| 44 |
-
}
|
| 45 |
-
}
|
| 46 |
-
|
| 47 |
-
// Filter out standalone tool messages that are already paired
|
| 48 |
-
const filteredRawSteps: RawStep[] =
|
| 49 |
-
mode === "raw"
|
| 50 |
-
? detail.raw_steps.filter((step) => {
|
| 51 |
-
if (step.role === "tool" && step.tool_call_id) {
|
| 52 |
-
return !pairedToolCallIds.has(step.tool_call_id);
|
| 53 |
-
}
|
| 54 |
-
return true;
|
| 55 |
-
})
|
| 56 |
-
: [];
|
| 57 |
-
|
| 58 |
return (
|
| 59 |
<div
|
| 60 |
className={`flex flex-col overflow-hidden ${
|
|
@@ -89,16 +53,13 @@ export function TrajectoryView({ dataset, detail, mode, isSingle }: Props) {
|
|
| 89 |
<div ref={scrollRef} className="flex-1 overflow-y-auto px-4 py-4 space-y-1">
|
| 90 |
{mode === "raw" && (
|
| 91 |
<>
|
| 92 |
-
{
|
| 93 |
-
<RawBubble
|
| 94 |
-
key={step.index}
|
| 95 |
-
step={step}
|
| 96 |
-
toolResponses={toolResponseMap}
|
| 97 |
-
/>
|
| 98 |
))}
|
| 99 |
-
{
|
| 100 |
<div className="text-center text-gray-500 text-sm mt-8">
|
| 101 |
-
No raw trajectory data available
|
|
|
|
| 102 |
</div>
|
| 103 |
)}
|
| 104 |
</>
|
|
|
|
| 1 |
import { useRef, useEffect } from "react";
|
| 2 |
+
import type { DatasetInfo, InstanceDetail, TrajectoryMode } from "../types";
|
| 3 |
import { RawBubble, AtifBubble } from "./ChatBubble";
|
| 4 |
import { StepDetail } from "./StepDetail";
|
| 5 |
|
|
|
|
| 19 |
}
|
| 20 |
}, [detail.instance_id, mode]);
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
return (
|
| 23 |
<div
|
| 24 |
className={`flex flex-col overflow-hidden ${
|
|
|
|
| 53 |
<div ref={scrollRef} className="flex-1 overflow-y-auto px-4 py-4 space-y-1">
|
| 54 |
{mode === "raw" && (
|
| 55 |
<>
|
| 56 |
+
{detail.raw_steps.map((step) => (
|
| 57 |
+
<RawBubble key={step.index} step={step} />
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
))}
|
| 59 |
+
{detail.raw_steps.length === 0 && (
|
| 60 |
<div className="text-center text-gray-500 text-sm mt-8">
|
| 61 |
+
No raw trajectory data available.
|
| 62 |
+
{detail.n_atif_steps > 0 && " Try ATIF Steps mode."}
|
| 63 |
</div>
|
| 64 |
)}
|
| 65 |
</>
|