Zayne Rea Sprague committed on
Commit
6b7050a
·
1 Parent(s): 45f09e7
backend/api/rlm_eval_datasets.py CHANGED
@@ -36,12 +36,20 @@ def _build_hierarchy(rows: list[dict]) -> dict:
36
 
37
  ex = examples[ei]
38
 
39
- # Parse code blocks
40
  code_blocks = []
41
  cbj = row.get("code_blocks_json", "")
42
  if cbj and cbj != "[]":
43
  try:
44
- code_blocks = json.loads(cbj) if isinstance(cbj, str) else cbj
 
 
 
 
 
 
 
 
45
  except (json.JSONDecodeError, TypeError):
46
  code_blocks = []
47
 
 
36
 
37
  ex = examples[ei]
38
 
39
+ # Parse code blocks, flattening result.stdout -> stdout
40
  code_blocks = []
41
  cbj = row.get("code_blocks_json", "")
42
  if cbj and cbj != "[]":
43
  try:
44
+ raw_blocks = json.loads(cbj) if isinstance(cbj, str) else cbj
45
+ for cb in raw_blocks:
46
+ block = {"code": cb.get("code", "")}
47
+ result = cb.get("result", {})
48
+ if isinstance(result, dict) and result.get("stdout"):
49
+ block["stdout"] = result["stdout"]
50
+ elif cb.get("stdout"):
51
+ block["stdout"] = cb["stdout"]
52
+ code_blocks.append(block)
53
  except (json.JSONDecodeError, TypeError):
54
  code_blocks = []
55
 
frontend/src/harbor/components/ChatBubble.tsx CHANGED
@@ -5,11 +5,9 @@ import type { RawStep, RawToolCall, AtifStep } from "../types";
5
 
6
  interface RawBubbleProps {
7
  step: RawStep;
8
- // Map of tool_call_id → tool response content for pairing
9
- toolResponses?: Map<string, string>;
10
  }
11
 
12
- export function RawBubble({ step, toolResponses }: RawBubbleProps) {
13
  const [expanded, setExpanded] = useState(false);
14
 
15
  if (step.role === "system") {
@@ -71,13 +69,9 @@ export function RawBubble({ step, toolResponses }: RawBubbleProps) {
71
  )}
72
 
73
  {step.tool_calls && step.tool_calls.length > 0 && (
74
- <div className="mt-2 space-y-2">
75
  {step.tool_calls.map((tc, i) => (
76
- <ToolCallBlock
77
- key={i}
78
- toolCall={tc}
79
- response={toolResponses?.get(tc.id)}
80
- />
81
  ))}
82
  </div>
83
  )}
@@ -87,13 +81,11 @@ export function RawBubble({ step, toolResponses }: RawBubbleProps) {
87
  }
88
 
89
  if (step.role === "tool") {
90
- // Tool responses are shown inline with their tool_call above when possible
91
- // Only render standalone if not paired
92
  return (
93
  <div className="flex justify-start mb-3">
94
  <div className="max-w-[85%] rounded-lg px-3 py-2 bg-gray-800 border border-gray-700">
95
  <div className="text-xs font-medium text-gray-400 mb-1">
96
- Tool Output
97
  </div>
98
  <ContentBlock
99
  content={step.content}
@@ -118,6 +110,7 @@ export function RawBubble({ step, toolResponses }: RawBubbleProps) {
118
  }
119
 
120
  // ---- ATIF step bubble ----
 
121
 
122
  interface AtifBubbleProps {
123
  step: AtifStep;
@@ -155,11 +148,11 @@ export function AtifBubble({ step }: AtifBubbleProps) {
155
 
156
  if (step.source === "agent") {
157
  return (
158
- <div className="flex justify-end mb-3">
159
- <div className="max-w-[85%] space-y-2">
160
- {/* Reasoning */}
161
- {step.reasoning && (
162
- <div className="rounded-lg px-4 py-3 bg-violet-900/20 border border-violet-800/30">
163
  <div className="text-xs font-medium text-violet-400 mb-1">Reasoning</div>
164
  <ContentBlock
165
  content={step.reasoning}
@@ -168,45 +161,39 @@ export function AtifBubble({ step }: AtifBubbleProps) {
168
  maxPreview={300}
169
  />
170
  </div>
171
- )}
 
172
 
173
- {/* Message / action */}
174
- <div className="rounded-lg px-4 py-3 bg-blue-900/30 border border-blue-800/50">
175
- <div className="text-xs font-medium text-blue-400 mb-1">Agent</div>
176
- {step.message && (
177
- <ContentBlock
178
- content={step.message}
179
- expanded={expanded}
180
- onToggle={() => setExpanded(!expanded)}
181
- maxPreview={400}
182
- />
183
- )}
 
 
184
 
185
- {step.tool_calls && step.tool_calls.length > 0 && (
186
- <div className="mt-2 space-y-1">
187
- {step.tool_calls.map((tc, i) => (
188
- <div
189
- key={i}
190
- className="rounded bg-amber-900/30 border border-amber-800/30 px-3 py-2"
191
- >
192
- <div className="text-xs text-amber-400 font-medium">
193
- {tc.function}
194
- </div>
195
- {tc.command && (
196
- <pre className="code-block text-amber-200 mt-1 whitespace-pre-wrap break-all">
197
- {tc.command}
198
- </pre>
199
- )}
200
- </div>
201
- ))}
202
- </div>
203
- )}
204
  </div>
 
205
 
206
- {/* Observation */}
207
- {step.observation && (
208
- <div className="rounded-lg px-3 py-2 bg-gray-800 border border-gray-700 self-start">
209
- <div className="text-xs font-medium text-gray-400 mb-1">Output</div>
 
210
  <ContentBlock
211
  content={step.observation}
212
  expanded={expanded}
@@ -215,23 +202,23 @@ export function AtifBubble({ step }: AtifBubbleProps) {
215
  mono
216
  />
217
  </div>
218
- )}
 
219
 
220
- {/* Step metrics */}
221
- {step.metrics && Object.keys(step.metrics).length > 0 && (
222
- <div className="flex gap-2 flex-wrap">
223
- {Object.entries(step.metrics).map(([k, v]) => (
224
- <span
225
- key={k}
226
- className="px-1.5 py-0.5 rounded text-xs bg-gray-800 text-gray-500"
227
- >
228
- {k}: {typeof v === "number" ? v.toFixed(2) : String(v)}
229
- </span>
230
- ))}
231
- </div>
232
- )}
233
- </div>
234
- </div>
235
  );
236
  }
237
 
@@ -307,15 +294,14 @@ function ContentBlock({
307
  );
308
  }
309
 
310
- function ToolCallBlock({
 
311
  toolCall,
312
- response,
313
  }: {
314
- toolCall: RawToolCall;
315
- response?: string;
316
  }) {
317
  const [expanded, setExpanded] = useState(false);
318
- const cmd = toolCall.command || toolCall.arguments_raw;
319
 
320
  return (
321
  <div className="rounded bg-amber-900/30 border border-amber-800/30 px-3 py-2">
@@ -323,12 +309,14 @@ function ToolCallBlock({
323
  <span className="text-xs text-amber-400 font-medium">
324
  {toolCall.function}
325
  </span>
326
- <button
327
- onClick={() => setExpanded(!expanded)}
328
- className="text-xs text-gray-500 hover:text-gray-300"
329
- >
330
- {expanded ? "[-]" : "[+]"}
331
- </button>
 
 
332
  </div>
333
 
334
  {cmd && (
@@ -336,15 +324,6 @@ function ToolCallBlock({
336
  {expanded ? cmd : cmd.length > 300 ? cmd.slice(0, 300) + "..." : cmd}
337
  </pre>
338
  )}
339
-
340
- {expanded && response && (
341
- <div className="mt-2 rounded bg-gray-800/80 px-2 py-1.5 max-h-64 overflow-y-auto">
342
- <div className="text-xs text-gray-400 mb-1">Output:</div>
343
- <pre className="code-block text-gray-300 whitespace-pre-wrap break-all">
344
- {response}
345
- </pre>
346
- </div>
347
- )}
348
  </div>
349
  );
350
  }
 
5
 
6
  interface RawBubbleProps {
7
  step: RawStep;
 
 
8
  }
9
 
10
+ export function RawBubble({ step }: RawBubbleProps) {
11
  const [expanded, setExpanded] = useState(false);
12
 
13
  if (step.role === "system") {
 
69
  )}
70
 
71
  {step.tool_calls && step.tool_calls.length > 0 && (
72
+ <div className="mt-2 space-y-1">
73
  {step.tool_calls.map((tc, i) => (
74
+ <ToolCallCommand key={i} toolCall={tc} />
 
 
 
 
75
  ))}
76
  </div>
77
  )}
 
81
  }
82
 
83
  if (step.role === "tool") {
 
 
84
  return (
85
  <div className="flex justify-start mb-3">
86
  <div className="max-w-[85%] rounded-lg px-3 py-2 bg-gray-800 border border-gray-700">
87
  <div className="text-xs font-medium text-gray-400 mb-1">
88
+ Environment
89
  </div>
90
  <ContentBlock
91
  content={step.content}
 
110
  }
111
 
112
  // ---- ATIF step bubble ----
113
+ // Agent steps render a fragment (not an array): agent action (right) + environment response (left)
114
 
115
  interface AtifBubbleProps {
116
  step: AtifStep;
 
148
 
149
  if (step.source === "agent") {
150
  return (
151
+ <>
152
+ {/* Reasoning (right, violet) */}
153
+ {step.reasoning && (
154
+ <div className="flex justify-end mb-3">
155
+ <div className="max-w-[85%] rounded-lg px-4 py-3 bg-violet-900/20 border border-violet-800/30">
156
  <div className="text-xs font-medium text-violet-400 mb-1">Reasoning</div>
157
  <ContentBlock
158
  content={step.reasoning}
 
161
  maxPreview={300}
162
  />
163
  </div>
164
+ </div>
165
+ )}
166
 
167
+ {/* Agent message + tool calls (right, blue) */}
168
+ {(step.message || (step.tool_calls && step.tool_calls.length > 0)) && (
169
+ <div className="flex justify-end mb-3">
170
+ <div className="max-w-[85%] rounded-lg px-4 py-3 bg-blue-900/30 border border-blue-800/50">
171
+ <div className="text-xs font-medium text-blue-400 mb-1">Agent</div>
172
+ {step.message && (
173
+ <ContentBlock
174
+ content={step.message}
175
+ expanded={expanded}
176
+ onToggle={() => setExpanded(!expanded)}
177
+ maxPreview={400}
178
+ />
179
+ )}
180
 
181
+ {step.tool_calls && step.tool_calls.length > 0 && (
182
+ <div className="mt-2 space-y-1">
183
+ {step.tool_calls.map((tc, i) => (
184
+ <ToolCallCommand key={i} toolCall={tc} />
185
+ ))}
186
+ </div>
187
+ )}
188
+ </div>
 
 
 
 
 
 
 
 
 
 
 
189
  </div>
190
+ )}
191
 
192
+ {/* Observation / Environment response (LEFT, gray) */}
193
+ {step.observation && (
194
+ <div className="flex justify-start mb-3">
195
+ <div className="max-w-[85%] rounded-lg px-3 py-2 bg-gray-800 border border-gray-700">
196
+ <div className="text-xs font-medium text-gray-400 mb-1">Environment</div>
197
  <ContentBlock
198
  content={step.observation}
199
  expanded={expanded}
 
202
  mono
203
  />
204
  </div>
205
+ </div>
206
+ )}
207
 
208
+ {/* Step metrics */}
209
+ {step.metrics && Object.keys(step.metrics).length > 0 && (
210
+ <div className="flex gap-2 flex-wrap mb-3 justify-end">
211
+ {Object.entries(step.metrics).map(([k, v]) => (
212
+ <span
213
+ key={k}
214
+ className="px-1.5 py-0.5 rounded text-xs bg-gray-800 text-gray-500"
215
+ >
216
+ {k}: {typeof v === "number" ? v.toFixed(2) : String(v)}
217
+ </span>
218
+ ))}
219
+ </div>
220
+ )}
221
+ </>
 
222
  );
223
  }
224
 
 
294
  );
295
  }
296
 
297
+ /** Shows just the command portion of a tool call (no hidden expand for response). */
298
+ function ToolCallCommand({
299
  toolCall,
 
300
  }: {
301
+ toolCall: { function: string; command?: string; arguments?: Record<string, unknown> };
 
302
  }) {
303
  const [expanded, setExpanded] = useState(false);
304
+ const cmd = toolCall.command || "";
305
 
306
  return (
307
  <div className="rounded bg-amber-900/30 border border-amber-800/30 px-3 py-2">
 
309
  <span className="text-xs text-amber-400 font-medium">
310
  {toolCall.function}
311
  </span>
312
+ {cmd.length > 300 && (
313
+ <button
314
+ onClick={() => setExpanded(!expanded)}
315
+ className="text-xs text-gray-500 hover:text-gray-300"
316
+ >
317
+ {expanded ? "[-]" : "[+]"}
318
+ </button>
319
+ )}
320
  </div>
321
 
322
  {cmd && (
 
324
  {expanded ? cmd : cmd.length > 300 ? cmd.slice(0, 300) + "..." : cmd}
325
  </pre>
326
  )}
 
 
 
 
 
 
 
 
 
327
  </div>
328
  );
329
  }
frontend/src/harbor/components/TrajectoryView.tsx CHANGED
@@ -1,5 +1,5 @@
1
  import { useRef, useEffect } from "react";
2
- import type { DatasetInfo, InstanceDetail, TrajectoryMode, RawStep } from "../types";
3
  import { RawBubble, AtifBubble } from "./ChatBubble";
4
  import { StepDetail } from "./StepDetail";
5
 
@@ -19,42 +19,6 @@ export function TrajectoryView({ dataset, detail, mode, isSingle }: Props) {
19
  }
20
  }, [detail.instance_id, mode]);
21
 
22
- // Build tool_call_id → response map for raw mode
23
- const toolResponseMap = new Map<string, string>();
24
- if (mode === "raw") {
25
- for (const step of detail.raw_steps) {
26
- if (step.role === "tool" && step.tool_call_id) {
27
- toolResponseMap.set(step.tool_call_id, step.content);
28
- }
29
- }
30
- }
31
-
32
- // For raw mode, group assistant messages with their tool responses
33
- // to avoid showing tool responses twice
34
- const pairedToolCallIds = new Set<string>();
35
- if (mode === "raw") {
36
- for (const step of detail.raw_steps) {
37
- if (step.role === "assistant" && step.tool_calls) {
38
- for (const tc of step.tool_calls) {
39
- if (toolResponseMap.has(tc.id)) {
40
- pairedToolCallIds.add(tc.id);
41
- }
42
- }
43
- }
44
- }
45
- }
46
-
47
- // Filter out standalone tool messages that are already paired
48
- const filteredRawSteps: RawStep[] =
49
- mode === "raw"
50
- ? detail.raw_steps.filter((step) => {
51
- if (step.role === "tool" && step.tool_call_id) {
52
- return !pairedToolCallIds.has(step.tool_call_id);
53
- }
54
- return true;
55
- })
56
- : [];
57
-
58
  return (
59
  <div
60
  className={`flex flex-col overflow-hidden ${
@@ -89,16 +53,13 @@ export function TrajectoryView({ dataset, detail, mode, isSingle }: Props) {
89
  <div ref={scrollRef} className="flex-1 overflow-y-auto px-4 py-4 space-y-1">
90
  {mode === "raw" && (
91
  <>
92
- {filteredRawSteps.map((step) => (
93
- <RawBubble
94
- key={step.index}
95
- step={step}
96
- toolResponses={toolResponseMap}
97
- />
98
  ))}
99
- {filteredRawSteps.length === 0 && (
100
  <div className="text-center text-gray-500 text-sm mt-8">
101
- No raw trajectory data available
 
102
  </div>
103
  )}
104
  </>
 
1
  import { useRef, useEffect } from "react";
2
+ import type { DatasetInfo, InstanceDetail, TrajectoryMode } from "../types";
3
  import { RawBubble, AtifBubble } from "./ChatBubble";
4
  import { StepDetail } from "./StepDetail";
5
 
 
19
  }
20
  }, [detail.instance_id, mode]);
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  return (
23
  <div
24
  className={`flex flex-col overflow-hidden ${
 
53
  <div ref={scrollRef} className="flex-1 overflow-y-auto px-4 py-4 space-y-1">
54
  {mode === "raw" && (
55
  <>
56
+ {detail.raw_steps.map((step) => (
57
+ <RawBubble key={step.index} step={step} />
 
 
 
 
58
  ))}
59
+ {detail.raw_steps.length === 0 && (
60
  <div className="text-center text-gray-500 text-sm mt-8">
61
+ No raw trajectory data available.
62
+ {detail.n_atif_steps > 0 && " Try ATIF Steps mode."}
63
  </div>
64
  )}
65
  </>