Zayne Rea Sprague committed on
Commit
7023780
·
1 Parent(s): b630916

vis updates for harbor

Browse files
backend/api/harbor_datasets.py CHANGED
@@ -14,7 +14,7 @@ def _make_id(repo: str, split: str) -> str:
14
 
15
 
16
  def _parse_trajectory(traj_json: str) -> dict:
17
- """Parse ATIF-v1.2 trajectory JSON into structured steps."""
18
  if not traj_json:
19
  return {"steps": [], "agent_info": {}, "final_metrics": {}}
20
 
@@ -35,11 +35,13 @@ def _parse_trajectory(traj_json: str) -> dict:
35
  parsed["reasoning"] = step.get("reasoning_content", "")
36
  parsed["tool_calls"] = []
37
  for tc in step.get("tool_calls", []):
 
38
  tool_call = {
39
  "function": tc.get("function_name", ""),
40
- "arguments": tc.get("arguments", {}),
41
  }
42
- cmd = tc.get("arguments", {}).get("command", "")
 
43
  if cmd:
44
  tool_call["command"] = cmd
45
  parsed["tool_calls"].append(tool_call)
@@ -144,6 +146,82 @@ def _parse_trajectory_raw(traj_raw: str) -> list[dict]:
144
  return steps
145
 
146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  def _build_instance_summary(row: dict) -> dict:
148
  """Build a summary for one instance row."""
149
  return {
@@ -240,8 +318,10 @@ def get_instance(ds_id, instance_id):
240
  # Parse ATIF trajectory
241
  atif = _parse_trajectory(row.get("trajectory", ""))
242
 
243
- # Parse raw trajectory (OpenAI messages)
244
  raw_steps = _parse_trajectory_raw(row.get("trajectory_raw", ""))
 
 
245
 
246
  return jsonify({
247
  "instance_id": instance_id,
 
14
 
15
 
16
  def _parse_trajectory(traj_json: str) -> dict:
17
+ """Parse ATIF trajectory JSON into structured steps (v1.2 and v1.5)."""
18
  if not traj_json:
19
  return {"steps": [], "agent_info": {}, "final_metrics": {}}
20
 
 
35
  parsed["reasoning"] = step.get("reasoning_content", "")
36
  parsed["tool_calls"] = []
37
  for tc in step.get("tool_calls", []):
38
+ args = tc.get("arguments", {})
39
  tool_call = {
40
  "function": tc.get("function_name", ""),
41
+ "arguments": args,
42
  }
43
+ # v1.2 uses "command", v1.5 uses "cmd"
44
+ cmd = args.get("command", "") or args.get("cmd", "")
45
  if cmd:
46
  tool_call["command"] = cmd
47
  parsed["tool_calls"].append(tool_call)
 
146
  return steps
147
 
148
 
149
def _parse_agent_output_jsonl(agent_output: str) -> list[dict]:
    """Parse Codex-style JSONL agent_output into chat-style steps.

    The input is newline-delimited JSON. Only events whose ``type`` is
    ``"item.completed"`` are used; their ``item`` is mapped by ``item["type"]``:

    * ``reasoning``         -> one assistant step flagged with ``_reasoning``.
    * ``agent_message``     -> one assistant step.
    * ``command_execution`` -> an assistant step carrying a single
      ``exec_command`` tool call, followed by a ``tool`` step holding the
      command output (prefixed with ``[exit code: N]`` when an exit code is
      present).

    Lines that are not valid JSON, JSON values that are not objects, events
    of other types, and items of unknown type are skipped silently so that an
    unrecognised format yields fewer (or zero) steps instead of an exception.

    Args:
        agent_output: Raw JSONL text; may be empty or ``None``.

    Returns:
        A list of step dicts with consecutive ``index`` values starting at 0.
    """
    if not agent_output:
        return []

    steps: list[dict] = []

    # Split on "\n" only: str.splitlines() would also break on characters
    # such as U+2028 that may legally appear unescaped inside JSON strings.
    for line in agent_output.strip().split("\n"):
        try:
            event = json.loads(line)
        except (json.JSONDecodeError, TypeError):
            continue

        # json.loads may return a list, string, number or None for a valid
        # JSON line; only objects can be events.
        if not isinstance(event, dict) or event.get("type") != "item.completed":
            continue

        item = event.get("item")
        if not isinstance(item, dict):
            continue

        item_type = item.get("type", "")

        # The index of the next step is always the current length of `steps`.
        if item_type == "reasoning":
            steps.append({
                "index": len(steps),
                "role": "assistant",
                # `or ""` also covers an explicit JSON null.
                "content": item.get("text") or "",
                "_reasoning": True,
            })

        elif item_type == "agent_message":
            steps.append({
                "index": len(steps),
                "role": "assistant",
                "content": item.get("text") or "",
            })

        elif item_type == "command_execution":
            cmd = item.get("command") or ""
            call_id = item.get("call_id", item.get("id", ""))

            # Assistant step with the tool call.
            steps.append({
                "index": len(steps),
                "role": "assistant",
                "content": "",
                "tool_calls": [{
                    "id": call_id,
                    "function": "exec_command",
                    "arguments_raw": json.dumps({"command": cmd}),
                    "arguments": {"command": cmd},
                    "command": cmd,
                }],
            })

            # Tool response step, linked to the call through `tool_call_id`.
            output = item.get("output") or ""
            exit_code = item.get("exit_code")
            response_text = output
            if exit_code is not None:
                response_text = (
                    f"[exit code: {exit_code}]\n{output}"
                    if output
                    else f"[exit code: {exit_code}]"
                )
            steps.append({
                "index": len(steps),
                "role": "tool",
                "content": response_text,
                "tool_call_id": call_id,
            })

    return steps
223
+
224
+
225
  def _build_instance_summary(row: dict) -> dict:
226
  """Build a summary for one instance row."""
227
  return {
 
318
  # Parse ATIF trajectory
319
  atif = _parse_trajectory(row.get("trajectory", ""))
320
 
321
+ # Parse raw trajectory (OpenAI messages), fall back to agent_output JSONL
322
  raw_steps = _parse_trajectory_raw(row.get("trajectory_raw", ""))
323
+ if not raw_steps and row.get("agent_output"):
324
+ raw_steps = _parse_agent_output_jsonl(row["agent_output"])
325
 
326
  return jsonify({
327
  "instance_id": instance_id,
frontend/src/harbor/components/ChatBubble.tsx CHANGED
@@ -39,6 +39,22 @@ export function RawBubble({ step, toolResponses }: RawBubbleProps) {
39
  );
40
  }
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  if (step.role === "assistant") {
43
  return (
44
  <div className="flex justify-end mb-3">
 
39
  );
40
  }
41
 
42
+ if (step.role === "assistant" && step._reasoning) {
43
+ return (
44
+ <div className="flex justify-end mb-3">
45
+ <div className="max-w-[85%] rounded-lg px-4 py-3 bg-violet-900/20 border border-violet-800/30">
46
+ <div className="text-xs font-medium text-violet-400 mb-1">Reasoning</div>
47
+ <ContentBlock
48
+ content={step.content}
49
+ expanded={expanded}
50
+ onToggle={() => setExpanded(!expanded)}
51
+ maxPreview={300}
52
+ />
53
+ </div>
54
+ </div>
55
+ );
56
+ }
57
+
58
  if (step.role === "assistant") {
59
  return (
60
  <div className="flex justify-end mb-3">
frontend/src/harbor/types.ts CHANGED
@@ -50,6 +50,7 @@ export interface RawStep {
50
  content: string;
51
  tool_calls?: RawToolCall[];
52
  tool_call_id?: string;
 
53
  }
54
 
55
  export interface RawToolCall {
 
50
  content: string;
51
  tool_calls?: RawToolCall[];
52
  tool_call_id?: string;
53
+ _reasoning?: boolean;
54
  }
55
 
56
  export interface RawToolCall {
frontend/tsconfig.app.tsbuildinfo CHANGED
@@ -1 +1 @@
1
- {"root":["./src/app.tsx","./src/main.tsx","./src/vite-env.d.ts","./src/arena/arenaapp.tsx","./src/arena/api.ts","./src/arena/store.ts","./src/arena/types.ts","./src/arena/components/episodebar.tsx","./src/arena/components/episodenav.tsx","./src/arena/components/sidebar.tsx","./src/arena/components/transcriptpanel.tsx","./src/arena/utils/tracehighlight.ts","./src/harbor/harborapp.tsx","./src/harbor/api.ts","./src/harbor/store.ts","./src/harbor/types.ts","./src/harbor/components/chatbubble.tsx","./src/harbor/components/infobar.tsx","./src/harbor/components/instancelist.tsx","./src/harbor/components/instancenav.tsx","./src/harbor/components/metricssummary.tsx","./src/harbor/components/sidebar.tsx","./src/harbor/components/stepdetail.tsx","./src/harbor/components/trajectoryview.tsx","./src/model/modelapp.tsx","./src/model/api.ts","./src/model/store.ts","./src/model/types.ts","./src/model/components/infobar.tsx","./src/model/components/questionnav.tsx","./src/model/components/sidebar.tsx","./src/model/components/tracepanel.tsx","./src/model/utils/promptparser.ts","./src/model/utils/tracehighlight.ts","./src/rlm/rlmapp.tsx","./src/rlm/api.ts","./src/rlm/store.ts","./src/rlm/types.ts","./src/rlm/components/breadcrumb.tsx","./src/rlm/components/datasetselector.tsx","./src/rlm/components/gepaiterlevel.tsx","./src/rlm/components/overviewlevel.tsx","./src/rlm/components/panel.tsx","./src/rlm/components/rlmdetaillevel.tsx","./src/rlm/components/sidebar.tsx"],"version":"5.9.3"}
 
1
+ {"root":["./src/app.tsx","./src/main.tsx","./src/vite-env.d.ts","./src/arena/arenaapp.tsx","./src/arena/api.ts","./src/arena/store.ts","./src/arena/types.ts","./src/arena/components/episodebar.tsx","./src/arena/components/episodenav.tsx","./src/arena/components/sidebar.tsx","./src/arena/components/transcriptpanel.tsx","./src/arena/utils/tracehighlight.ts","./src/harbor/harborapp.tsx","./src/harbor/api.ts","./src/harbor/store.ts","./src/harbor/types.ts","./src/harbor/components/chatbubble.tsx","./src/harbor/components/infobar.tsx","./src/harbor/components/instancelist.tsx","./src/harbor/components/instancenav.tsx","./src/harbor/components/metricssummary.tsx","./src/harbor/components/sidebar.tsx","./src/harbor/components/stepdetail.tsx","./src/harbor/components/trajectoryview.tsx","./src/model/modelapp.tsx","./src/model/api.ts","./src/model/store.ts","./src/model/types.ts","./src/model/components/infobar.tsx","./src/model/components/questionnav.tsx","./src/model/components/sidebar.tsx","./src/model/components/tracepanel.tsx","./src/model/utils/promptparser.ts","./src/model/utils/tracehighlight.ts","./src/rlm/rlmapp.tsx","./src/rlm/api.ts","./src/rlm/store.ts","./src/rlm/types.ts","./src/rlm/components/breadcrumb.tsx","./src/rlm/components/datasetselector.tsx","./src/rlm/components/gepaiterlevel.tsx","./src/rlm/components/overviewlevel.tsx","./src/rlm/components/panel.tsx","./src/rlm/components/rlmdetaillevel.tsx","./src/rlm/components/sidebar.tsx","./src/rlm-eval/rlmevalapp.tsx","./src/rlm-eval/api.ts","./src/rlm-eval/store.ts","./src/rlm-eval/types.ts","./src/rlm-eval/components/breadcrumb.tsx","./src/rlm-eval/components/datasetselector.tsx","./src/rlm-eval/components/exampledetaillevel.tsx","./src/rlm-eval/components/iterationdetail.tsx","./src/rlm-eval/components/overviewlevel.tsx","./src/rlm-eval/components/panel.tsx","./src/rlm-eval/components/sidebar.tsx"],"version":"5.9.3"}