Zayne Rea Sprague committed on
Commit ·
7023780
1
Parent(s): b630916
vis updates for harbor
Browse files
backend/api/harbor_datasets.py
CHANGED
|
@@ -14,7 +14,7 @@ def _make_id(repo: str, split: str) -> str:
|
|
| 14 |
|
| 15 |
|
| 16 |
def _parse_trajectory(traj_json: str) -> dict:
|
| 17 |
-
"""Parse ATIF
|
| 18 |
if not traj_json:
|
| 19 |
return {"steps": [], "agent_info": {}, "final_metrics": {}}
|
| 20 |
|
|
@@ -35,11 +35,13 @@ def _parse_trajectory(traj_json: str) -> dict:
|
|
| 35 |
parsed["reasoning"] = step.get("reasoning_content", "")
|
| 36 |
parsed["tool_calls"] = []
|
| 37 |
for tc in step.get("tool_calls", []):
|
|
|
|
| 38 |
tool_call = {
|
| 39 |
"function": tc.get("function_name", ""),
|
| 40 |
-
"arguments":
|
| 41 |
}
|
| 42 |
-
|
|
|
|
| 43 |
if cmd:
|
| 44 |
tool_call["command"] = cmd
|
| 45 |
parsed["tool_calls"].append(tool_call)
|
|
@@ -144,6 +146,82 @@ def _parse_trajectory_raw(traj_raw: str) -> list[dict]:
|
|
| 144 |
return steps
|
| 145 |
|
| 146 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
def _build_instance_summary(row: dict) -> dict:
|
| 148 |
"""Build a summary for one instance row."""
|
| 149 |
return {
|
|
@@ -240,8 +318,10 @@ def get_instance(ds_id, instance_id):
|
|
| 240 |
# Parse ATIF trajectory
|
| 241 |
atif = _parse_trajectory(row.get("trajectory", ""))
|
| 242 |
|
| 243 |
-
# Parse raw trajectory (OpenAI messages)
|
| 244 |
raw_steps = _parse_trajectory_raw(row.get("trajectory_raw", ""))
|
|
|
|
|
|
|
| 245 |
|
| 246 |
return jsonify({
|
| 247 |
"instance_id": instance_id,
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
def _parse_trajectory(traj_json: str) -> dict:
|
| 17 |
+
"""Parse ATIF trajectory JSON into structured steps (v1.2 and v1.5)."""
|
| 18 |
if not traj_json:
|
| 19 |
return {"steps": [], "agent_info": {}, "final_metrics": {}}
|
| 20 |
|
|
|
|
| 35 |
parsed["reasoning"] = step.get("reasoning_content", "")
|
| 36 |
parsed["tool_calls"] = []
|
| 37 |
for tc in step.get("tool_calls", []):
|
| 38 |
+
args = tc.get("arguments", {})
|
| 39 |
tool_call = {
|
| 40 |
"function": tc.get("function_name", ""),
|
| 41 |
+
"arguments": args,
|
| 42 |
}
|
| 43 |
+
# v1.2 uses "command", v1.5 uses "cmd"
|
| 44 |
+
cmd = args.get("command", "") or args.get("cmd", "")
|
| 45 |
if cmd:
|
| 46 |
tool_call["command"] = cmd
|
| 47 |
parsed["tool_calls"].append(tool_call)
|
|
|
|
| 146 |
return steps
|
| 147 |
|
| 148 |
|
| 149 |
+
def _parse_agent_output_jsonl(agent_output: str) -> list[dict]:
    """Parse Codex-style JSONL agent_output into chat-style steps.

    Codex emits newline-delimited JSON with ``item.completed`` events whose
    ``item`` is one of ``reasoning``, ``agent_message`` or
    ``command_execution``. Every other event type and item type is ignored.

    Unrecognised input is skipped rather than raised on: lines that are not
    valid JSON, lines that are valid JSON but not an object (e.g. ``123``,
    ``null``, a list), and events whose ``item`` is not an object.

    Args:
        agent_output: Raw JSONL text, one event per line. May be empty.

    Returns:
        A list of step dicts with consecutive ``index`` values starting at 0.
        Reasoning and agent messages each yield one ``assistant`` step
        (reasoning steps carry ``_reasoning: True``). A command execution
        yields two steps: an ``assistant`` step with a single ``exec_command``
        tool call, followed by a ``tool`` step holding the command output,
        prefixed with ``[exit code: N]`` when an exit code is present.
    """
    if not agent_output:
        return []

    steps: list[dict] = []

    for line in agent_output.strip().split("\n"):
        try:
            event = json.loads(line)
        except (json.JSONDecodeError, TypeError):
            continue

        # json.loads also accepts scalars, lists and null; those have no
        # .get(), so treat them as unrecognised lines instead of crashing.
        if not isinstance(event, dict):
            continue
        if event.get("type") != "item.completed":
            continue

        item = event.get("item")
        if not isinstance(item, dict):
            continue

        item_type = item.get("type", "")

        if item_type == "reasoning":
            steps.append({
                "index": len(steps),
                "role": "assistant",
                "content": item.get("text", ""),
                "_reasoning": True,
            })

        elif item_type == "agent_message":
            steps.append({
                "index": len(steps),
                "role": "assistant",
                "content": item.get("text", ""),
            })

        elif item_type == "command_execution":
            cmd = item.get("command", "")
            # Prefer an explicit call_id, fall back to the item's own id.
            call_id = item.get("call_id", item.get("id", ""))

            # Assistant step with tool call
            steps.append({
                "index": len(steps),
                "role": "assistant",
                "content": "",
                "tool_calls": [{
                    "id": call_id,
                    "function": "exec_command",
                    "arguments_raw": json.dumps({"command": cmd}),
                    "arguments": {"command": cmd},
                    "command": cmd,
                }],
            })

            # Tool response step
            output = item.get("output", "")
            exit_code = item.get("exit_code")
            response_text = output
            if exit_code is not None:
                # Exit code 0 is meaningful, hence the explicit None check.
                response_text = (
                    f"[exit code: {exit_code}]\n{output}"
                    if output
                    else f"[exit code: {exit_code}]"
                )
            steps.append({
                "index": len(steps),
                "role": "tool",
                "content": response_text,
                "tool_call_id": call_id,
            })

    return steps
|
| 223 |
+
|
| 224 |
+
|
| 225 |
def _build_instance_summary(row: dict) -> dict:
|
| 226 |
"""Build a summary for one instance row."""
|
| 227 |
return {
|
|
|
|
| 318 |
# Parse ATIF trajectory
|
| 319 |
atif = _parse_trajectory(row.get("trajectory", ""))
|
| 320 |
|
| 321 |
+
# Parse raw trajectory (OpenAI messages), fall back to agent_output JSONL
|
| 322 |
raw_steps = _parse_trajectory_raw(row.get("trajectory_raw", ""))
|
| 323 |
+
if not raw_steps and row.get("agent_output"):
|
| 324 |
+
raw_steps = _parse_agent_output_jsonl(row["agent_output"])
|
| 325 |
|
| 326 |
return jsonify({
|
| 327 |
"instance_id": instance_id,
|
frontend/src/harbor/components/ChatBubble.tsx
CHANGED
|
@@ -39,6 +39,22 @@ export function RawBubble({ step, toolResponses }: RawBubbleProps) {
|
|
| 39 |
);
|
| 40 |
}
|
| 41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
if (step.role === "assistant") {
|
| 43 |
return (
|
| 44 |
<div className="flex justify-end mb-3">
|
|
|
|
| 39 |
);
|
| 40 |
}
|
| 41 |
|
| 42 |
+
if (step.role === "assistant" && step._reasoning) {
|
| 43 |
+
return (
|
| 44 |
+
<div className="flex justify-end mb-3">
|
| 45 |
+
<div className="max-w-[85%] rounded-lg px-4 py-3 bg-violet-900/20 border border-violet-800/30">
|
| 46 |
+
<div className="text-xs font-medium text-violet-400 mb-1">Reasoning</div>
|
| 47 |
+
<ContentBlock
|
| 48 |
+
content={step.content}
|
| 49 |
+
expanded={expanded}
|
| 50 |
+
onToggle={() => setExpanded(!expanded)}
|
| 51 |
+
maxPreview={300}
|
| 52 |
+
/>
|
| 53 |
+
</div>
|
| 54 |
+
</div>
|
| 55 |
+
);
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
if (step.role === "assistant") {
|
| 59 |
return (
|
| 60 |
<div className="flex justify-end mb-3">
|
frontend/src/harbor/types.ts
CHANGED
|
@@ -50,6 +50,7 @@ export interface RawStep {
|
|
| 50 |
content: string;
|
| 51 |
tool_calls?: RawToolCall[];
|
| 52 |
tool_call_id?: string;
|
|
|
|
| 53 |
}
|
| 54 |
|
| 55 |
export interface RawToolCall {
|
|
|
|
| 50 |
content: string;
|
| 51 |
tool_calls?: RawToolCall[];
|
| 52 |
tool_call_id?: string;
|
| 53 |
+
_reasoning?: boolean;
|
| 54 |
}
|
| 55 |
|
| 56 |
export interface RawToolCall {
|
frontend/tsconfig.app.tsbuildinfo
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
{"root":["./src/app.tsx","./src/main.tsx","./src/vite-env.d.ts","./src/arena/arenaapp.tsx","./src/arena/api.ts","./src/arena/store.ts","./src/arena/types.ts","./src/arena/components/episodebar.tsx","./src/arena/components/episodenav.tsx","./src/arena/components/sidebar.tsx","./src/arena/components/transcriptpanel.tsx","./src/arena/utils/tracehighlight.ts","./src/harbor/harborapp.tsx","./src/harbor/api.ts","./src/harbor/store.ts","./src/harbor/types.ts","./src/harbor/components/chatbubble.tsx","./src/harbor/components/infobar.tsx","./src/harbor/components/instancelist.tsx","./src/harbor/components/instancenav.tsx","./src/harbor/components/metricssummary.tsx","./src/harbor/components/sidebar.tsx","./src/harbor/components/stepdetail.tsx","./src/harbor/components/trajectoryview.tsx","./src/model/modelapp.tsx","./src/model/api.ts","./src/model/store.ts","./src/model/types.ts","./src/model/components/infobar.tsx","./src/model/components/questionnav.tsx","./src/model/components/sidebar.tsx","./src/model/components/tracepanel.tsx","./src/model/utils/promptparser.ts","./src/model/utils/tracehighlight.ts","./src/rlm/rlmapp.tsx","./src/rlm/api.ts","./src/rlm/store.ts","./src/rlm/types.ts","./src/rlm/components/breadcrumb.tsx","./src/rlm/components/datasetselector.tsx","./src/rlm/components/gepaiterlevel.tsx","./src/rlm/components/overviewlevel.tsx","./src/rlm/components/panel.tsx","./src/rlm/components/rlmdetaillevel.tsx","./src/rlm/components/sidebar.tsx"],"version":"5.9.3"}
|
|
|
|
| 1 |
+
{"root":["./src/app.tsx","./src/main.tsx","./src/vite-env.d.ts","./src/arena/arenaapp.tsx","./src/arena/api.ts","./src/arena/store.ts","./src/arena/types.ts","./src/arena/components/episodebar.tsx","./src/arena/components/episodenav.tsx","./src/arena/components/sidebar.tsx","./src/arena/components/transcriptpanel.tsx","./src/arena/utils/tracehighlight.ts","./src/harbor/harborapp.tsx","./src/harbor/api.ts","./src/harbor/store.ts","./src/harbor/types.ts","./src/harbor/components/chatbubble.tsx","./src/harbor/components/infobar.tsx","./src/harbor/components/instancelist.tsx","./src/harbor/components/instancenav.tsx","./src/harbor/components/metricssummary.tsx","./src/harbor/components/sidebar.tsx","./src/harbor/components/stepdetail.tsx","./src/harbor/components/trajectoryview.tsx","./src/model/modelapp.tsx","./src/model/api.ts","./src/model/store.ts","./src/model/types.ts","./src/model/components/infobar.tsx","./src/model/components/questionnav.tsx","./src/model/components/sidebar.tsx","./src/model/components/tracepanel.tsx","./src/model/utils/promptparser.ts","./src/model/utils/tracehighlight.ts","./src/rlm/rlmapp.tsx","./src/rlm/api.ts","./src/rlm/store.ts","./src/rlm/types.ts","./src/rlm/components/breadcrumb.tsx","./src/rlm/components/datasetselector.tsx","./src/rlm/components/gepaiterlevel.tsx","./src/rlm/components/overviewlevel.tsx","./src/rlm/components/panel.tsx","./src/rlm/components/rlmdetaillevel.tsx","./src/rlm/components/sidebar.tsx","./src/rlm-eval/rlmevalapp.tsx","./src/rlm-eval/api.ts","./src/rlm-eval/store.ts","./src/rlm-eval/types.ts","./src/rlm-eval/components/breadcrumb.tsx","./src/rlm-eval/components/datasetselector.tsx","./src/rlm-eval/components/exampledetaillevel.tsx","./src/rlm-eval/components/iterationdetail.tsx","./src/rlm-eval/components/overviewlevel.tsx","./src/rlm-eval/components/panel.tsx","./src/rlm-eval/components/sidebar.tsx"],"version":"5.9.3"}
|