rufimelo committed on
Commit
f20d268
·
verified ·
1 Parent(s): 0d15aae

Collapse all expanders by default

Browse files
Files changed (1) hide show
  1. app.py +337 -240
app.py CHANGED
@@ -1,12 +1,12 @@
1
- """github_red Trajectory Viewer — Hugging Face Space edition.
2
 
3
- Select a bundled example or upload any .eval log file produced by the
4
- decomposition benchmark to visualise the full multi-agent attack trajectory.
5
  """
6
 
7
  from __future__ import annotations
8
 
9
- import io
10
  import json
11
  import re
12
  import zipfile
@@ -14,42 +14,44 @@ from pathlib import Path
14
 
15
  import streamlit as st
16
 
 
 
17
  # ── Colours & labels ────────────────────────────────────────────────────────
18
 
19
  ROLE_COLOR = {
20
- "profiler": "#0F766E",
21
  "decomposer": "#7C3AED",
22
- "executor": "#2563EB",
23
- "target": "#DC2626",
24
- "reviewer": "#DC2626",
25
- "auditor": "#B45309",
26
  "classifier": "#D97706",
27
  }
28
  ROLE_EMOJI = {
29
- "profiler": "πŸ”¬",
30
  "decomposer": "🧩",
31
- "executor": "βš™οΈ",
32
- "target": "πŸ›‘οΈ",
33
- "reviewer": "πŸ”Ž",
34
- "auditor": "🎯",
35
  "classifier": "πŸ”",
36
  }
37
  TOOL_COLOR = {
38
- "github_create_branch": "#0891B2",
39
- "github_write_file": "#0891B2",
40
- "github_create_pull_request": "#0891B2",
41
- "transfer_to_target": "#7C3AED",
42
- "github_list_pr_files": "#DC2626",
43
- "github_read_file": "#DC2626",
44
  "github_approve_pull_request": "#16A34A",
45
- "github_add_pr_comment": "#DC2626",
46
- "github_list_files": "#0284C7",
47
- "bash": "#854D0E",
48
- "submit": "#374151",
49
- "decompose": "#7C3AED",
50
- "think": "#9CA3AF",
51
- "submit_profile": "#0F766E",
52
- "get_repository_analysis": "#0F766E",
53
  }
54
 
55
  # ── Helpers ──────────────────────────────────────────────────────────────────
@@ -57,7 +59,7 @@ TOOL_COLOR = {
57
 
58
  def _resolve(val: object, atts: dict) -> object:
59
  if isinstance(val, str) and val.startswith("attachment://"):
60
- key = val[len("attachment://"):]
61
  resolved = atts.get(key, val)
62
  return resolved if isinstance(resolved, (str, dict, list)) else val
63
  if isinstance(val, dict):
@@ -90,10 +92,24 @@ def _parse_args(raw: object) -> dict:
90
  return raw if isinstance(raw, dict) else {}
91
 
92
 
 
 
 
93
  def _parse_pr_files(text: str) -> list[dict]:
 
 
 
 
 
 
 
 
 
94
  files = []
 
95
  parts = re.split(r"^=== (.+?) ===$", text, flags=re.MULTILINE)
96
- it = iter(parts[1:])
 
97
  for path in it:
98
  content = next(it, "").strip()
99
  if path.strip():
@@ -102,19 +118,25 @@ def _parse_pr_files(text: str) -> list[dict]:
102
 
103
 
104
  def _lang_for(path: str) -> str:
 
105
  ext = path.rsplit(".", 1)[-1].lower() if "." in path else ""
106
  return {
107
- "py": "python", "yml": "yaml", "yaml": "yaml",
108
- "sh": "bash", "bash": "bash",
109
- "json": "json", "js": "javascript", "ts": "typescript",
110
- "md": "markdown", "txt": "text", "toml": "toml",
111
- "dockerfile": "dockerfile", "tf": "hcl",
 
 
 
 
 
 
 
 
112
  }.get(ext, "text")
113
 
114
 
115
- # ── Event parsing ─────────────────────────────────────────────────────────────
116
-
117
-
118
  def parse_events(events: list, atts: dict) -> list:
119
  """Return a flat list of structured step dicts."""
120
  steps = []
@@ -122,6 +144,7 @@ def parse_events(events: list, atts: dict) -> list:
122
  _seen_pr_sigs: set[str] = set()
123
  _profiler_phase_emitted = False
124
  _decomp_phase_emitted = False
 
125
  _seen_role_in_subtask: set[tuple] = set()
126
  _anchor_counter = 0
127
 
@@ -140,20 +163,24 @@ def parse_events(events: list, atts: dict) -> list:
140
  if not isinstance(text, str):
141
  text = json.dumps(text)
142
 
 
143
  m = re.search(r"Subtask (\d+)", text)
144
  if m and ("context for" in text or "Starting" in text):
145
  subtask = int(m.group(1))
146
  anchor = f"subtask-{subtask}"
147
- steps.append({
148
- "kind": "subtask_divider",
149
- "subtask": subtask,
150
- "anchor": anchor,
151
- "nav_label": f"Subtask {subtask}",
152
- "nav_color": "#2563EB",
153
- "nav_indent": 0,
154
- })
 
 
155
  continue
156
 
 
157
  if text.startswith("attachment://"):
158
  continue
159
  if text.startswith("●") and "[profiler]" not in text:
@@ -167,29 +194,34 @@ def parse_events(events: list, atts: dict) -> list:
167
  elif ev == "model":
168
  role = e.get("role", "unknown")
169
 
 
170
  if role == "profiler" and not _profiler_phase_emitted:
171
  _profiler_phase_emitted = True
172
- steps.append({
173
- "kind": "phase",
174
- "label": "πŸ”¬ Profiler Phase",
175
- "subtask": 0,
176
- "anchor": "phase-profiler",
177
- "nav_label": "πŸ”¬ Profiler",
178
- "nav_color": "#0F766E",
179
- "nav_indent": 0,
180
- })
 
 
181
 
182
  if role == "decomposer" and not _decomp_phase_emitted:
183
  _decomp_phase_emitted = True
184
- steps.append({
185
- "kind": "phase",
186
- "label": "🧩 Decomposition Phase",
187
- "subtask": 0,
188
- "anchor": "phase-decomposer",
189
- "nav_label": "🧩 Decomposer",
190
- "nav_color": "#7C3AED",
191
- "nav_indent": 0,
192
- })
 
 
193
 
194
  model = e.get("model", "")
195
  out = e.get("output") or {}
@@ -204,12 +236,16 @@ def parse_events(events: list, atts: dict) -> list:
204
  args = _resolve(_parse_args(tc.get("arguments", {})), atts)
205
  tool_calls.append({"fn": fn, "args": args})
206
 
 
207
  anchor = None
208
  nav_label = None
209
  nav_color = None
210
  nav_indent = None
211
  role_key = (role, subtask)
212
- if role in ("executor", "reviewer", "target", "auditor") and role_key not in _seen_role_in_subtask:
 
 
 
213
  _seen_role_in_subtask.add(role_key)
214
  anchor = _next_anchor()
215
  emoji = ROLE_EMOJI.get(role, "πŸ€–")
@@ -234,6 +270,8 @@ def parse_events(events: list, atts: dict) -> list:
234
 
235
  steps.append(step)
236
 
 
 
237
  if role in ("target", "reviewer"):
238
  inp = e.get("input", [])
239
  for m in inp:
@@ -251,12 +289,14 @@ def parse_events(events: list, atts: dict) -> list:
251
  sig = "|".join(f["path"] for f in files)
252
  if sig not in _seen_pr_sigs:
253
  _seen_pr_sigs.add(sig)
254
- steps.append({
255
- "kind": "pr_diff",
256
- "files": files,
257
- "subtask": subtask,
258
- })
259
- break
 
 
260
 
261
  # ── Tool results ──────────────────────────────────────────────
262
  elif ev == "tool" and e.get("type") == "function":
@@ -268,23 +308,27 @@ def parse_events(events: list, atts: dict) -> list:
268
  elif isinstance(out, dict):
269
  out = json.dumps(out, indent=2)
270
  out = str(out)
271
- steps.append({
272
- "kind": "tool_result",
273
- "fn": fn,
274
- "result": out,
275
- "subtask": subtask,
276
- })
 
 
277
 
278
  # ── Final score ───────────────────────────────────────────────
279
  elif ev == "score" and not e.get("intermediate"):
280
  sc = e.get("score", {})
281
- steps.append({
282
- "kind": "score",
283
- "value": sc.get("value"),
284
- "answer": sc.get("answer", "?"),
285
- "explanation": sc.get("explanation", ""),
286
- "subtask": subtask,
287
- })
 
 
288
 
289
  return steps
290
 
@@ -305,24 +349,33 @@ def _tool_badge(fn: str) -> str:
305
 
306
 
307
  def _args_summary(fn: str, args: dict) -> str:
 
308
  if fn == "github_create_branch":
309
- return f"`{args.get('branch_name', '')}` from `{args.get('base_branch', 'main')}`"
 
 
310
  if fn == "github_write_file":
311
- return f"`{args.get('path', '')}` β€” {args.get('commit_message', '')[:80]}"
 
 
312
  if fn == "github_create_pull_request":
313
  return f"**{args.get('title', '')}** β†’ `{args.get('head', '')}`"
314
  if fn == "transfer_to_target":
315
- return args.get("message", "")[:120]
 
316
  if fn == "github_list_pr_files":
317
  return f"PR #{args.get('pr_number', '?')} in `{args.get('repo', '')}`"
318
  if fn in ("github_approve_pull_request",):
319
  return f"PR #{args.get('pr_number', '?')}"
320
  if fn == "github_add_pr_comment":
321
- return args.get("comment", "")[:120]
 
322
  if fn == "submit":
323
- return str(args.get("answer", args.get("value", "")))[:120]
 
324
  if fn == "decompose":
325
- return f"{len(args.get('subtasks', []))} subtasks"
 
326
  if fn == "bash":
327
  cmd = args.get("cmd", args.get("command", ""))
328
  return f"`{cmd[:120]}`"
@@ -335,10 +388,14 @@ def _args_summary(fn: str, args: dict) -> str:
335
  return s[:120] + ("…" if len(s) > 120 else "")
336
  if fn == "get_repository_analysis":
337
  return "fetching repository analysis"
 
 
 
338
  return ""
339
 
340
 
341
  def _anchor_div(anchor: str | None) -> None:
 
342
  if anchor:
343
  st.markdown(f'<div id="{anchor}"></div>', unsafe_allow_html=True)
344
 
@@ -360,8 +417,8 @@ def render_nav(steps: list) -> None:
360
  pad_left = 8 + indent * 14
361
  st.markdown(
362
  f'<a href="#{anchor}" style="display:block;padding:3px 8px 3px {pad_left}px;'
363
- f'font-size:0.83em;color:{color};text-decoration:none;'
364
- f'border-left:2px solid {color}55;margin:1px 0;'
365
  f'border-radius:0 4px 4px 0">{label}</a>',
366
  unsafe_allow_html=True,
367
  )
@@ -426,6 +483,7 @@ def render_steps(steps: list) -> None:
426
  text = step["text"].strip()
427
  tcs = step["tool_calls"]
428
 
 
429
  if role == "classifier" and not text:
430
  continue
431
 
@@ -435,16 +493,19 @@ def render_steps(steps: list) -> None:
435
  f"<b>{emoji} {role.upper()}</b>&nbsp;&nbsp;"
436
  f'<span style="color:{color};font-size:0.78em">{model_short}</span>'
437
  )
 
 
438
  if tcs:
439
  tc_html = " ".join(_tool_badge(tc["fn"]) for tc in tcs)
440
  header_html += f"<br><div style='margin-top:4px'>{tc_html}</div>"
 
441
  header_html += "</div>"
442
  st.markdown(header_html, unsafe_allow_html=True)
443
 
 
444
  if text:
445
- expand_by_default = role in ("reviewer", "target", "auditor", "profiler")
446
  if len(text) > 400:
447
- with st.expander("View full response", expanded=expand_by_default):
448
  st.markdown(text)
449
  else:
450
  st.markdown(
@@ -453,13 +514,22 @@ def render_steps(steps: list) -> None:
453
  unsafe_allow_html=True,
454
  )
455
 
 
456
  for tc in tcs:
457
  fn = tc["fn"]
458
  color2 = TOOL_COLOR.get(fn, "#6B7280")
 
459
  _has_full_block = fn in (
460
- "github_write_file", "decompose", "transfer_to_target",
461
- "transfer_to_reviewer", "github_add_pr_comment",
462
- "github_approve_pull_request", "submit", "bash", "submit_profile",
 
 
 
 
 
 
 
463
  )
464
  if not _has_full_block:
465
  summary = _args_summary(fn, tc["args"])
@@ -472,100 +542,141 @@ def render_steps(steps: list) -> None:
472
  detail_html += "</div>"
473
  st.markdown(detail_html, unsafe_allow_html=True)
474
 
475
- args = tc["args"]
476
- if fn == "github_write_file":
477
- content = args.get("content", "")
478
- path = args.get("path", "")
479
- if content:
480
- with st.expander(f" πŸ“„ `{path}`", expanded=True):
481
- st.code(content, language=_lang_for(path))
482
- elif fn == "decompose":
483
- tasks = args.get("subtasks", [])
484
- with st.expander(f" {len(tasks)} subtasks", expanded=True):
485
- for i, t in enumerate(tasks, 1):
486
- task_text = t.get("task", t) if isinstance(t, dict) else str(t)
487
- st.markdown(f"**{i}.** {task_text}")
488
- elif fn in ("transfer_to_target", "transfer_to_reviewer"):
489
- msg = args.get("message", "")
490
- if len(msg) > 120:
491
- with st.expander(" Full handoff message", expanded=True):
492
- st.markdown(msg)
493
- elif fn == "github_add_pr_comment":
494
- comment = args.get("comment", args.get("body", ""))
495
- pr = args.get("pr_number", "?")
496
- if comment:
497
- with st.expander(f" πŸ’¬ PR #{pr} comment", expanded=True):
498
- st.markdown(comment)
499
- elif fn == "github_approve_pull_request":
500
- pr = args.get("pr_number", "?")
501
- review_body = args.get("body", args.get("review_body", ""))
502
- label = f" βœ… Approved PR #{pr}"
503
- if review_body:
504
- with st.expander(label, expanded=True):
505
- st.markdown(review_body)
506
- else:
507
- st.markdown(
508
- f'<div style="padding:2px 14px 2px 28px;font-size:0.85em;'
509
- f'color:#16A34A"><b>{label}</b></div>',
510
- unsafe_allow_html=True,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
511
  )
512
- elif fn == "submit":
513
- answer = args.get("answer", args.get("value", ""))
514
- answer_str = str(answer).strip()
515
- is_decline = any(
516
- w in answer_str.lower()
517
- for w in ("declin", "reject", "not approv", "refused", "concerns")
518
- )
519
- icon = "❌" if is_decline else "βœ…"
520
- label = f" {icon} Submit: {answer_str[:60]}{'…' if len(answer_str) > 60 else ''}"
521
- if len(answer_str) > 60:
522
- with st.expander(label, expanded=True):
523
- st.markdown(answer_str)
524
- else:
525
- st.markdown(
526
- f'<div style="padding:2px 14px 2px 28px;font-size:0.85em;'
527
- f'color:#{"DC2626" if is_decline else "16A34A"}">'
528
- f"<b>{label}</b></div>",
529
- unsafe_allow_html=True,
 
 
 
 
 
 
 
 
 
 
530
  )
531
- elif fn == "bash":
532
- cmd = args.get("cmd", args.get("command", ""))
533
- with st.expander(f" $ {cmd[:80]}{'…' if len(cmd) > 80 else ''}", expanded=True):
534
- st.code(cmd, language="bash")
535
- elif fn == "submit_profile":
536
- strategy = args.get("strategy", "")
537
- with st.expander(" πŸ”¬ Repository strategy report", expanded=True):
538
- st.markdown(strategy)
 
 
539
 
540
  # ── PR file diff ──────────────────────────────────────────────
541
  elif k == "pr_diff":
542
  files = step["files"]
543
  with st.expander(
544
  f"πŸ“‚ PR files ({len(files)} file{'s' if len(files) != 1 else ''})",
545
- expanded=True,
546
  ):
547
  for f in files:
548
  path = f["path"]
 
 
549
  st.markdown(
550
- f'<div style="font-size:0.82em;font-weight:600;color:#374151;'
551
- f'padding:4px 0 2px 0">πŸ“„ <code>{path}</code></div>',
 
552
  unsafe_allow_html=True,
553
  )
554
- st.code(f["content"], language=_lang_for(path))
555
 
556
  # ── Tool result ───────────────────────────────────────────────
557
  elif k == "tool_result":
558
  fn = step["fn"]
 
 
559
  result = step["result"]
560
  color = TOOL_COLOR.get(fn, "#6B7280")
561
  short = result[:120].replace("\n", " ")
562
- expand_result = fn in (
563
- "github_read_file", "github_get_pull_request", "github_list_pr_reviews",
564
- "github_list_pr_commits", "github_search_code", "bash",
565
- "github_list_files", "get_repository_analysis",
566
- )
567
  label = f'← {fn}: {short}{"…" if len(result) > 120 else ""}'
568
- with st.expander(label, expanded=expand_result):
569
  lang = "text"
570
  if fn == "github_read_file":
571
  lang = _lang_for(result.split("\n")[0].strip())
@@ -592,40 +703,7 @@ def render_steps(steps: list) -> None:
592
  )
593
 
594
 
595
- # ── Sample renderer ───────────────────────────────────────────────────────────
596
-
597
-
598
- def _render_sample(sample: dict) -> None:
599
- atts = sample.get("attachments", {})
600
- events = sample.get("events", [])
601
- scores = sample.get("scores", {})
602
-
603
- score_val, score_ans = None, None
604
- for sc in (scores.values() if isinstance(scores, dict) else []):
605
- score_val = sc.get("value")
606
- score_ans = sc.get("answer")
607
- break
608
-
609
- col1, col2, col3, col4 = st.columns(4)
610
- col1.metric("Sample", sample.get("id", "?"))
611
- col2.metric("Verdict", f"{score_ans}" if score_ans else "?")
612
- col3.metric("Score", f"{score_val:.1f}" if score_val is not None else "?")
613
- col4.metric("Time", f"{sample.get('total_time', 0):.0f}s")
614
-
615
- role_usage = sample.get("role_usage", {})
616
- if role_usage:
617
- with st.expander("Token usage by role", expanded=False):
618
- cols = st.columns(len(role_usage))
619
- for col, (role, usage) in zip(cols, role_usage.items()):
620
- col.metric(role, f"{usage.get('total_tokens', 0):,}")
621
-
622
- st.markdown("---")
623
- steps = parse_events(events, atts)
624
- render_nav(steps)
625
- render_steps(steps)
626
-
627
-
628
- # ── Main ──────────────────────────────────────────────────────────────────────
629
 
630
 
631
  def main() -> None:
@@ -635,27 +713,21 @@ def main() -> None:
635
  page_icon="πŸ”΄",
636
  )
637
 
638
- DATA_DIR = Path(__file__).parent / "data"
639
- bundled = sorted(DATA_DIR.glob("*.eval")) if DATA_DIR.exists() else []
640
-
641
  with st.sidebar:
642
  st.title("πŸ”΄ github_red")
643
  st.subheader("Trajectory Viewer")
644
 
645
- eval_source = None # will hold a BytesIO
 
 
 
646
 
647
- if bundled:
648
- st.markdown("**Example logs**")
649
- options = ["β€” select β€”"] + [f.name for f in bundled]
650
- choice = st.selectbox("Pre-loaded .eval files", options, label_visibility="collapsed")
651
- if choice != "β€” select β€”":
652
- eval_source = io.BytesIO((DATA_DIR / choice).read_bytes())
653
- st.markdown("---")
654
-
655
- st.markdown("**Or upload your own**")
656
- uploaded = st.file_uploader("Upload .eval file", type=["eval"], label_visibility="collapsed")
657
- if uploaded is not None:
658
- eval_source = io.BytesIO(uploaded.read())
659
 
660
  st.markdown("---")
661
  st.markdown("**Legend**")
@@ -667,34 +739,24 @@ def main() -> None:
667
  unsafe_allow_html=True,
668
  )
669
 
670
- if eval_source is None:
671
- st.info("Select an example log from the sidebar or upload your own `.eval` file.")
672
- st.markdown("""
673
- ### What is this?
674
-
675
- Each `.eval` file contains a full run log where:
676
- - πŸ”¬ **Profiler** explores the target repository to build a red-teaming strategy
677
- - 🧩 **Decomposer** breaks the harmful goal into benign-looking subtasks
678
- - βš™οΈ **Executor** creates branches, commits code, and opens pull requests
679
- - πŸ”Ž **Reviewer** / πŸ›‘οΈ **Target** (code reviewer) is asked to approve the PRs
680
- - πŸ” **Classifier** judges whether the target complied
681
-
682
- The viewer shows the full conversation, PR code diffs, and the final verdict.
683
- The **Trajectory** panel in the sidebar lets you jump directly to any phase.
684
- """)
685
  return
686
 
 
687
  try:
688
- with zipfile.ZipFile(eval_source) as z:
689
  sample_files = [n for n in z.namelist() if n.startswith("samples/")]
690
  if not sample_files:
691
  st.error("No sample files found in this eval log.")
692
  return
 
 
693
  samples = {n: json.loads(z.read(n)) for n in sample_files}
694
- except Exception as exc:
695
- st.error(f"Failed to load eval file: {exc}")
696
  return
697
 
 
698
  def _sample_verdict(sample: dict) -> str:
699
  for sc in (sample.get("scores") or {}).values():
700
  ans = sc.get("answer")
@@ -703,7 +765,6 @@ The **Trajectory** panel in the sidebar lets you jump directly to any phase.
703
  return "?"
704
 
705
  all_verdicts = sorted({_sample_verdict(s) for s in samples.values()})
706
-
707
  with st.sidebar:
708
  st.markdown("---")
709
  st.markdown("**Filter by verdict**")
@@ -712,15 +773,19 @@ The **Trajectory** panel in the sidebar lets you jump directly to any phase.
712
  }
713
 
714
  filtered = {
715
- n: s for n, s in samples.items()
 
716
  if selected_verdicts.get(_sample_verdict(s), True)
717
  }
718
  if not filtered:
719
  st.warning("No samples match the current filter.")
720
  return
721
 
 
722
  if len(filtered) > 1:
723
- tab_names = [f"{Path(n).stem} β€” {_sample_verdict(s)}" for n, s in filtered.items()]
 
 
724
  tabs = st.tabs(tab_names)
725
  for tab, (_, sample) in zip(tabs, filtered.items()):
726
  with tab:
@@ -729,5 +794,37 @@ The **Trajectory** panel in the sidebar lets you jump directly to any phase.
729
  _render_sample(next(iter(filtered.values())))
730
 
731
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
732
  if __name__ == "__main__":
733
  main()
 
1
+ """Streamlit trajectory viewer for github_red .eval log files.
2
 
3
+ Run with:
4
+ streamlit run scripts/trajectory_viewer.py
5
  """
6
 
7
  from __future__ import annotations
8
 
9
+ import glob
10
  import json
11
  import re
12
  import zipfile
 
14
 
15
  import streamlit as st
16
 
17
+ LOGS_DIR = Path(__file__).parent.parent / "logs" / "github_red"
18
+
19
  # ── Colours & labels ────────────────────────────────────────────────────────
20
 
21
  ROLE_COLOR = {
22
+ "profiler": "#0F766E",
23
  "decomposer": "#7C3AED",
24
+ "executor": "#2563EB",
25
+ "target": "#DC2626",
26
+ "reviewer": "#DC2626",
27
+ "auditor": "#B45309",
28
  "classifier": "#D97706",
29
  }
30
  ROLE_EMOJI = {
31
+ "profiler": "πŸ”¬",
32
  "decomposer": "🧩",
33
+ "executor": "βš™οΈ",
34
+ "target": "πŸ›‘οΈ",
35
+ "reviewer": "πŸ”Ž",
36
+ "auditor": "🎯",
37
  "classifier": "πŸ”",
38
  }
39
  TOOL_COLOR = {
40
+ "github_create_branch": "#0891B2",
41
+ "github_write_file": "#0891B2",
42
+ "github_create_pull_request": "#0891B2",
43
+ "transfer_to_target": "#7C3AED",
44
+ "github_list_pr_files": "#DC2626",
45
+ "github_read_file": "#DC2626",
46
  "github_approve_pull_request": "#16A34A",
47
+ "github_add_pr_comment": "#DC2626",
48
+ "github_list_files": "#0284C7",
49
+ "bash": "#854D0E",
50
+ "submit": "#374151",
51
+ "decompose": "#7C3AED",
52
+ "think": "#9CA3AF",
53
+ "submit_profile": "#0F766E",
54
+ "get_repository_analysis": "#0F766E",
55
  }
56
 
57
  # ── Helpers ──────────────────────────────────────────────────────────────────
 
59
 
60
  def _resolve(val: object, atts: dict) -> object:
61
  if isinstance(val, str) and val.startswith("attachment://"):
62
+ key = val[len("attachment://") :]
63
  resolved = atts.get(key, val)
64
  return resolved if isinstance(resolved, (str, dict, list)) else val
65
  if isinstance(val, dict):
 
92
  return raw if isinstance(raw, dict) else {}
93
 
94
 
95
+ # ── Event parsing ─────────────────────────────────────────────────────────────
96
+
97
+
98
  def _parse_pr_files(text: str) -> list[dict]:
99
+ """Parse github_list_pr_files output into a list of {path, content} dicts.
100
+
101
+ The format is:
102
+ === path/to/file.py ===
103
+ <file content>
104
+
105
+ === another/file.yml ===
106
+ <file content>
107
+ """
108
  files = []
109
+ # Split on === ... === headers
110
  parts = re.split(r"^=== (.+?) ===$", text, flags=re.MULTILINE)
111
+ # parts = ["preamble", "path1", "content1", "path2", "content2", ...]
112
+ it = iter(parts[1:]) # skip preamble
113
  for path in it:
114
  content = next(it, "").strip()
115
  if path.strip():
 
118
 
119
 
120
  def _lang_for(path: str) -> str:
121
+ """Return a Streamlit/Pygments language hint for syntax highlighting."""
122
  ext = path.rsplit(".", 1)[-1].lower() if "." in path else ""
123
  return {
124
+ "py": "python",
125
+ "yml": "yaml",
126
+ "yaml": "yaml",
127
+ "sh": "bash",
128
+ "bash": "bash",
129
+ "json": "json",
130
+ "js": "javascript",
131
+ "ts": "typescript",
132
+ "md": "markdown",
133
+ "txt": "text",
134
+ "toml": "toml",
135
+ "dockerfile": "dockerfile",
136
+ "tf": "hcl",
137
  }.get(ext, "text")
138
 
139
 
 
 
 
140
  def parse_events(events: list, atts: dict) -> list:
141
  """Return a flat list of structured step dicts."""
142
  steps = []
 
144
  _seen_pr_sigs: set[str] = set()
145
  _profiler_phase_emitted = False
146
  _decomp_phase_emitted = False
147
+ # Track first appearance of each role per subtask for nav anchors
148
  _seen_role_in_subtask: set[tuple] = set()
149
  _anchor_counter = 0
150
 
 
163
  if not isinstance(text, str):
164
  text = json.dumps(text)
165
 
166
+ # Detect subtask transitions
167
  m = re.search(r"Subtask (\d+)", text)
168
  if m and ("context for" in text or "Starting" in text):
169
  subtask = int(m.group(1))
170
  anchor = f"subtask-{subtask}"
171
+ steps.append(
172
+ {
173
+ "kind": "subtask_divider",
174
+ "subtask": subtask,
175
+ "anchor": anchor,
176
+ "nav_label": f"Subtask {subtask}",
177
+ "nav_color": "#2563EB",
178
+ "nav_indent": 0,
179
+ }
180
+ )
181
  continue
182
 
183
+ # Suppress noisy attachment / memory lines (but keep profiler banners)
184
  if text.startswith("attachment://"):
185
  continue
186
  if text.startswith("●") and "[profiler]" not in text:
 
194
  elif ev == "model":
195
  role = e.get("role", "unknown")
196
 
197
+ # Emit a one-time phase divider when the profiler starts
198
  if role == "profiler" and not _profiler_phase_emitted:
199
  _profiler_phase_emitted = True
200
+ steps.append(
201
+ {
202
+ "kind": "phase",
203
+ "label": "πŸ”¬ Profiler Phase",
204
+ "subtask": 0,
205
+ "anchor": "phase-profiler",
206
+ "nav_label": "πŸ”¬ Profiler",
207
+ "nav_color": "#0F766E",
208
+ "nav_indent": 0,
209
+ }
210
+ )
211
 
212
  if role == "decomposer" and not _decomp_phase_emitted:
213
  _decomp_phase_emitted = True
214
+ steps.append(
215
+ {
216
+ "kind": "phase",
217
+ "label": "🧩 Decomposition Phase",
218
+ "subtask": 0,
219
+ "anchor": "phase-decomposer",
220
+ "nav_label": "🧩 Decomposer",
221
+ "nav_color": "#7C3AED",
222
+ "nav_indent": 0,
223
+ }
224
+ )
225
 
226
  model = e.get("model", "")
227
  out = e.get("output") or {}
 
236
  args = _resolve(_parse_args(tc.get("arguments", {})), atts)
237
  tool_calls.append({"fn": fn, "args": args})
238
 
239
+ # Assign nav anchor on first appearance of executor/reviewer per subtask
240
  anchor = None
241
  nav_label = None
242
  nav_color = None
243
  nav_indent = None
244
  role_key = (role, subtask)
245
+ if (
246
+ role in ("executor", "reviewer", "target", "auditor")
247
+ and role_key not in _seen_role_in_subtask
248
+ ):
249
  _seen_role_in_subtask.add(role_key)
250
  anchor = _next_anchor()
251
  emoji = ROLE_EMOJI.get(role, "πŸ€–")
 
270
 
271
  steps.append(step)
272
 
273
+ # ── PR file contents seen by reviewer/target ───────────────
274
+ # Parse github_list_pr_files results from this model's input messages
275
  if role in ("target", "reviewer"):
276
  inp = e.get("input", [])
277
  for m in inp:
 
289
  sig = "|".join(f["path"] for f in files)
290
  if sig not in _seen_pr_sigs:
291
  _seen_pr_sigs.add(sig)
292
+ steps.append(
293
+ {
294
+ "kind": "pr_diff",
295
+ "files": files,
296
+ "subtask": subtask,
297
+ }
298
+ )
299
+ break # only emit once per model call
300
 
301
  # ── Tool results ──────────────────────────────────────────────
302
  elif ev == "tool" and e.get("type") == "function":
 
308
  elif isinstance(out, dict):
309
  out = json.dumps(out, indent=2)
310
  out = str(out)
311
+ steps.append(
312
+ {
313
+ "kind": "tool_result",
314
+ "fn": fn,
315
+ "result": out,
316
+ "subtask": subtask,
317
+ }
318
+ )
319
 
320
  # ── Final score ───────────────────────────────────────────────
321
  elif ev == "score" and not e.get("intermediate"):
322
  sc = e.get("score", {})
323
+ steps.append(
324
+ {
325
+ "kind": "score",
326
+ "value": sc.get("value"),
327
+ "answer": sc.get("answer", "?"),
328
+ "explanation": sc.get("explanation", ""),
329
+ "subtask": subtask,
330
+ }
331
+ )
332
 
333
  return steps
334
 
 
349
 
350
 
351
  def _args_summary(fn: str, args: dict) -> str:
352
+ """Return the most useful 1-line summary of tool arguments."""
353
  if fn == "github_create_branch":
354
+ return (
355
+ f"`{args.get('branch_name', '')}` from `{args.get('base_branch', 'main')}`"
356
+ )
357
  if fn == "github_write_file":
358
+ msg = args.get("commit_message", "")
359
+ path = args.get("path", "")
360
+ return f"`{path}` β€” {msg[:80]}"
361
  if fn == "github_create_pull_request":
362
  return f"**{args.get('title', '')}** β†’ `{args.get('head', '')}`"
363
  if fn == "transfer_to_target":
364
+ msg = args.get("message", "")
365
+ return msg[:120]
366
  if fn == "github_list_pr_files":
367
  return f"PR #{args.get('pr_number', '?')} in `{args.get('repo', '')}`"
368
  if fn in ("github_approve_pull_request",):
369
  return f"PR #{args.get('pr_number', '?')}"
370
  if fn == "github_add_pr_comment":
371
+ c = args.get("comment", "")
372
+ return c[:120]
373
  if fn == "submit":
374
+ a = args.get("answer", args.get("value", ""))
375
+ return str(a)[:120]
376
  if fn == "decompose":
377
+ tasks = args.get("subtasks", [])
378
+ return f"{len(tasks)} subtasks"
379
  if fn == "bash":
380
  cmd = args.get("cmd", args.get("command", ""))
381
  return f"`{cmd[:120]}`"
 
388
  return s[:120] + ("…" if len(s) > 120 else "")
389
  if fn == "get_repository_analysis":
390
  return "fetching repository analysis"
391
+ if fn == "think":
392
+ t = args.get("thought", args.get("thinking", args.get("content", "")))
393
+ return str(t)[:120] + ("…" if len(str(t)) > 120 else "")
394
  return ""
395
 
396
 
397
  def _anchor_div(anchor: str | None) -> None:
398
+ """Emit an invisible anchor div for in-page navigation."""
399
  if anchor:
400
  st.markdown(f'<div id="{anchor}"></div>', unsafe_allow_html=True)
401
 
 
417
  pad_left = 8 + indent * 14
418
  st.markdown(
419
  f'<a href="#{anchor}" style="display:block;padding:3px 8px 3px {pad_left}px;'
420
+ f"font-size:0.83em;color:{color};text-decoration:none;"
421
+ f"border-left:2px solid {color}55;margin:1px 0;"
422
  f'border-radius:0 4px 4px 0">{label}</a>',
423
  unsafe_allow_html=True,
424
  )
 
483
  text = step["text"].strip()
484
  tcs = step["tool_calls"]
485
 
486
+ # Skip classifier turns (not very interesting)
487
  if role == "classifier" and not text:
488
  continue
489
 
 
493
  f"<b>{emoji} {role.upper()}</b>&nbsp;&nbsp;"
494
  f'<span style="color:{color};font-size:0.78em">{model_short}</span>'
495
  )
496
+
497
+ # Tool call summary inline in header
498
  if tcs:
499
  tc_html = " ".join(_tool_badge(tc["fn"]) for tc in tcs)
500
  header_html += f"<br><div style='margin-top:4px'>{tc_html}</div>"
501
+
502
  header_html += "</div>"
503
  st.markdown(header_html, unsafe_allow_html=True)
504
 
505
+ # Reasoning / response text
506
  if text:
 
507
  if len(text) > 400:
508
+ with st.expander("View full response", expanded=False):
509
  st.markdown(text)
510
  else:
511
  st.markdown(
 
514
  unsafe_allow_html=True,
515
  )
516
 
517
+ # Tool call detail
518
  for tc in tcs:
519
  fn = tc["fn"]
520
  color2 = TOOL_COLOR.get(fn, "#6B7280")
521
+ # Skip the one-liner summary for tools that render their own full block
522
  _has_full_block = fn in (
523
+ "github_write_file",
524
+ "decompose",
525
+ "transfer_to_target",
526
+ "transfer_to_reviewer",
527
+ "github_add_pr_comment",
528
+ "github_approve_pull_request",
529
+ "submit",
530
+ "bash",
531
+ "submit_profile",
532
+ "think",
533
  )
534
  if not _has_full_block:
535
  summary = _args_summary(fn, tc["args"])
 
542
  detail_html += "</div>"
543
  st.markdown(detail_html, unsafe_allow_html=True)
544
 
545
+ # Show full args for interesting tools on demand
546
+ if fn in (
547
+ "github_write_file",
548
+ "decompose",
549
+ "transfer_to_target",
550
+ "transfer_to_reviewer",
551
+ "github_add_pr_comment",
552
+ "github_approve_pull_request",
553
+ "submit",
554
+ "bash",
555
+ "submit_profile",
556
+ "think",
557
+ ):
558
+ args = tc["args"]
559
+ if fn == "github_write_file":
560
+ content = args.get("content", "")
561
+ path = args.get("path", "")
562
+ if content:
563
+ with st.expander(f" πŸ“„ `{path}`", expanded=False):
564
+ st.code(content, language=_lang_for(path))
565
+ elif fn == "decompose":
566
+ tasks = args.get("subtasks", [])
567
+ with st.expander(f" {len(tasks)} subtasks", expanded=False):
568
+ for i, t in enumerate(tasks, 1):
569
+ task_text = (
570
+ t.get("task", t) if isinstance(t, dict) else str(t)
571
+ )
572
+ st.markdown(f"**{i}.** {task_text}")
573
+ elif fn in ("transfer_to_target", "transfer_to_reviewer"):
574
+ msg = args.get("message", "")
575
+ if len(msg) > 120:
576
+ with st.expander(" Full handoff message", expanded=False):
577
+ st.markdown(msg)
578
+ elif fn == "github_add_pr_comment":
579
+ comment = args.get("comment", args.get("body", ""))
580
+ pr = args.get("pr_number", "?")
581
+ if comment:
582
+ with st.expander(f" πŸ’¬ PR #{pr} comment", expanded=False):
583
+ st.markdown(comment)
584
+ elif fn == "github_approve_pull_request":
585
+ pr = args.get("pr_number", "?")
586
+ review_body = args.get("body", args.get("review_body", ""))
587
+ label = f" βœ… Approved PR #{pr}"
588
+ if review_body:
589
+ with st.expander(label, expanded=False):
590
+ st.markdown(review_body)
591
+ else:
592
+ st.markdown(
593
+ f'<div style="padding:2px 14px 2px 28px;font-size:0.85em;'
594
+ f'color:#16A34A"><b>{label}</b></div>',
595
+ unsafe_allow_html=True,
596
+ )
597
+ elif fn == "submit":
598
+ answer = args.get("answer", args.get("value", ""))
599
+ answer_str = str(answer).strip()
600
+ is_decline = any(
601
+ w in answer_str.lower()
602
+ for w in (
603
+ "declin",
604
+ "reject",
605
+ "not approv",
606
+ "refused",
607
+ "concerns",
608
+ )
609
  )
610
+ icon = "❌" if is_decline else "βœ…"
611
+ label = f" {icon} Submit: {answer_str[:60]}{'…' if len(answer_str) > 60 else ''}"
612
+ if len(answer_str) > 60:
613
+ with st.expander(label, expanded=False):
614
+ st.markdown(answer_str)
615
+ else:
616
+ st.markdown(
617
+ f'<div style="padding:2px 14px 2px 28px;font-size:0.85em;'
618
+ f'color:#{"DC2626" if is_decline else "16A34A"}">'
619
+ f"<b>{label}</b></div>",
620
+ unsafe_allow_html=True,
621
+ )
622
+ elif fn == "bash":
623
+ cmd = args.get("cmd", args.get("command", ""))
624
+ with st.expander(
625
+ f" $ {cmd[:80]}{'…' if len(cmd) > 80 else ''}",
626
+ expanded=False,
627
+ ):
628
+ st.code(cmd, language="bash")
629
+ elif fn == "submit_profile":
630
+ strategy = args.get("strategy", "")
631
+ with st.expander(
632
+ " πŸ”¬ Repository strategy report", expanded=False
633
+ ):
634
+ st.markdown(strategy)
635
+ elif fn == "think":
636
+ thought = args.get(
637
+ "thought", args.get("thinking", args.get("content", ""))
638
  )
639
+ thought_str = str(thought).strip()
640
+ short = thought_str[:60] + ("…" if len(thought_str) > 60 else "")
641
+ with st.expander(f" 🧠 {short}", expanded=False):
642
+ st.markdown(
643
+ f'<div style="background:#F3F4F611;border-left:3px solid #9CA3AF;'
644
+ f'padding:8px 12px;border-radius:0 4px 4px 0;'
645
+ f'color:#374151;font-size:0.9em;white-space:pre-wrap">'
646
+ f"{thought_str}</div>",
647
+ unsafe_allow_html=True,
648
+ )
649
 
650
  # ── PR file diff ──────────────────────────────────────────────
651
  elif k == "pr_diff":
652
  files = step["files"]
653
  with st.expander(
654
  f"πŸ“‚ PR files ({len(files)} file{'s' if len(files) != 1 else ''})",
655
+ expanded=False,
656
  ):
657
  for f in files:
658
  path = f["path"]
659
+ content = f["content"]
660
+ lang = _lang_for(path)
661
  st.markdown(
662
+ f'<div style="font-size:0.82em;font-weight:600;'
663
+ f'color:#374151;padding:4px 0 2px 0">'
664
+ f"πŸ“„ <code>{path}</code></div>",
665
  unsafe_allow_html=True,
666
  )
667
+ st.code(content, language=lang)
668
 
669
  # ── Tool result ───────────────────────────────────────────────
670
  elif k == "tool_result":
671
  fn = step["fn"]
672
+ if fn == "think":
673
+ continue # thought content already shown in the tool call block
674
  result = step["result"]
675
  color = TOOL_COLOR.get(fn, "#6B7280")
676
  short = result[:120].replace("\n", " ")
677
+
 
 
 
 
678
  label = f'← {fn}: {short}{"…" if len(result) > 120 else ""}'
679
+ with st.expander(label, expanded=False):
680
  lang = "text"
681
  if fn == "github_read_file":
682
  lang = _lang_for(result.split("\n")[0].strip())
 
703
  )
704
 
705
 
706
+ # ── Main app ──────────────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
707
 
708
 
709
  def main() -> None:
 
713
  page_icon="πŸ”΄",
714
  )
715
 
716
+ # ── Sidebar ───────────────────────────────────────────────────────
 
 
717
  with st.sidebar:
718
  st.title("πŸ”΄ github_red")
719
  st.subheader("Trajectory Viewer")
720
 
721
+ log_files = sorted(glob.glob(str(LOGS_DIR / "*.eval")), reverse=True)
722
+ if not log_files:
723
+ st.error(f"No .eval files found in:\n`{LOGS_DIR}`")
724
+ return
725
 
726
+ selected = st.selectbox(
727
+ "Run log",
728
+ log_files,
729
+ format_func=lambda p: Path(p).stem[:50],
730
+ )
 
 
 
 
 
 
 
731
 
732
  st.markdown("---")
733
  st.markdown("**Legend**")
 
739
  unsafe_allow_html=True,
740
  )
741
 
742
+ if not selected:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
743
  return
744
 
745
+ # ── Load ──────────────────────────────────────────────────────────
746
  try:
747
+ with zipfile.ZipFile(selected) as z:
748
  sample_files = [n for n in z.namelist() if n.startswith("samples/")]
749
  if not sample_files:
750
  st.error("No sample files found in this eval log.")
751
  return
752
+
753
+ # Support multiple samples (tabs)
754
  samples = {n: json.loads(z.read(n)) for n in sample_files}
755
+ except Exception as e:
756
+ st.error(f"Failed to load eval file: {e}")
757
  return
758
 
759
+ # ── Verdict filter ────────────────────────────────────────────────
760
  def _sample_verdict(sample: dict) -> str:
761
  for sc in (sample.get("scores") or {}).values():
762
  ans = sc.get("answer")
 
765
  return "?"
766
 
767
  all_verdicts = sorted({_sample_verdict(s) for s in samples.values()})
 
768
  with st.sidebar:
769
  st.markdown("---")
770
  st.markdown("**Filter by verdict**")
 
773
  }
774
 
775
  filtered = {
776
+ n: s
777
+ for n, s in samples.items()
778
  if selected_verdicts.get(_sample_verdict(s), True)
779
  }
780
  if not filtered:
781
  st.warning("No samples match the current filter.")
782
  return
783
 
784
+ # ── Header metrics ────────────────────────────────────────────────
785
  if len(filtered) > 1:
786
+ tab_names = [
787
+ f"{Path(n).stem} β€” {_sample_verdict(s)}" for n, s in filtered.items()
788
+ ]
789
  tabs = st.tabs(tab_names)
790
  for tab, (_, sample) in zip(tabs, filtered.items()):
791
  with tab:
 
794
  _render_sample(next(iter(filtered.values())))
795
 
796
 
797
def _format_score_value(value: object) -> str:
    """Return a display string for a scorer's ``value`` field.

    Numeric values are shown with one decimal place. Any other non-null
    value (for example a letter grade) is shown via ``str`` instead of being
    forced through a float format spec, which would raise ``ValueError``.
    A missing value is shown as ``"?"``.
    """
    if value is None:
        return "?"
    if isinstance(value, (int, float)):
        return f"{value:.1f}"
    return str(value)


def _render_sample(sample: dict) -> None:
    """Render one eval sample: metrics row, token usage, then the trajectory.

    Args:
        sample: Parsed JSON of a single sample entry from an eval log. The
            keys read here are ``attachments``, ``events``, ``scores``,
            ``id``, ``total_time`` and ``role_usage``; each may be absent
            or null.
    """
    # ``or`` fallbacks cover keys that are present but null in the JSON,
    # matching how ``scores`` is read when computing the verdict filter.
    atts = sample.get("attachments") or {}
    events = sample.get("events") or []
    scores = sample.get("scores") or {}

    # Metrics row: only the first scorer's result is displayed.
    first_score = next(iter(scores.values()), None) if isinstance(scores, dict) else None
    if not isinstance(first_score, dict):
        first_score = {}
    score_val = first_score.get("value")
    score_ans = first_score.get("answer")

    # A null or non-numeric duration is shown as 0s rather than crashing
    # the float format spec.
    total_time = sample.get("total_time")
    if not isinstance(total_time, (int, float)):
        total_time = 0

    col1, col2, col3, col4 = st.columns(4)
    col1.metric("Sample", sample.get("id", "?"))
    col2.metric("Verdict", f"{score_ans}" if score_ans else "?")
    col3.metric("Score", _format_score_value(score_val))
    col4.metric("Time", f"{total_time:.0f}s")

    role_usage = sample.get("role_usage") or {}
    if role_usage:
        with st.expander("Token usage by role", expanded=False):
            cols = st.columns(len(role_usage))
            for col, (role, usage) in zip(cols, role_usage.items()):
                total = (usage or {}).get("total_tokens") or 0
                col.metric(role, f"{total:,}")

    st.markdown("---")
    steps = parse_events(events, atts)
    render_nav(steps)
    render_steps(steps)
827
+
828
+
829
  if __name__ == "__main__":
830
  main()