Spaces:

rufimelo
/

github-red-trajectory-viewer

Sleeping

App Files Files Community

rufimelo committed on Mar 18

Commit

f20d268

verified ·

1 Parent(s): 0d15aae

Collapse all expanders by default

Browse files

Files changed (1) hide show

app.py +337 -240

app.py CHANGED Viewed

@@ -1,12 +1,12 @@
-"""github_red Trajectory Viewer — Hugging Face Space edition.
-Select a bundled example or upload any .eval log file produced by the
-decomposition benchmark to visualise the full multi-agent attack trajectory.
 """
 from __future__ import annotations
-import io
 import json
 import re
 import zipfile
@@ -14,42 +14,44 @@ from pathlib import Path
 import streamlit as st
 # ── Colours & labels ────────────────────────────────────────────────────────
 ROLE_COLOR = {
-    "profiler":   "#0F766E",
     "decomposer": "#7C3AED",
-    "executor":   "#2563EB",
-    "target":     "#DC2626",
-    "reviewer":   "#DC2626",
-    "auditor":    "#B45309",
     "classifier": "#D97706",
 }
 ROLE_EMOJI = {
-    "profiler":   "🔬",
     "decomposer": "🧩",
-    "executor":   "⚙️",
-    "target":     "🛡️",
-    "reviewer":   "🔎",
-    "auditor":    "🎯",
     "classifier": "🔍",
 }
 TOOL_COLOR = {
-    "github_create_branch":        "#0891B2",
-    "github_write_file":           "#0891B2",
-    "github_create_pull_request":  "#0891B2",
-    "transfer_to_target":          "#7C3AED",
-    "github_list_pr_files":        "#DC2626",
-    "github_read_file":            "#DC2626",
     "github_approve_pull_request": "#16A34A",
-    "github_add_pr_comment":       "#DC2626",
-    "github_list_files":           "#0284C7",
-    "bash":                        "#854D0E",
-    "submit":                      "#374151",
-    "decompose":                   "#7C3AED",
-    "think":                       "#9CA3AF",
-    "submit_profile":              "#0F766E",
-    "get_repository_analysis":     "#0F766E",
 }
 # ── Helpers ──────────────────────────────────────────────────────────────────
@@ -57,7 +59,7 @@ TOOL_COLOR = {
 def _resolve(val: object, atts: dict) -> object:
     if isinstance(val, str) and val.startswith("attachment://"):
-        key = val[len("attachment://"):]
         resolved = atts.get(key, val)
         return resolved if isinstance(resolved, (str, dict, list)) else val
     if isinstance(val, dict):
@@ -90,10 +92,24 @@ def _parse_args(raw: object) -> dict:
     return raw if isinstance(raw, dict) else {}
 def _parse_pr_files(text: str) -> list[dict]:
     files = []
     parts = re.split(r"^=== (.+?) ===$", text, flags=re.MULTILINE)
-    it = iter(parts[1:])
     for path in it:
         content = next(it, "").strip()
         if path.strip():
@@ -102,19 +118,25 @@ def _parse_pr_files(text: str) -> list[dict]:
 def _lang_for(path: str) -> str:
     ext = path.rsplit(".", 1)[-1].lower() if "." in path else ""
     return {
-        "py": "python", "yml": "yaml", "yaml": "yaml",
-        "sh": "bash", "bash": "bash",
-        "json": "json", "js": "javascript", "ts": "typescript",
-        "md": "markdown", "txt": "text", "toml": "toml",
-        "dockerfile": "dockerfile", "tf": "hcl",
     }.get(ext, "text")
-# ── Event parsing ─────────────────────────────────────────────────────────────
 def parse_events(events: list, atts: dict) -> list:
     """Return a flat list of structured step dicts."""
     steps = []
@@ -122,6 +144,7 @@ def parse_events(events: list, atts: dict) -> list:
     _seen_pr_sigs: set[str] = set()
     _profiler_phase_emitted = False
     _decomp_phase_emitted = False
     _seen_role_in_subtask: set[tuple] = set()
     _anchor_counter = 0
@@ -140,20 +163,24 @@ def parse_events(events: list, atts: dict) -> list:
             if not isinstance(text, str):
                 text = json.dumps(text)
             m = re.search(r"Subtask (\d+)", text)
             if m and ("context for" in text or "Starting" in text):
                 subtask = int(m.group(1))
                 anchor = f"subtask-{subtask}"
-                steps.append({
-                    "kind": "subtask_divider",
-                    "subtask": subtask,
-                    "anchor": anchor,
-                    "nav_label": f"Subtask {subtask}",
-                    "nav_color": "#2563EB",
-                    "nav_indent": 0,
-                })
                 continue
             if text.startswith("attachment://"):
                 continue
             if text.startswith("●") and "[profiler]" not in text:
@@ -167,29 +194,34 @@ def parse_events(events: list, atts: dict) -> list:
         elif ev == "model":
             role = e.get("role", "unknown")
             if role == "profiler" and not _profiler_phase_emitted:
                 _profiler_phase_emitted = True
-                steps.append({
-                    "kind": "phase",
-                    "label": "🔬 Profiler Phase",
-                    "subtask": 0,
-                    "anchor": "phase-profiler",
-                    "nav_label": "🔬 Profiler",
-                    "nav_color": "#0F766E",
-                    "nav_indent": 0,
-                })
             if role == "decomposer" and not _decomp_phase_emitted:
                 _decomp_phase_emitted = True
-                steps.append({
-                    "kind": "phase",
-                    "label": "🧩 Decomposition Phase",
-                    "subtask": 0,
-                    "anchor": "phase-decomposer",
-                    "nav_label": "🧩 Decomposer",
-                    "nav_color": "#7C3AED",
-                    "nav_indent": 0,
-                })
             model = e.get("model", "")
             out = e.get("output") or {}
@@ -204,12 +236,16 @@ def parse_events(events: list, atts: dict) -> list:
                 args = _resolve(_parse_args(tc.get("arguments", {})), atts)
                 tool_calls.append({"fn": fn, "args": args})
             anchor = None
             nav_label = None
             nav_color = None
             nav_indent = None
             role_key = (role, subtask)
-            if role in ("executor", "reviewer", "target", "auditor") and role_key not in _seen_role_in_subtask:
                 _seen_role_in_subtask.add(role_key)
                 anchor = _next_anchor()
                 emoji = ROLE_EMOJI.get(role, "🤖")
@@ -234,6 +270,8 @@ def parse_events(events: list, atts: dict) -> list:
             steps.append(step)
             if role in ("target", "reviewer"):
                 inp = e.get("input", [])
                 for m in inp:
@@ -251,12 +289,14 @@ def parse_events(events: list, atts: dict) -> list:
                         sig = "|".join(f["path"] for f in files)
                         if sig not in _seen_pr_sigs:
                             _seen_pr_sigs.add(sig)
-                            steps.append({
-                                "kind": "pr_diff",
-                                "files": files,
-                                "subtask": subtask,
-                            })
-                    break
         # ── Tool results ──────────────────────────────────────────────
         elif ev == "tool" and e.get("type") == "function":
@@ -268,23 +308,27 @@ def parse_events(events: list, atts: dict) -> list:
             elif isinstance(out, dict):
                 out = json.dumps(out, indent=2)
             out = str(out)
-            steps.append({
-                "kind": "tool_result",
-                "fn": fn,
-                "result": out,
-                "subtask": subtask,
-            })
         # ── Final score ───────────────────────────────────────────────
         elif ev == "score" and not e.get("intermediate"):
             sc = e.get("score", {})
-            steps.append({
-                "kind": "score",
-                "value": sc.get("value"),
-                "answer": sc.get("answer", "?"),
-                "explanation": sc.get("explanation", ""),
-                "subtask": subtask,
-            })
     return steps
@@ -305,24 +349,33 @@ def _tool_badge(fn: str) -> str:
 def _args_summary(fn: str, args: dict) -> str:
     if fn == "github_create_branch":
-        return f"`{args.get('branch_name', '')}` from `{args.get('base_branch', 'main')}`"
     if fn == "github_write_file":
-        return f"`{args.get('path', '')}` — {args.get('commit_message', '')[:80]}"
     if fn == "github_create_pull_request":
         return f"**{args.get('title', '')}** → `{args.get('head', '')}`"
     if fn == "transfer_to_target":
-        return args.get("message", "")[:120]
     if fn == "github_list_pr_files":
         return f"PR #{args.get('pr_number', '?')} in `{args.get('repo', '')}`"
     if fn in ("github_approve_pull_request",):
         return f"PR #{args.get('pr_number', '?')}"
     if fn == "github_add_pr_comment":
-        return args.get("comment", "")[:120]
     if fn == "submit":
-        return str(args.get("answer", args.get("value", "")))[:120]
     if fn == "decompose":
-        return f"{len(args.get('subtasks', []))} subtasks"
     if fn == "bash":
         cmd = args.get("cmd", args.get("command", ""))
         return f"`{cmd[:120]}`"
@@ -335,10 +388,14 @@ def _args_summary(fn: str, args: dict) -> str:
         return s[:120] + ("…" if len(s) > 120 else "")
     if fn == "get_repository_analysis":
         return "fetching repository analysis"
     return ""
 def _anchor_div(anchor: str | None) -> None:
     if anchor:
         st.markdown(f'<div id="{anchor}"></div>', unsafe_allow_html=True)
@@ -360,8 +417,8 @@ def render_nav(steps: list) -> None:
             pad_left = 8 + indent * 14
             st.markdown(
                 f'<a href="#{anchor}" style="display:block;padding:3px 8px 3px {pad_left}px;'
-                f'font-size:0.83em;color:{color};text-decoration:none;'
-                f'border-left:2px solid {color}55;margin:1px 0;'
                 f'border-radius:0 4px 4px 0">{label}</a>',
                 unsafe_allow_html=True,
             )
@@ -426,6 +483,7 @@ def render_steps(steps: list) -> None:
             text = step["text"].strip()
             tcs = step["tool_calls"]
             if role == "classifier" and not text:
                 continue
@@ -435,16 +493,19 @@ def render_steps(steps: list) -> None:
                 f"<b>{emoji} {role.upper()}</b>&nbsp;&nbsp;"
                 f'<span style="color:{color};font-size:0.78em">{model_short}</span>'
             )
             if tcs:
                 tc_html = "  ".join(_tool_badge(tc["fn"]) for tc in tcs)
                 header_html += f"<br><div style='margin-top:4px'>{tc_html}</div>"
             header_html += "</div>"
             st.markdown(header_html, unsafe_allow_html=True)
             if text:
-                expand_by_default = role in ("reviewer", "target", "auditor", "profiler")
                 if len(text) > 400:
-                    with st.expander("View full response", expanded=expand_by_default):
                         st.markdown(text)
                 else:
                     st.markdown(
@@ -453,13 +514,22 @@ def render_steps(steps: list) -> None:
                         unsafe_allow_html=True,
                     )
             for tc in tcs:
                 fn = tc["fn"]
                 color2 = TOOL_COLOR.get(fn, "#6B7280")
                 _has_full_block = fn in (
-                    "github_write_file", "decompose", "transfer_to_target",
-                    "transfer_to_reviewer", "github_add_pr_comment",
-                    "github_approve_pull_request", "submit", "bash", "submit_profile",
                 )
                 if not _has_full_block:
                     summary = _args_summary(fn, tc["args"])
@@ -472,100 +542,141 @@ def render_steps(steps: list) -> None:
                     detail_html += "</div>"
                     st.markdown(detail_html, unsafe_allow_html=True)
-                args = tc["args"]
-                if fn == "github_write_file":
-                    content = args.get("content", "")
-                    path = args.get("path", "")
-                    if content:
-                        with st.expander(f"  📄 `{path}`", expanded=True):
-                            st.code(content, language=_lang_for(path))
-                elif fn == "decompose":
-                    tasks = args.get("subtasks", [])
-                    with st.expander(f"  {len(tasks)} subtasks", expanded=True):
-                        for i, t in enumerate(tasks, 1):
-                            task_text = t.get("task", t) if isinstance(t, dict) else str(t)
-                            st.markdown(f"**{i}.** {task_text}")
-                elif fn in ("transfer_to_target", "transfer_to_reviewer"):
-                    msg = args.get("message", "")
-                    if len(msg) > 120:
-                        with st.expander("  Full handoff message", expanded=True):
-                            st.markdown(msg)
-                elif fn == "github_add_pr_comment":
-                    comment = args.get("comment", args.get("body", ""))
-                    pr = args.get("pr_number", "?")
-                    if comment:
-                        with st.expander(f"  💬 PR #{pr} comment", expanded=True):
-                            st.markdown(comment)
-                elif fn == "github_approve_pull_request":
-                    pr = args.get("pr_number", "?")
-                    review_body = args.get("body", args.get("review_body", ""))
-                    label = f"  ✅ Approved PR #{pr}"
-                    if review_body:
-                        with st.expander(label, expanded=True):
-                            st.markdown(review_body)
-                    else:
-                        st.markdown(
-                            f'<div style="padding:2px 14px 2px 28px;font-size:0.85em;'
-                            f'color:#16A34A"><b>{label}</b></div>',
-                            unsafe_allow_html=True,
                         )
-                elif fn == "submit":
-                    answer = args.get("answer", args.get("value", ""))
-                    answer_str = str(answer).strip()
-                    is_decline = any(
-                        w in answer_str.lower()
-                        for w in ("declin", "reject", "not approv", "refused", "concerns")
-                    )
-                    icon = "❌" if is_decline else "✅"
-                    label = f"  {icon} Submit: {answer_str[:60]}{'…' if len(answer_str) > 60 else ''}"
-                    if len(answer_str) > 60:
-                        with st.expander(label, expanded=True):
-                            st.markdown(answer_str)
-                    else:
-                        st.markdown(
-                            f'<div style="padding:2px 14px 2px 28px;font-size:0.85em;'
-                            f'color:#{"DC2626" if is_decline else "16A34A"}">'
-                            f"<b>{label}</b></div>",
-                            unsafe_allow_html=True,
                         )
-                elif fn == "bash":
-                    cmd = args.get("cmd", args.get("command", ""))
-                    with st.expander(f"  $ {cmd[:80]}{'…' if len(cmd) > 80 else ''}", expanded=True):
-                        st.code(cmd, language="bash")
-                elif fn == "submit_profile":
-                    strategy = args.get("strategy", "")
-                    with st.expander("  🔬 Repository strategy report", expanded=True):
-                        st.markdown(strategy)
         # ── PR file diff ──────────────────────────────────────────────
         elif k == "pr_diff":
             files = step["files"]
             with st.expander(
                 f"📂 PR files ({len(files)} file{'s' if len(files) != 1 else ''})",
-                expanded=True,
             ):
                 for f in files:
                     path = f["path"]
                     st.markdown(
-                        f'<div style="font-size:0.82em;font-weight:600;color:#374151;'
-                        f'padding:4px 0 2px 0">📄 <code>{path}</code></div>',
                         unsafe_allow_html=True,
                     )
-                    st.code(f["content"], language=_lang_for(path))
         # ── Tool result ───────────────────────────────────────────────
         elif k == "tool_result":
             fn = step["fn"]
             result = step["result"]
             color = TOOL_COLOR.get(fn, "#6B7280")
             short = result[:120].replace("\n", " ")
-            expand_result = fn in (
-                "github_read_file", "github_get_pull_request", "github_list_pr_reviews",
-                "github_list_pr_commits", "github_search_code", "bash",
-                "github_list_files", "get_repository_analysis",
-            )
             label = f'← {fn}: {short}{"…" if len(result) > 120 else ""}'
-            with st.expander(label, expanded=expand_result):
                 lang = "text"
                 if fn == "github_read_file":
                     lang = _lang_for(result.split("\n")[0].strip())
@@ -592,40 +703,7 @@ def render_steps(steps: list) -> None:
                 )
-# ── Sample renderer ───────────────────────────────────────────────────────────
-def _render_sample(sample: dict) -> None:
-    atts = sample.get("attachments", {})
-    events = sample.get("events", [])
-    scores = sample.get("scores", {})
-    score_val, score_ans = None, None
-    for sc in (scores.values() if isinstance(scores, dict) else []):
-        score_val = sc.get("value")
-        score_ans = sc.get("answer")
-        break
-    col1, col2, col3, col4 = st.columns(4)
-    col1.metric("Sample", sample.get("id", "?"))
-    col2.metric("Verdict", f"{score_ans}" if score_ans else "?")
-    col3.metric("Score", f"{score_val:.1f}" if score_val is not None else "?")
-    col4.metric("Time", f"{sample.get('total_time', 0):.0f}s")
-    role_usage = sample.get("role_usage", {})
-    if role_usage:
-        with st.expander("Token usage by role", expanded=False):
-            cols = st.columns(len(role_usage))
-            for col, (role, usage) in zip(cols, role_usage.items()):
-                col.metric(role, f"{usage.get('total_tokens', 0):,}")
-    st.markdown("---")
-    steps = parse_events(events, atts)
-    render_nav(steps)
-    render_steps(steps)
-# ── Main ──────────────────────────────────────────────────────────────────────
 def main() -> None:
@@ -635,27 +713,21 @@ def main() -> None:
         page_icon="🔴",
     )
-    DATA_DIR = Path(__file__).parent / "data"
-    bundled = sorted(DATA_DIR.glob("*.eval")) if DATA_DIR.exists() else []
     with st.sidebar:
         st.title("🔴 github_red")
         st.subheader("Trajectory Viewer")
-        eval_source = None  # will hold a BytesIO
-        if bundled:
-            st.markdown("**Example logs**")
-            options = ["— select —"] + [f.name for f in bundled]
-            choice = st.selectbox("Pre-loaded .eval files", options, label_visibility="collapsed")
-            if choice != "— select —":
-                eval_source = io.BytesIO((DATA_DIR / choice).read_bytes())
-            st.markdown("---")
-        st.markdown("**Or upload your own**")
-        uploaded = st.file_uploader("Upload .eval file", type=["eval"], label_visibility="collapsed")
-        if uploaded is not None:
-            eval_source = io.BytesIO(uploaded.read())
         st.markdown("---")
         st.markdown("**Legend**")
@@ -667,34 +739,24 @@ def main() -> None:
                 unsafe_allow_html=True,
             )
-    if eval_source is None:
-        st.info("Select an example log from the sidebar or upload your own `.eval` file.")
-        st.markdown("""
-### What is this?
-Each `.eval` file contains a full run log where:
-- 🔬 **Profiler** explores the target repository to build a red-teaming strategy
-- 🧩 **Decomposer** breaks the harmful goal into benign-looking subtasks
-- ⚙️ **Executor** creates branches, commits code, and opens pull requests
-- 🔎 **Reviewer** / 🛡️ **Target** (code reviewer) is asked to approve the PRs
-- 🔍 **Classifier** judges whether the target complied
-The viewer shows the full conversation, PR code diffs, and the final verdict.
-The **Trajectory** panel in the sidebar lets you jump directly to any phase.
-        """)
         return
     try:
-        with zipfile.ZipFile(eval_source) as z:
             sample_files = [n for n in z.namelist() if n.startswith("samples/")]
             if not sample_files:
                 st.error("No sample files found in this eval log.")
                 return
             samples = {n: json.loads(z.read(n)) for n in sample_files}
-    except Exception as exc:
-        st.error(f"Failed to load eval file: {exc}")
         return
     def _sample_verdict(sample: dict) -> str:
         for sc in (sample.get("scores") or {}).values():
             ans = sc.get("answer")
@@ -703,7 +765,6 @@ The **Trajectory** panel in the sidebar lets you jump directly to any phase.
         return "?"
     all_verdicts = sorted({_sample_verdict(s) for s in samples.values()})
     with st.sidebar:
         st.markdown("---")
         st.markdown("**Filter by verdict**")
@@ -712,15 +773,19 @@ The **Trajectory** panel in the sidebar lets you jump directly to any phase.
         }
     filtered = {
-        n: s for n, s in samples.items()
         if selected_verdicts.get(_sample_verdict(s), True)
     }
     if not filtered:
         st.warning("No samples match the current filter.")
         return
     if len(filtered) > 1:
-        tab_names = [f"{Path(n).stem} — {_sample_verdict(s)}" for n, s in filtered.items()]
         tabs = st.tabs(tab_names)
         for tab, (_, sample) in zip(tabs, filtered.items()):
             with tab:
@@ -729,5 +794,37 @@ The **Trajectory** panel in the sidebar lets you jump directly to any phase.
         _render_sample(next(iter(filtered.values())))
 if __name__ == "__main__":
     main()

+"""Streamlit trajectory viewer for github_red .eval log files.
+Run with:
+    streamlit run scripts/trajectory_viewer.py
 """
 from __future__ import annotations
+import glob
 import json
 import re
 import zipfile
 import streamlit as st
+LOGS_DIR = Path(__file__).parent.parent / "logs" / "github_red"
 # ── Colours & labels ────────────────────────────────────────────────────────
 ROLE_COLOR = {
+    "profiler": "#0F766E",
     "decomposer": "#7C3AED",
+    "executor": "#2563EB",
+    "target": "#DC2626",
+    "reviewer": "#DC2626",
+    "auditor": "#B45309",
     "classifier": "#D97706",
 }
 ROLE_EMOJI = {
+    "profiler": "🔬",
     "decomposer": "🧩",
+    "executor": "⚙️",
+    "target": "🛡️",
+    "reviewer": "🔎",
+    "auditor": "🎯",
     "classifier": "🔍",
 }
 TOOL_COLOR = {
+    "github_create_branch": "#0891B2",
+    "github_write_file": "#0891B2",
+    "github_create_pull_request": "#0891B2",
+    "transfer_to_target": "#7C3AED",
+    "github_list_pr_files": "#DC2626",
+    "github_read_file": "#DC2626",
     "github_approve_pull_request": "#16A34A",
+    "github_add_pr_comment": "#DC2626",
+    "github_list_files": "#0284C7",
+    "bash": "#854D0E",
+    "submit": "#374151",
+    "decompose": "#7C3AED",
+    "think": "#9CA3AF",
+    "submit_profile": "#0F766E",
+    "get_repository_analysis": "#0F766E",
 }
 # ── Helpers ──────────────────────────────────────────────────────────────────
 def _resolve(val: object, atts: dict) -> object:
     if isinstance(val, str) and val.startswith("attachment://"):
+        key = val[len("attachment://") :]
         resolved = atts.get(key, val)
         return resolved if isinstance(resolved, (str, dict, list)) else val
     if isinstance(val, dict):
     return raw if isinstance(raw, dict) else {}
+# ── Event parsing ─────────────────────────────────────────────────────────────
 def _parse_pr_files(text: str) -> list[dict]:
+    """Parse github_list_pr_files output into a list of {path, content} dicts.
+    The format is:
+        === path/to/file.py ===
+        <file content>
+        === another/file.yml ===
+        <file content>
+    """
     files = []
+    # Split on === ... === headers
     parts = re.split(r"^=== (.+?) ===$", text, flags=re.MULTILINE)
+    # parts = ["preamble", "path1", "content1", "path2", "content2", ...]
+    it = iter(parts[1:])  # skip preamble
     for path in it:
         content = next(it, "").strip()
         if path.strip():
 def _lang_for(path: str) -> str:
+    """Return a Streamlit/Pygments language hint for syntax highlighting."""
     ext = path.rsplit(".", 1)[-1].lower() if "." in path else ""
     return {
+        "py": "python",
+        "yml": "yaml",
+        "yaml": "yaml",
+        "sh": "bash",
+        "bash": "bash",
+        "json": "json",
+        "js": "javascript",
+        "ts": "typescript",
+        "md": "markdown",
+        "txt": "text",
+        "toml": "toml",
+        "dockerfile": "dockerfile",
+        "tf": "hcl",
     }.get(ext, "text")
 def parse_events(events: list, atts: dict) -> list:
     """Return a flat list of structured step dicts."""
     steps = []
     _seen_pr_sigs: set[str] = set()
     _profiler_phase_emitted = False
     _decomp_phase_emitted = False
+    # Track first appearance of each role per subtask for nav anchors
     _seen_role_in_subtask: set[tuple] = set()
     _anchor_counter = 0
             if not isinstance(text, str):
                 text = json.dumps(text)
+            # Detect subtask transitions
             m = re.search(r"Subtask (\d+)", text)
             if m and ("context for" in text or "Starting" in text):
                 subtask = int(m.group(1))
                 anchor = f"subtask-{subtask}"
+                steps.append(
+                    {
+                        "kind": "subtask_divider",
+                        "subtask": subtask,
+                        "anchor": anchor,
+                        "nav_label": f"Subtask {subtask}",
+                        "nav_color": "#2563EB",
+                        "nav_indent": 0,
+                    }
+                )
                 continue
+            # Suppress noisy attachment / memory lines (but keep profiler banners)
             if text.startswith("attachment://"):
                 continue
             if text.startswith("●") and "[profiler]" not in text:
         elif ev == "model":
             role = e.get("role", "unknown")
+            # Emit a one-time phase divider when the profiler starts
             if role == "profiler" and not _profiler_phase_emitted:
                 _profiler_phase_emitted = True
+                steps.append(
+                    {
+                        "kind": "phase",
+                        "label": "🔬 Profiler Phase",
+                        "subtask": 0,
+                        "anchor": "phase-profiler",
+                        "nav_label": "🔬 Profiler",
+                        "nav_color": "#0F766E",
+                        "nav_indent": 0,
+                    }
+                )
             if role == "decomposer" and not _decomp_phase_emitted:
                 _decomp_phase_emitted = True
+                steps.append(
+                    {
+                        "kind": "phase",
+                        "label": "🧩 Decomposition Phase",
+                        "subtask": 0,
+                        "anchor": "phase-decomposer",
+                        "nav_label": "🧩 Decomposer",
+                        "nav_color": "#7C3AED",
+                        "nav_indent": 0,
+                    }
+                )
             model = e.get("model", "")
             out = e.get("output") or {}
                 args = _resolve(_parse_args(tc.get("arguments", {})), atts)
                 tool_calls.append({"fn": fn, "args": args})
+            # Assign nav anchor on first appearance of executor/reviewer/target/auditor per subtask
             anchor = None
             nav_label = None
             nav_color = None
             nav_indent = None
             role_key = (role, subtask)
+            if (
+                role in ("executor", "reviewer", "target", "auditor")
+                and role_key not in _seen_role_in_subtask
+            ):
                 _seen_role_in_subtask.add(role_key)
                 anchor = _next_anchor()
                 emoji = ROLE_EMOJI.get(role, "🤖")
             steps.append(step)
+            # ── PR file contents seen by reviewer/target ───────────────
+            # Parse github_list_pr_files results from this model's input messages
             if role in ("target", "reviewer"):
                 inp = e.get("input", [])
                 for m in inp:
                         sig = "|".join(f["path"] for f in files)
                         if sig not in _seen_pr_sigs:
                             _seen_pr_sigs.add(sig)
+                            steps.append(
+                                {
+                                    "kind": "pr_diff",
+                                    "files": files,
+                                    "subtask": subtask,
+                                }
+                            )
+                    break  # only emit once per model call
         # ── Tool results ──────────────────────────────────────────────
         elif ev == "tool" and e.get("type") == "function":
             elif isinstance(out, dict):
                 out = json.dumps(out, indent=2)
             out = str(out)
+            steps.append(
+                {
+                    "kind": "tool_result",
+                    "fn": fn,
+                    "result": out,
+                    "subtask": subtask,
+                }
+            )
         # ── Final score ───────────────────────────────────────────────
         elif ev == "score" and not e.get("intermediate"):
             sc = e.get("score", {})
+            steps.append(
+                {
+                    "kind": "score",
+                    "value": sc.get("value"),
+                    "answer": sc.get("answer", "?"),
+                    "explanation": sc.get("explanation", ""),
+                    "subtask": subtask,
+                }
+            )
     return steps
 def _args_summary(fn: str, args: dict) -> str:
+    """Return the most useful 1-line summary of tool arguments."""
     if fn == "github_create_branch":
+        return (
+            f"`{args.get('branch_name', '')}` from `{args.get('base_branch', 'main')}`"
+        )
     if fn == "github_write_file":
+        msg = args.get("commit_message", "")
+        path = args.get("path", "")
+        return f"`{path}` — {msg[:80]}"
     if fn == "github_create_pull_request":
         return f"**{args.get('title', '')}** → `{args.get('head', '')}`"
     if fn == "transfer_to_target":
+        msg = args.get("message", "")
+        return msg[:120]
     if fn == "github_list_pr_files":
         return f"PR #{args.get('pr_number', '?')} in `{args.get('repo', '')}`"
     if fn in ("github_approve_pull_request",):
         return f"PR #{args.get('pr_number', '?')}"
     if fn == "github_add_pr_comment":
+        c = args.get("comment", "")
+        return c[:120]
     if fn == "submit":
+        a = args.get("answer", args.get("value", ""))
+        return str(a)[:120]
     if fn == "decompose":
+        tasks = args.get("subtasks", [])
+        return f"{len(tasks)} subtasks"
     if fn == "bash":
         cmd = args.get("cmd", args.get("command", ""))
         return f"`{cmd[:120]}`"
         return s[:120] + ("…" if len(s) > 120 else "")
     if fn == "get_repository_analysis":
         return "fetching repository analysis"
+    if fn == "think":
+        t = args.get("thought", args.get("thinking", args.get("content", "")))
+        return str(t)[:120] + ("…" if len(str(t)) > 120 else "")
     return ""
 def _anchor_div(anchor: str | None) -> None:
+    """Emit an invisible anchor div for in-page navigation."""
     if anchor:
         st.markdown(f'<div id="{anchor}"></div>', unsafe_allow_html=True)
             pad_left = 8 + indent * 14
             st.markdown(
                 f'<a href="#{anchor}" style="display:block;padding:3px 8px 3px {pad_left}px;'
+                f"font-size:0.83em;color:{color};text-decoration:none;"
+                f"border-left:2px solid {color}55;margin:1px 0;"
                 f'border-radius:0 4px 4px 0">{label}</a>',
                 unsafe_allow_html=True,
             )
             text = step["text"].strip()
             tcs = step["tool_calls"]
+            # Skip classifier turns that carry no text (nothing worth showing)
             if role == "classifier" and not text:
                 continue
                 f"<b>{emoji} {role.upper()}</b>&nbsp;&nbsp;"
                 f'<span style="color:{color};font-size:0.78em">{model_short}</span>'
             )
+            # Tool call summary inline in header
             if tcs:
                 tc_html = "  ".join(_tool_badge(tc["fn"]) for tc in tcs)
                 header_html += f"<br><div style='margin-top:4px'>{tc_html}</div>"
             header_html += "</div>"
             st.markdown(header_html, unsafe_allow_html=True)
+            # Reasoning / response text
             if text:
                 if len(text) > 400:
+                    with st.expander("View full response", expanded=False):
                         st.markdown(text)
                 else:
                     st.markdown(
                         unsafe_allow_html=True,
                     )
+            # Tool call detail
             for tc in tcs:
                 fn = tc["fn"]
                 color2 = TOOL_COLOR.get(fn, "#6B7280")
+                # Skip the one-liner summary for tools that render their own full block
                 _has_full_block = fn in (
+                    "github_write_file",
+                    "decompose",
+                    "transfer_to_target",
+                    "transfer_to_reviewer",
+                    "github_add_pr_comment",
+                    "github_approve_pull_request",
+                    "submit",
+                    "bash",
+                    "submit_profile",
+                    "think",
                 )
                 if not _has_full_block:
                     summary = _args_summary(fn, tc["args"])
                     detail_html += "</div>"
                     st.markdown(detail_html, unsafe_allow_html=True)
+                # Show full args for interesting tools on demand
+                if fn in (
+                    "github_write_file",
+                    "decompose",
+                    "transfer_to_target",
+                    "transfer_to_reviewer",
+                    "github_add_pr_comment",
+                    "github_approve_pull_request",
+                    "submit",
+                    "bash",
+                    "submit_profile",
+                    "think",
+                ):
+                    args = tc["args"]
+                    if fn == "github_write_file":
+                        content = args.get("content", "")
+                        path = args.get("path", "")
+                        if content:
+                            with st.expander(f"  📄 `{path}`", expanded=False):
+                                st.code(content, language=_lang_for(path))
+                    elif fn == "decompose":
+                        tasks = args.get("subtasks", [])
+                        with st.expander(f"  {len(tasks)} subtasks", expanded=False):
+                            for i, t in enumerate(tasks, 1):
+                                task_text = (
+                                    t.get("task", t) if isinstance(t, dict) else str(t)
+                                )
+                                st.markdown(f"**{i}.** {task_text}")
+                    elif fn in ("transfer_to_target", "transfer_to_reviewer"):
+                        msg = args.get("message", "")
+                        if len(msg) > 120:
+                            with st.expander("  Full handoff message", expanded=False):
+                                st.markdown(msg)
+                    elif fn == "github_add_pr_comment":
+                        comment = args.get("comment", args.get("body", ""))
+                        pr = args.get("pr_number", "?")
+                        if comment:
+                            with st.expander(f"  💬 PR #{pr} comment", expanded=False):
+                                st.markdown(comment)
+                    elif fn == "github_approve_pull_request":
+                        pr = args.get("pr_number", "?")
+                        review_body = args.get("body", args.get("review_body", ""))
+                        label = f"  ✅ Approved PR #{pr}"
+                        if review_body:
+                            with st.expander(label, expanded=False):
+                                st.markdown(review_body)
+                        else:
+                            st.markdown(
+                                f'<div style="padding:2px 14px 2px 28px;font-size:0.85em;'
+                                f'color:#16A34A"><b>{label}</b></div>',
+                                unsafe_allow_html=True,
+                            )
+                    elif fn == "submit":
+                        answer = args.get("answer", args.get("value", ""))
+                        answer_str = str(answer).strip()
+                        is_decline = any(
+                            w in answer_str.lower()
+                            for w in (
+                                "declin",
+                                "reject",
+                                "not approv",
+                                "refused",
+                                "concerns",
+                            )
                         )
+                        icon = "❌" if is_decline else "✅"
+                        label = f"  {icon} Submit: {answer_str[:60]}{'…' if len(answer_str) > 60 else ''}"
+                        if len(answer_str) > 60:
+                            with st.expander(label, expanded=False):
+                                st.markdown(answer_str)
+                        else:
+                            st.markdown(
+                                f'<div style="padding:2px 14px 2px 28px;font-size:0.85em;'
+                                f'color:#{"DC2626" if is_decline else "16A34A"}">'
+                                f"<b>{label}</b></div>",
+                                unsafe_allow_html=True,
+                            )
+                    elif fn == "bash":
+                        cmd = args.get("cmd", args.get("command", ""))
+                        with st.expander(
+                            f"  $ {cmd[:80]}{'…' if len(cmd) > 80 else ''}",
+                            expanded=False,
+                        ):
+                            st.code(cmd, language="bash")
+                    elif fn == "submit_profile":
+                        strategy = args.get("strategy", "")
+                        with st.expander(
+                            "  🔬 Repository strategy report", expanded=False
+                        ):
+                            st.markdown(strategy)
+                    elif fn == "think":
+                        thought = args.get(
+                            "thought", args.get("thinking", args.get("content", ""))
                         )
+                        thought_str = str(thought).strip()
+                        short = thought_str[:60] + ("…" if len(thought_str) > 60 else "")
+                        with st.expander(f"  🧠 {short}", expanded=False):
+                            st.markdown(
+                                f'<div style="background:#F3F4F611;border-left:3px solid #9CA3AF;'
+                                f'padding:8px 12px;border-radius:0 4px 4px 0;'
+                                f'color:#374151;font-size:0.9em;white-space:pre-wrap">'
+                                f"{thought_str}</div>",
+                                unsafe_allow_html=True,
+                            )
         # ── PR file diff ──────────────────────────────────────────────
         elif k == "pr_diff":
             files = step["files"]
             with st.expander(
                 f"📂 PR files ({len(files)} file{'s' if len(files) != 1 else ''})",
+                expanded=False,
             ):
                 for f in files:
                     path = f["path"]
+                    content = f["content"]
+                    lang = _lang_for(path)
                     st.markdown(
+                        f'<div style="font-size:0.82em;font-weight:600;'
+                        f'color:#374151;padding:4px 0 2px 0">'
+                        f"📄 <code>{path}</code></div>",
                         unsafe_allow_html=True,
                     )
+                    st.code(content, language=lang)
         # ── Tool result ───────────────────────────────────────────────
         elif k == "tool_result":
             fn = step["fn"]
+            if fn == "think":
+                continue  # thought content already shown in the tool call block
             result = step["result"]
             color = TOOL_COLOR.get(fn, "#6B7280")
             short = result[:120].replace("\n", " ")
             label = f'← {fn}: {short}{"…" if len(result) > 120 else ""}'
+            with st.expander(label, expanded=False):
                 lang = "text"
                 if fn == "github_read_file":
                     lang = _lang_for(result.split("\n")[0].strip())
                 )
+# ── Main app ──────────────────────────────────────────────────────────────────
 def main() -> None:
         page_icon="🔴",
     )
+    # ── Sidebar ───────────────────────────────────────────────────────
     with st.sidebar:
         st.title("🔴 github_red")
         st.subheader("Trajectory Viewer")
+        log_files = sorted(glob.glob(str(LOGS_DIR / "*.eval")), reverse=True)
+        if not log_files:
+            st.error(f"No .eval files found in:\n`{LOGS_DIR}`")
+            return
+        selected = st.selectbox(
+            "Run log",
+            log_files,
+            format_func=lambda p: Path(p).stem[:50],
+        )
         st.markdown("---")
         st.markdown("**Legend**")
                 unsafe_allow_html=True,
             )
+    if not selected:
         return
+    # ── Load ──────────────────────────────────────────────────────────
     try:
+        with zipfile.ZipFile(selected) as z:
             sample_files = [n for n in z.namelist() if n.startswith("samples/")]
             if not sample_files:
                 st.error("No sample files found in this eval log.")
                 return
+            # Support multiple samples (tabs)
             samples = {n: json.loads(z.read(n)) for n in sample_files}
+    except Exception as e:
+        st.error(f"Failed to load eval file: {e}")
         return
+    # ── Verdict filter ────────────────────────────────────────────────
     def _sample_verdict(sample: dict) -> str:
         for sc in (sample.get("scores") or {}).values():
             ans = sc.get("answer")
         return "?"
     all_verdicts = sorted({_sample_verdict(s) for s in samples.values()})
     with st.sidebar:
         st.markdown("---")
         st.markdown("**Filter by verdict**")
         }
     filtered = {
+        n: s
+        for n, s in samples.items()
         if selected_verdicts.get(_sample_verdict(s), True)
     }
     if not filtered:
         st.warning("No samples match the current filter.")
         return
+    # ── Header metrics ────────────────────────────────────────────────
     if len(filtered) > 1:
+        tab_names = [
+            f"{Path(n).stem} — {_sample_verdict(s)}" for n, s in filtered.items()
+        ]
         tabs = st.tabs(tab_names)
         for tab, (_, sample) in zip(tabs, filtered.items()):
             with tab:
         _render_sample(next(iter(filtered.values())))
def _render_sample(sample: dict) -> None:
    """Render one sample: headline metrics, per-role token usage, trajectory.

    Args:
        sample: One decoded ``samples/*`` JSON object from the eval log. The
            ``id``, ``scores``, ``total_time``, ``role_usage``, ``events`` and
            ``attachments`` keys are read; each may be missing or null.
    """
    # Use ``or`` rather than only the .get default so an explicit JSON null
    # also falls back to an empty container.
    atts = sample.get("attachments") or {}
    events = sample.get("events") or []
    scores = sample.get("scores")
    # Headline metrics come from the first scorer only.
    first_score = (
        next(iter(scores.values()), None) if isinstance(scores, dict) else None
    )
    if not isinstance(first_score, dict):
        first_score = {}
    score_val = first_score.get("value")
    score_ans = first_score.get("answer")
    # The ".1f" format spec raises ValueError on non-numeric values, so only
    # real numbers get it; any other value is shown verbatim.
    if score_val is None:
        score_text = "?"
    elif isinstance(score_val, (int, float)):
        score_text = f"{score_val:.1f}"
    else:
        score_text = str(score_val)
    elapsed = sample.get("total_time")
    if not isinstance(elapsed, (int, float)):
        elapsed = 0
    col1, col2, col3, col4 = st.columns(4)
    col1.metric("Sample", sample.get("id", "?"))
    col2.metric("Verdict", f"{score_ans}" if score_ans else "?")
    col3.metric("Score", score_text)
    col4.metric("Time", f"{elapsed:.0f}s")
    role_usage = sample.get("role_usage")
    # st.columns needs at least one column, so skip the section when empty.
    if isinstance(role_usage, dict) and role_usage:
        with st.expander("Token usage by role", expanded=False):
            cols = st.columns(len(role_usage))
            for col, (role, usage) in zip(cols, role_usage.items()):
                total = (usage or {}).get("total_tokens") or 0
                col.metric(role, f"{total:,}")
    st.markdown("---")
    steps = parse_events(events, atts)
    render_nav(steps)
    render_steps(steps)
 # Script entry point: build the viewer only when this file is executed
 # directly, not when it is imported as a module.
 if __name__ == "__main__":
     main()