| """Pure transforms on agent-trace event lists. No I/O. |
| |
| Supports two on-disk formats: |
| 1. Claude-Code style — `{type: "user"|"assistant", message: {role, content}}`. |
| Example dataset: `merve/ml-intern-sessions`. |
| 2. pi-sessions style — `{type: "message", message: {role: "user"|"assistant"|"toolResult", content: [...]}}`. |
| Example dataset: `julien-c/pi-sessions`. Tool calls use `toolCall` blocks; |
| tool outputs come back as role=toolResult messages which we drop. |
| """ |
|
|
| from typing import Any |
|
|
|
|
| def event_role(ev: dict) -> str | None: |
| """Normalised role of a trace event. Returns 'user' / 'assistant' or None |
| for non-content events (session metadata, tool results, model_change, etc.).""" |
| t = ev.get("type") |
| if t in ("user", "assistant"): |
| return t |
| if t == "message": |
| msg = ev.get("message") or {} |
| role = msg.get("role") |
| if role in ("user", "assistant"): |
| return role |
| return None |
| return None |
|
|
|
|
def event_tool_names(ev: dict) -> list[str]:
    """List the names of the tools called in this event, in block order.

    Both block spellings are recognised: ``tool_use`` and ``toolCall``.
    Events whose ``message.content`` is not a list produce an empty list, and
    blocks that are not dicts or lack a non-empty string ``name`` are skipped.
    """
    blocks = (ev.get("message") or {}).get("content")
    if not isinstance(blocks, list):
        return []
    return [
        item["name"]
        for item in blocks
        if isinstance(item, dict)
        and item.get("type") in ("tool_use", "toolCall")
        and isinstance(item.get("name"), str)
        and item["name"]
    ]
|
|
|
|
def _user_content_to_text(content: Any) -> str:
    """Flatten a user message's content into plain text.

    A string is returned unchanged. For a list, every dict block other than
    ``tool_result`` contributes either its ``text`` (for ``text`` blocks) or,
    failing that, its string-valued ``content`` field; the pieces are joined
    with newlines. Any other content type gives an empty string.
    """
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return ""

    pieces: list[str] = []
    for item in content:
        if not isinstance(item, dict):
            continue
        kind = item.get("type")
        if kind == "tool_result":
            # Tool outputs are not user-authored text.
            continue
        if kind == "text" and isinstance(item.get("text"), str):
            pieces.append(item["text"])
            continue
        # Fallback for blocks that carry their text under "content".
        nested = item.get("content")
        if isinstance(nested, str):
            pieces.append(nested)
    return "\n".join(pieces)
|
|
|
|
def _assistant_content_to_text(content: Any) -> str:
    """Flatten an assistant message's content into plain text.

    A string is returned unchanged. For a list, only ``text`` blocks with a
    string ``text`` field are kept (thinking / tool_use / toolCall blocks are
    dropped) and their texts are concatenated with no separator. Any other
    content type gives an empty string.
    """
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return ""
    return "".join(
        item["text"]
        for item in content
        if isinstance(item, dict)
        and item.get("type") == "text"
        and isinstance(item.get("text"), str)
    )
|
|
|
|
def events_to_transcript(events: list[dict]) -> str:
    """Render trace events as a ``User: ...`` / ``Assistant: ...`` transcript.

    Events without a user/assistant role are skipped, as are turns whose
    extracted text is empty after stripping whitespace. The remaining turns
    are separated by blank lines.
    """
    # role -> (label shown in the transcript, content-to-text converter)
    renderers = {
        "user": ("User", _user_content_to_text),
        "assistant": ("Assistant", _assistant_content_to_text),
    }
    turns: list[str] = []
    for event in events:
        entry = renderers.get(event_role(event))
        if entry is None:
            continue
        speaker, render = entry
        payload = (event.get("message") or {}).get("content")
        body = render(payload).strip()
        if body:
            turns.append(f"{speaker}: {body}")
    return "\n\n".join(turns)
|
|
|
|
def truncate_transcript(text: str, max_chars: int = 40_000) -> str:
    """Shorten an over-long transcript by cutting out its middle.

    Text no longer than ``max_chars`` is returned unchanged. Otherwise the
    result is the first ``max_chars // 2`` characters, a
    ``[... truncated ...]`` marker, and the last ``max_chars // 4``
    characters. The marker itself is not counted against ``max_chars``.
    A negative ``max_chars`` is treated as 0.
    """
    budget = max(max_chars, 0)
    if len(text) <= budget:
        return text
    head_len = budget // 2
    tail_len = budget // 4
    head = text[:head_len]
    # text[-0:] is the whole string, so a zero-length tail must be spelled out
    # explicitly or tiny budgets would return more than the input.
    tail = text[-tail_len:] if tail_len else ""
    return f"{head}\n\n[... truncated ...]\n\n{tail}"
|
|