"""Pure transforms on agent-trace event lists. No I/O. Supports two on-disk formats: 1. Claude-Code style — `{type: "user"|"assistant", message: {role, content}}`. Example dataset: `merve/ml-intern-sessions`. 2. pi-sessions style — `{type: "message", message: {role: "user"|"assistant"|"toolResult", content: [...]}}`. Example dataset: `julien-c/pi-sessions`. Tool calls use `toolCall` blocks; tool outputs come back as role=toolResult messages which we drop. """ from typing import Any def event_role(ev: dict) -> str | None: """Normalised role of a trace event. Returns 'user' / 'assistant' or None for non-content events (session metadata, tool results, model_change, etc.).""" t = ev.get("type") if t in ("user", "assistant"): return t if t == "message": msg = ev.get("message") or {} role = msg.get("role") if role in ("user", "assistant"): return role return None return None def event_tool_names(ev: dict) -> list[str]: """Extract tool names invoked in this event, across both formats.""" msg = ev.get("message") or {} content = msg.get("content") if not isinstance(content, list): return [] out: list[str] = [] for block in content: if not isinstance(block, dict): continue bt = block.get("type") if bt in ("tool_use", "toolCall"): name = block.get("name") if isinstance(name, str) and name: out.append(name) return out def _user_content_to_text(content: Any) -> str: if isinstance(content, str): return content if isinstance(content, list): parts: list[str] = [] for block in content: if not isinstance(block, dict): continue if block.get("type") == "tool_result": continue if block.get("type") == "text" and isinstance(block.get("text"), str): parts.append(block["text"]) elif "content" in block and isinstance(block["content"], str) and block.get("type") != "tool_result": parts.append(block["content"]) return "\n".join(parts) return "" def _assistant_content_to_text(content: Any) -> str: """Concatenate text blocks; drop thinking / tool_use / toolCall blocks.""" if isinstance(content, str): return content if isinstance(content, list): parts: list[str] = [] for block in content: if isinstance(block, dict) and block.get("type") == "text" and isinstance(block.get("text"), str): parts.append(block["text"]) return "".join(parts) return "" def events_to_transcript(events: list[dict]) -> str: lines: list[str] = [] for ev in events: role = event_role(ev) if role not in ("user", "assistant"): continue msg = ev.get("message") or {} content = msg.get("content") text = ( _user_content_to_text(content) if role == "user" else _assistant_content_to_text(content) ).strip() if text: label = "User" if role == "user" else "Assistant" lines.append(f"{label}: {text}") return "\n\n".join(lines) def truncate_transcript(text: str, max_chars: int = 40_000) -> str: if len(text) <= max_chars: return text head_len = max_chars // 2 tail_len = max_chars // 4 head = text[:head_len] tail = text[-tail_len:] return f"{head}\n\n[... truncated ...]\n\n{tail}"