"""Trajectory Evaluation Layout.

Per-step error marking for agent traces. For each step the annotator marks:
correctness (correct / incorrect / partially_correct), error type from a
configurable taxonomy, severity, and a free-text rationale.

Research: TRAIL (Trace Reasoning and Agentic Issue Localization),
AgentRewardBench, Anthropic "Demystifying Evals for AI Agents".
"""

import json
import logging
from typing import Dict, Any, Tuple, List

from .identifier_utils import (
    safe_generate_layout,
    generate_element_identifier,
    generate_validation_attribute,
    escape_html_content,
    generate_layout_attributes,
)

logger = logging.getLogger(__name__)

DEFAULT_CORRECTNESS = ["correct", "incorrect", "partially_correct"]
DEFAULT_SEVERITIES = [
    {"name": "minor", "weight": -1},
    {"name": "major", "weight": -5},
    {"name": "critical", "weight": -10},
]


def generate_trajectory_eval_layout(
    annotation_scheme: Dict[str, Any],
) -> Tuple[str, List[Tuple[str, str]]]:
    """Generate HTML for a trajectory evaluation interface.

    Args:
        annotation_scheme: Configuration dict. Required keys: ``name``,
            ``description``. Optional: ``steps_key``, ``step_text_key``,
            ``correctness_options``, ``error_types``, ``severities``,
            ``show_score``, ``max_score``.

    Returns:
        ``(html, keybindings)`` tuple.
    """
    return safe_generate_layout(annotation_scheme, _generate_internal)


def _build_error_type_options(error_types: List[Dict[str, Any]]) -> str:
    """Return the ``<option>`` / ``<optgroup>`` markup for the error taxonomy.

    Each entry must have a ``name``; an optional ``subtypes`` list turns the
    entry into an option group whose option values are ``name::subtype``.
    """
    # The accumulator starts with an empty-valued placeholder so the select
    # has a "nothing chosen" state.
    # NOTE(review): placeholder wording is a guess -- confirm with the UI.
    parts = ['<option value="">-- Select error type --</option>']
    for error_type in error_types:
        name = escape_html_content(error_type["name"])
        subtypes = error_type.get("subtypes", [])
        if subtypes:
            parts.append(f'<optgroup label="{name}">')
            for subtype in subtypes:
                sub = escape_html_content(subtype)
                parts.append(f'<option value="{name}::{sub}">{sub}</option>')
            parts.append("</optgroup>")
        else:
            parts.append(f'<option value="{name}">{name}</option>')
    return "".join(parts)


def _build_severity_radios(schema_name: str, severities: List[Dict[str, Any]]) -> str:
    """Return one labelled radio input per configured severity."""
    group = escape_html_content(f"{schema_name}_severity")
    parts = []
    for severity in severities:
        name = escape_html_content(severity["name"])
        # Weight is exposed as a data attribute so a client script can
        # compute a score; a missing weight is treated as 0.
        weight = escape_html_content(str(severity.get("weight", 0)))
        parts.append(
            f"""
            <label class="traj-severity-option">
                <input type="radio" class="traj-severity-radio" name="{group}"
                       value="{name}" data-weight="{weight}"> {name}
            </label>"""
        )
    return "".join(parts)


def _build_correctness_buttons(correctness_options: List[str]) -> str:
    """Return one button per correctness option (``a_b`` is shown as ``A B``)."""
    parts = []
    for option in correctness_options:
        label = escape_html_content(option.replace("_", " ").title())
        value = escape_html_content(option)
        css_cls = f"traj-correctness-{value}"
        parts.append(
            f'<button type="button" class="traj-correctness-btn {css_cls}" '
            f'data-value="{value}">{label}</button> '
        )
    return "".join(parts)


def _generate_internal(
    annotation_scheme: Dict[str, Any],
) -> Tuple[str, List[Tuple[str, str]]]:
    """Build the trajectory-evaluation markup for one annotation scheme.

    Args:
        annotation_scheme: Scheme configuration; see
            ``generate_trajectory_eval_layout`` for the recognised keys.

    Returns:
        ``(html, keybindings)``; this layout defines no keybindings, so the
        second element is always an empty list.

    Raises:
        KeyError: If ``name`` or ``description`` is missing from the scheme,
            or an error type / severity entry has no ``name``.
    """
    schema_name = annotation_scheme["name"]
    description = annotation_scheme["description"]

    steps_key = annotation_scheme.get("steps_key", "steps")
    step_text_key = annotation_scheme.get("step_text_key", "action")
    correctness_options = annotation_scheme.get("correctness_options", DEFAULT_CORRECTNESS)
    error_types = annotation_scheme.get("error_types", [])
    severities = annotation_scheme.get("severities", DEFAULT_SEVERITIES)
    show_score = annotation_scheme.get("show_score", True)
    max_score = annotation_scheme.get("max_score", 100)

    layout_attrs = generate_layout_attributes(annotation_scheme)
    validation = generate_validation_attribute(annotation_scheme)
    identifiers = generate_element_identifier(schema_name, schema_name, "hidden")

    # Serialize config for the client-side script. "<" is written as a JSON
    # unicode escape so the payload can never terminate the enclosing
    # <script> element or open an HTML comment inside it.
    config_json = json.dumps({
        "steps_key": steps_key,
        "step_text_key": step_text_key,
        "correctness_options": correctness_options,
        "error_types": error_types,
        "severities": severities,
        "show_score": show_score,
        "max_score": max_score,
    }).replace("<", "\\u003c")

    type_options_html = _build_error_type_options(error_types)
    severity_radios_html = _build_severity_radios(schema_name, severities)
    correctness_btns = _build_correctness_buttons(correctness_options)

    esc_schema = escape_html_content(schema_name)
    esc_max_score = escape_html_content(str(max_score))

    score_html = ""
    if show_score:
        score_html = f"""
        <div class="traj-score">
            Score: <span class="traj-score-value">{esc_max_score}</span> / {esc_max_score}
        </div>
        """

    # NOTE(review): element/class names and the identifiers["id"/"name"] keys
    # below are assumptions -- confirm against the front-end script and
    # identifier_utils before merging. The per-step controls live in a
    # <template> so a client script can instantiate them once per step.
    html = f"""
    <form id="{esc_schema}" class="annotation-form trajectory-eval" {layout_attrs}>
        <fieldset schema="{esc_schema}">
            <legend>{escape_html_content(description)}</legend>
            {score_html}
            <div class="traj-steps" data-schema="{esc_schema}"></div>
            <template class="traj-step-template">
                <div class="traj-step">
                    <div class="traj-step-text"></div>
                    <div class="traj-correctness">{correctness_btns}</div>
                    <select class="traj-error-type">{type_options_html}</select>
                    <div class="traj-severity">{severity_radios_html}</div>
                    <textarea class="traj-rationale" rows="2"></textarea>
                </div>
            </template>
            <input type="hidden" class="annotation-input traj-eval-value"
                   id="{identifiers["id"]}" name="{identifiers["name"]}"
                   value="" {validation}>
            <script type="application/json" class="traj-eval-config">{config_json}</script>
        </fieldset>
    </form>
    """

    logger.info("Generated trajectory eval layout for %s", schema_name)
    return html, []