Spaces:

Blablablab
/

codebook

Paused

App Files Files Community

codebook / potato /simulator /agent_strategy.py

davidjurgens

Deploy: Potato — Codebook Annotation

aceb1b2 verified 10 days ago

Raw

History Blame Contribute Delete

35.6 kB

	"""
	Agent (vision-LLM) annotation strategy.

	The :class:`AgentSimulatorStrategy` consumes the full structured payload of
	an instance — text fields, dialogue arrays (agent traces, conversations),
	spreadsheet/table data, image references — and asks a vision-capable LLM to
	produce a complete annotation set covering every schema for that instance.

	It mirrors :class:`LLMStrategy` but differs in two important ways:

	1. It reads ``instance["data"]`` (the full raw payload that
	``/api/current_instance`` returns under the ``data`` key) instead of the
	single ``text`` field. This gives the model access to dialogue traces,
	metadata tables, and image URLs.

	2. It batches the per-instance call: a single LLM query produces labels for
	every schema. Subsequent ``generate_annotation`` calls for the same
	instance are served from a per-instance cache. This keeps cost roughly
	1× (instances) instead of (instances × schemas).
	"""

	from __future__ import annotations

	import base64
	import io
	import json
	import logging
	import os
	import random
	import re
	from typing import Any, Dict, List, Optional, Tuple

	from pydantic import BaseModel, Field

	from .annotation_strategies import AnnotationStrategy, RandomStrategy
	from .competence_profiles import CompetenceProfile
	from .config import AgentStrategyConfig

	logger = logging.getLogger(__name__)


	_FIELD_DETECTORS = {
	"dialogue": (
	"conversation",
	"dialogue",
	"trace",
	"messages",
	"turns",
	"structured_turns", # coding-agent traces with role/content/tool_calls
	),
	"spreadsheet": ("metadata_table", "table", "spreadsheet"),
	"image": ("image", "image_url", "screenshot", "screenshot_url", "media", "image_path"),
	}

	# Cap on how much tool I/O text we render per turn to keep prompts bounded.
	_MAX_TOOL_INPUT_CHARS = 400
	_MAX_TOOL_OUTPUT_CHARS = 800


	class _AgentLabelResponse(BaseModel):
	"""Pydantic schema returned by the LLM for a single instance.

	The model returns a flat dict keyed by ``<schema_name>`` whose value is
	either a label string (radio/multiselect/likert) or a numeric value
	(slider/likert as int) or a free-text response (text/textbox). The
	strategy maps these to the wire format the simulator submits.
	"""

	annotations: Dict[str, Any] = Field(
	default_factory=dict,
	description="Mapping of schema_name -> chosen label/value/text.",
	)
	reasoning: str = Field(
	default="",
	description="One sentence explaining the labels (kept short).",
	)


	class AgentSimulatorStrategy(AnnotationStrategy):
	"""Vision-LLM strategy for multi-modal / structured agent content."""

	def __init__(self, config: AgentStrategyConfig):
	self.config = config
	self.endpoint = self._create_endpoint()
	self.random_strategy = RandomStrategy()
	# Per-instance result cache: instance_id -> dict[schema_name -> raw model value]
	self._cache: Dict[str, Dict[str, Any]] = {}
	# Errors are reported per-instance to avoid hammering the LLM with retries
	self._failed_instances: set = set()

	# ------------------------------------------------------------------
	# Endpoint construction
	# ------------------------------------------------------------------

	def _create_endpoint(self):
	try:
	from potato.ai.ai_endpoint import AIEndpointFactory

	ai_cfg: Dict[str, Any] = {
	"model": self.config.model,
	"api_key": self.config.api_key,
	"max_tokens": self.config.max_tokens,
	"temperature": self.config.temperature,
	}
	if self.config.base_url:
	ai_cfg["base_url"] = self.config.base_url

	return AIEndpointFactory.create_endpoint({
	"ai_support": {
	"enabled": True,
	"endpoint_type": self.config.endpoint_type,
	"ai_config": ai_cfg,
	}
	})
	except Exception as e:
	logger.warning("AgentSimulatorStrategy: endpoint init failed: %s", e)
	return None

	# ------------------------------------------------------------------
	# Public API
	# ------------------------------------------------------------------

	def generate_annotation(
	self,
	instance: Dict[str, Any],
	schema: Dict[str, Any],
	competence: CompetenceProfile,
	gold_answer: Optional[Dict[str, Any]] = None,
	) -> Dict[str, Any]:
	if not self.endpoint:
	return self.random_strategy.generate_annotation(
	instance, schema, competence, gold_answer
	)

	instance_id = instance.get("instance_id") or instance.get("id") or ""
	schema_name = schema.get("name")
	annotation_type = schema.get("annotation_type") or schema.get("type")
	labels = self.random_strategy._extract_labels(schema)

	# Per-instance cache: one LLM call answers every schema
	results = self._get_or_query(instance, instance_id)
	if results is None:
	return self.random_strategy.generate_annotation(
	instance, schema, competence, gold_answer
	)

	# Optional noise: mirrors LLMStrategy
	if self.config.add_noise and random.random() < self.config.noise_rate:
	return self.random_strategy.generate_annotation(
	instance, schema, competence, gold_answer
	)

	raw_value = results.get(schema_name)
	if raw_value is None:
	logger.debug(
	"Agent strategy: no value for schema=%s (instance=%s); falling back",
	schema_name, instance_id,
	)
	return self.random_strategy.generate_annotation(
	instance, schema, competence, gold_answer
	)

	formatted = self._format_value(
	schema_name, raw_value, annotation_type, labels, schema,
	instance=instance,
	)
	if not formatted:
	return self.random_strategy.generate_annotation(
	instance, schema, competence, gold_answer
	)
	return formatted

	# ------------------------------------------------------------------
	# Per-instance batch query
	# ------------------------------------------------------------------

	def _get_or_query(
	self, instance: Dict[str, Any], instance_id: str
	) -> Optional[Dict[str, Any]]:
	if self.config.cache_per_instance and instance_id in self._cache:
	return self._cache[instance_id]
	if instance_id in self._failed_instances:
	return None

	schemas = instance.get("__all_schemas__") or instance.get("schemas") or []
	if not schemas:
	# The simulator should be passing schemas via the instance dict
	# (see SimulatedUser.generate_annotations). If it isn't, we can
	# still produce annotations for the single schema by callers
	# passing schema directly each time, but caching is then per-call.
	logger.debug("Agent strategy: no schemas attached to instance %s", instance_id)
	return None

	prompt, image_payloads = self._build_request(instance, schemas)
	try:
	response = self._invoke(prompt, image_payloads)
	except Exception as e:
	logger.warning(
	"Agent strategy: LLM call failed for instance=%s: %s", instance_id, e
	)
	self._failed_instances.add(instance_id)
	return None

	parsed = self._parse_response(response)
	if parsed is None:
	self._failed_instances.add(instance_id)
	return None

	# Models occasionally key by the schema's annotation_type instead of
	# its name (e.g. "code_review" instead of "review"). Re-key to schema
	# names so downstream lookup always works.
	parsed = self._normalize_keys_to_schema_names(parsed, schemas)

	if self.config.cache_per_instance:
	self._cache[instance_id] = parsed
	return parsed

	def _normalize_keys_to_schema_names(
	self, parsed: Dict[str, Any], schemas: List[Dict[str, Any]]
	) -> Dict[str, Any]:
	"""Re-key parsed annotations to match the schema names the
	simulator uses, even if the LLM keyed by annotation_type, label, or
	a case-variant. Idempotent for already-correct keys.
	"""
	if not isinstance(parsed, dict) or not schemas:
	return parsed

	names = {s.get("name"): s for s in schemas if s.get("name")}
	# Exact-name path -- fast and keeps existing behaviour.
	unmatched = [k for k in parsed.keys() if k not in names]
	if not unmatched:
	return parsed

	# Build alternate-key lookup: annotation_type -> schema name.
	type_to_name: Dict[str, str] = {}
	lower_name_to_name: Dict[str, str] = {n.lower(): n for n in names}
	for s in schemas:
	atype = s.get("annotation_type") or s.get("type")
	if atype and atype not in names and atype not in type_to_name:
	type_to_name[atype] = s["name"]

	out = dict(parsed)
	for key in list(unmatched):
	value = out[key]
	target: Optional[str] = None
	if key in type_to_name:
	target = type_to_name[key]
	elif key.lower() in lower_name_to_name:
	target = lower_name_to_name[key.lower()]
	if target and target not in out:
	out[target] = value
	# Keep the original key too -- harmless and aids debugging.
	return out

	# ------------------------------------------------------------------
	# Prompt construction
	# ------------------------------------------------------------------

	def _build_request(
	self,
	instance: Dict[str, Any],
	schemas: List[Dict[str, Any]],
	) -> Tuple[str, List[Any]]:
	data = instance.get("data") or {}
	if not isinstance(data, dict):
	data = {}

	text_blocks: List[str] = []

	# Top-level text/task description
	task_text = (
	data.get("task_description")
	or data.get("text")
	or instance.get("text", "")
	)
	if task_text:
	text_blocks.append(f"## Task\n{task_text}".strip())

	# Dialogue / conversation arrays
	if self.config.include_dialogue_text:
	for key in _FIELD_DETECTORS["dialogue"]:
	value = data.get(key)
	if value:
	rendered = self._render_dialogue(value)
	if rendered:
	text_blocks.append(f"## {key.title()}\n{rendered}")
	break # Only render the first matching dialogue field

	# Spreadsheet / table data
	if self.config.include_spreadsheet:
	for key in _FIELD_DETECTORS["spreadsheet"]:
	value = data.get(key)
	if value:
	rendered = self._render_spreadsheet(value)
	if rendered:
	text_blocks.append(f"## {key.replace('_', ' ').title()}\n{rendered}")
	break

	# Other plain-text fields not already consumed
	consumed = (
	{"task_description", "text", "id"}
	\| set(_FIELD_DETECTORS["dialogue"])
	\| set(_FIELD_DETECTORS["spreadsheet"])
	\| set(_FIELD_DETECTORS["image"])
	\| {"gold_labels"}
	)
	for k, v in data.items():
	if k in consumed or k.startswith("_"):
	continue
	if isinstance(v, (str, int, float)) and str(v).strip():
	text_blocks.append(f"## {k}\n{v}")

	# Schema spec section (instance is needed for step-aware schemas)
	text_blocks.append(self._render_schema_spec(schemas, instance))

	text_blocks.append(
	"Respond with a single JSON object {\"annotations\": {...}, \"reasoning\": \"...\"} "
	"where each key under 'annotations' is exactly the schema name listed above. "
	"The value type matches the schema:\n"
	"- radio / multiselect / likert with named labels: a string label\n"
	"- likert / slider / number without labels: an integer in the allowed range\n"
	"- text / textbox: a short free-text string\n"
	"- multiselect: a JSON array of label strings\n"
	"- process_reward: an integer step index (or null) for first_error mode, "
	"or a JSON array of 1/-1/0 for per_step mode\n"
	"- code_review: a JSON object with verdict, comments, file_ratings keys\n"
	"Always include EVERY schema name as a key under 'annotations'."
	)

	prompt = "\n\n".join(text_blocks)

	# Collect image payloads (paths or URLs)
	image_payloads = self._collect_images(data)
	return prompt, image_payloads

	def _render_dialogue(self, value: Any) -> str:
	if isinstance(value, str):
	return value[: self.config.max_dialogue_chars]
	if not isinstance(value, list):
	return ""
	lines: List[str] = []
	for i, turn in enumerate(value, start=1):
	if isinstance(turn, dict):
	speaker = turn.get("speaker") or turn.get("role") or f"Turn {i}"
	text = turn.get("text") or turn.get("content") or ""
	lines.append(f"{i}. {speaker}: {text}")
	# Coding-agent shape: each turn may carry a list of
	# {tool, input, output, output_type, language} entries.
	# Render them so the LLM rater can see the actions taken.
	tool_calls = turn.get("tool_calls")
	if isinstance(tool_calls, list):
	for call in tool_calls:
	if not isinstance(call, dict):
	continue
	lines.append(self._render_tool_call(call))
	else:
	lines.append(f"{i}. {turn}")
	rendered = "\n".join(lines)
	return rendered[: self.config.max_dialogue_chars]

	def _render_tool_call(self, call: Dict[str, Any]) -> str:
	tool_name = call.get("tool") or call.get("name") or "tool"
	# Inputs may be a dict or a string -- format for readability.
	raw_input = call.get("input") or call.get("arguments") or {}
	if isinstance(raw_input, dict):
	input_str = ", ".join(f"{k}={v!r}" for k, v in raw_input.items())
	else:
	input_str = str(raw_input)
	input_str = input_str[:_MAX_TOOL_INPUT_CHARS]

	output = call.get("output")
	if output is None:
	return f" [tool: {tool_name}({input_str})]"
	output_str = str(output)
	if len(output_str) > _MAX_TOOL_OUTPUT_CHARS:
	output_str = (
	output_str[:_MAX_TOOL_OUTPUT_CHARS]
	+ f"\n [...truncated {len(output_str) - _MAX_TOOL_OUTPUT_CHARS} chars]"
	)
	return f" [tool: {tool_name}({input_str})]\n -> {output_str}"

	def _render_spreadsheet(self, value: Any) -> str:
	if isinstance(value, list) and value and isinstance(value[0], dict):
	keys = list(value[0].keys())
	header = " \| ".join(keys)
	rows = [
	" \| ".join(str(row.get(k, "")) for k in keys) for row in value
	]
	return header + "\n" + "\n".join(rows)
	if isinstance(value, dict):
	return "\n".join(f"{k}: {v}" for k, v in value.items())
	return str(value)

	def _render_schema_spec(
	self,
	schemas: List[Dict[str, Any]],
	instance: Optional[Dict[str, Any]] = None,
	) -> str:
	data = (instance or {}).get("data") or {}
	lines = ["## Schemas to label"]
	for schema in schemas:
	name = schema.get("name", "?")
	atype = schema.get("annotation_type") or schema.get("type") or "?"
	desc = schema.get("description", "")
	labels = self.random_strategy._extract_labels(schema)
	allowed: str
	if labels:
	allowed = "labels=" + ", ".join(labels)
	elif atype == "likert":
	size = schema.get("size", 5)
	allowed = f"integer 1..{size}"
	elif atype in ("slider", "number"):
	lo = schema.get("min_value", schema.get("min", 0))
	hi = schema.get("max_value", schema.get("max", 100))
	allowed = f"integer {lo}..{hi}"
	elif atype == "process_reward":
	steps_key = schema.get("steps_key", "structured_turns")
	steps = data.get(steps_key) if isinstance(data, dict) else None
	n = len(steps) if isinstance(steps, list) else 0
	mode = schema.get("mode", "first_error")
	if mode == "first_error":
	allowed = (
	f"first_error mode: integer 0..{max(n - 1, 0)} "
	f"(index of the first wrong step in the {n}-step trace), "
	"or null if every step is correct"
	)
	else:
	allowed = (
	f"per_step mode: list of {n} entries, each one of "
	"1 (correct), -1 (incorrect), 0 (unmarked)"
	)
	elif atype == "code_review":
	verdicts = schema.get(
	"verdict_options", ["approve", "request_changes", "comment_only"]
	)
	allowed = (
	"object with keys: "
	"verdict (one of " + ", ".join(verdicts) + "), "
	"comments (list of {file, line?, category, body}), "
	"file_ratings (object: filename -> {dim: 1..5})"
	)
	else:
	allowed = "free text"
	lines.append(f"- {name} ({atype}): {desc} [{allowed}]")
	return "\n".join(lines)

	# ------------------------------------------------------------------
	# Image handling
	# ------------------------------------------------------------------

	def _collect_images(self, data: Dict[str, Any]) -> List[Any]:
	"""Return up to ``max_image_count`` ImageData objects."""
	try:
	from potato.ai.ai_endpoint import ImageData
	except Exception:
	return []

	candidates: List[str] = []
	for key in _FIELD_DETECTORS["image"]:
	value = data.get(key)
	if not value:
	continue
	if isinstance(value, str):
	candidates.append(value)
	elif isinstance(value, list):
	for item in value:
	if isinstance(item, str):
	candidates.append(item)

	images: List[Any] = []
	for path_or_url in candidates[: self.config.max_image_count]:
	payload = self._load_image(path_or_url, ImageData)
	if payload is not None:
	images.append(payload)
	return images

	def _load_image(self, path_or_url: str, ImageData):
	try:
	if path_or_url.startswith(("http://", "https://", "data:")):
	# Remote / inline data URI -- pass through unchanged
	return ImageData(url=path_or_url) if hasattr(ImageData, "url") else None

	if not os.path.exists(path_or_url):
	logger.debug("Agent strategy: image not found at %s", path_or_url)
	return None

	with open(path_or_url, "rb") as f:
	raw = f.read()

	if self.config.max_image_dim:
	raw = self._maybe_resize(raw)

	b64 = base64.b64encode(raw).decode("ascii")
	# ImageData supports a few constructor signatures across providers;
	# try the most compatible one first.
	try:
	return ImageData(base64=b64, mime_type=self._guess_mime(path_or_url))
	except TypeError:
	try:
	return ImageData(data=b64, mime_type=self._guess_mime(path_or_url))
	except TypeError:
	return ImageData(b64)
	except Exception as e:
	logger.debug("Agent strategy: failed to load image %s: %s", path_or_url, e)
	return None

	def _maybe_resize(self, raw: bytes) -> bytes:
	try:
	from PIL import Image # noqa: WPS433
	except Exception:
	return raw
	try:
	img = Image.open(io.BytesIO(raw))
	longest = max(img.size)
	if longest <= self.config.max_image_dim:
	return raw
	scale = self.config.max_image_dim / longest
	new_size = (max(1, int(img.size[0] * scale)), max(1, int(img.size[1] * scale)))
	img = img.convert("RGB").resize(new_size)
	buf = io.BytesIO()
	img.save(buf, format="JPEG", quality=85)
	return buf.getvalue()
	except Exception:
	return raw

	def _guess_mime(self, path: str) -> str:
	lowered = path.lower()
	if lowered.endswith(".png"):
	return "image/png"
	if lowered.endswith((".jpg", ".jpeg")):
	return "image/jpeg"
	if lowered.endswith(".webp"):
	return "image/webp"
	if lowered.endswith(".gif"):
	return "image/gif"
	return "image/jpeg"

	# ------------------------------------------------------------------
	# LLM invocation + response parsing
	# ------------------------------------------------------------------

	def _invoke(self, prompt: str, image_payloads: List[Any]) -> Any:
	"""Call the endpoint, preferring vision API when images are present."""
	if image_payloads and hasattr(self.endpoint, "query_with_image"):
	return self.endpoint.query_with_image(
	prompt, image_payloads, _AgentLabelResponse
	)
	return self.endpoint.query(prompt, _AgentLabelResponse)

	def _parse_response(self, response: Any) -> Optional[Dict[str, Any]]:
	if response is None:
	return None
	# Endpoints with structured output return a dict-like object
	if hasattr(response, "model_dump"):
	data = response.model_dump()
	elif isinstance(response, dict):
	data = response
	elif isinstance(response, str):
	data = self._loose_json_parse(response)
	else:
	try:
	data = dict(response)
	except Exception:
	return None
	if not isinstance(data, dict):
	return None
	annotations = data.get("annotations") if isinstance(data, dict) else None
	if isinstance(annotations, dict):
	return annotations
	# Some endpoints return the raw annotations dict directly
	if all(isinstance(k, str) for k in data.keys()) and "reasoning" not in data:
	return data
	return None

	def _loose_json_parse(self, text: str) -> Dict[str, Any]:
	try:
	return json.loads(text)
	except Exception:
	pass
	match = re.search(r"\{.*\}", text, flags=re.DOTALL)
	if match:
	try:
	return json.loads(match.group(0))
	except Exception:
	pass
	return {}

	# ------------------------------------------------------------------
	# Format translation: model output -> wire annotation
	# ------------------------------------------------------------------

	def _format_value(
	self,
	schema_name: str,
	raw_value: Any,
	annotation_type: str,
	labels: List[str],
	schema: Dict[str, Any],
	instance: Optional[Dict[str, Any]] = None,
	) -> Optional[Dict[str, Any]]:
	if annotation_type == "process_reward":
	return self._format_process_reward(
	schema_name, raw_value, schema, instance
	)

	if annotation_type == "code_review":
	return self._format_code_review(schema_name, raw_value, schema)

	if annotation_type == "multiselect":
	chosen = self._coerce_multilabels(raw_value, labels)
	if not chosen:
	return None
	return {f"{schema_name}:{label}": "on" for label in chosen}

	if annotation_type == "radio":
	chosen = self._coerce_label(raw_value, labels)
	if chosen is None:
	return None
	return {f"{schema_name}:{chosen}": "on"}

	if annotation_type == "likert":
	size = schema.get("size", 5)
	chosen = self._coerce_int(raw_value, 1, size)
	if chosen is None and labels:
	# Some likert schemas use named labels (e.g. ["Wrong","Right"])
	lbl = self._coerce_label(raw_value, labels)
	if lbl is not None:
	return {f"{schema_name}:{lbl}": "on"}
	if chosen is None:
	return None
	return {f"{schema_name}:{chosen}": "on"}

	if annotation_type in ("slider", "number"):
	lo = schema.get("min_value", schema.get("min", 0))
	hi = schema.get("max_value", schema.get("max", 100))
	chosen = self._coerce_int(raw_value, lo, hi)
	if chosen is None:
	return None
	return {f"{schema_name}:{chosen}": str(chosen)}

	if annotation_type in ("text", "textbox"):
	return {f"{schema_name}:text": str(raw_value)[:1000]}

	# Unknown type — return string form
	return {f"{schema_name}:{raw_value}": "on"}

	def _coerce_label(self, raw_value: Any, labels: List[str]) -> Optional[str]:
	if not labels:
	return None
	if isinstance(raw_value, str):
	candidate = raw_value.strip()
	for label in labels:
	if label.lower() == candidate.lower():
	return label
	for label in labels:
	if label.lower() in candidate.lower() or candidate.lower() in label.lower():
	return label
	return None

	def _coerce_multilabels(self, raw_value: Any, labels: List[str]) -> List[str]:
	if not labels:
	return []
	if isinstance(raw_value, list):
	chosen: List[str] = []
	for item in raw_value:
	resolved = self._coerce_label(item, labels)
	if resolved and resolved not in chosen:
	chosen.append(resolved)
	return chosen
	if isinstance(raw_value, str):
	parts = re.split(r"[,;\|]", raw_value)
	chosen = []
	for part in parts:
	resolved = self._coerce_label(part, labels)
	if resolved and resolved not in chosen:
	chosen.append(resolved)
	return chosen
	return []

	# ------------------------------------------------------------------
	# Custom-schema wire-format helpers
	# ------------------------------------------------------------------

	def _format_process_reward(
	self,
	schema_name: str,
	raw_value: Any,
	schema: Dict[str, Any],
	instance: Optional[Dict[str, Any]],
	) -> Optional[Dict[str, Any]]:
	"""Build the wire-format payload for a process_reward schema.

	Server expects ``{"<schema>:::<schema>": "<json>"}`` where the JSON
	is ``{"steps": [{"index": N, "reward": 1\|-1\|0}, ...], "mode": ...}``.
	"""
	steps_key = schema.get("steps_key", "structured_turns")
	mode = schema.get("mode", "first_error")
	data = (instance or {}).get("data") or {}
	steps = data.get(steps_key) if isinstance(data, dict) else None
	n = len(steps) if isinstance(steps, list) else 0
	if n == 0:
	return None

	if mode == "first_error":
	first_wrong = self._coerce_first_wrong_index(raw_value, n)
	entries = []
	for idx in range(n):
	if first_wrong is None:
	reward = 1
	elif idx < first_wrong:
	reward = 1
	else:
	reward = -1
	entries.append({"index": idx, "reward": reward})
	else:
	entries = self._coerce_per_step_rewards(raw_value, n)
	if entries is None:
	return None

	payload = {"steps": entries, "mode": mode}
	return {f"{schema_name}:::{schema_name}": json.dumps(payload)}

	def _coerce_first_wrong_index(self, raw_value: Any, n: int) -> Optional[int]:
	"""Interpret the LLM's first-error response as an int in 0..n-1 or None."""
	if raw_value is None:
	return None
	if isinstance(raw_value, str) and raw_value.strip().lower() in (
	"null", "none", "all_correct", "n/a", ""
	):
	return None
	if isinstance(raw_value, dict):
	for key in ("first_wrong", "first_error", "index", "step"):
	if key in raw_value:
	return self._coerce_first_wrong_index(raw_value[key], n)
	return None
	idx = self._coerce_int(raw_value, 0, max(n - 1, 0))
	return idx

	def _coerce_per_step_rewards(
	self, raw_value: Any, n: int
	) -> Optional[List[Dict[str, int]]]:
	"""Interpret the LLM's per_step response as a list of n {index,reward} entries."""
	items: List[int] = []
	if isinstance(raw_value, list):
	for v in raw_value:
	if isinstance(v, dict) and "reward" in v:
	items.append(self._normalize_reward(v["reward"]))
	else:
	items.append(self._normalize_reward(v))
	elif isinstance(raw_value, str):
	for part in re.split(r"[\s,;\|]+", raw_value):
	if not part:
	continue
	items.append(self._normalize_reward(part))
	else:
	return None

	if len(items) < n:
	items.extend([0] * (n - len(items)))
	items = items[:n]
	return [{"index": i, "reward": r} for i, r in enumerate(items)]

	def _normalize_reward(self, value: Any) -> int:
	"""Map various encodings to the server's {1, -1, 0} reward space."""
	if isinstance(value, str):
	v = value.strip().lower()
	if v in ("1", "+1", "correct", "good", "true", "yes", "ok"):
	return 1
	if v in ("-1", "incorrect", "wrong", "bad", "false", "no"):
	return -1
	return 0
	try:
	i = int(value)
	except (TypeError, ValueError):
	return 0
	if i > 0:
	return 1
	if i < 0:
	return -1
	return 0

	def _format_code_review(
	self,
	schema_name: str,
	raw_value: Any,
	schema: Dict[str, Any],
	) -> Optional[Dict[str, Any]]:
	"""Build the wire-format payload for a code_review schema.

	Server expects ``{"<schema>:::<schema>": "<json>"}`` where the JSON
	is ``{"verdict": "...", "comments": [...], "file_ratings": {...}}``.
	"""
	verdicts = schema.get(
	"verdict_options",
	["approve", "request_changes", "comment_only"],
	)
	categories = schema.get(
	"comment_categories",
	["bug", "style", "suggestion", "security", "question"],
	)
	rating_dims = schema.get(
	"file_rating_dimensions",
	["correctness", "readability", "maintainability"],
	)

	verdict, comments, file_ratings = "comment_only", [], {}

	if isinstance(raw_value, dict):
	v = raw_value.get("verdict")
	if isinstance(v, str):
	v_lower = v.strip().lower()
	for option in verdicts:
	if v_lower == option.lower() or v_lower in option.lower():
	verdict = option
	break

	raw_comments = raw_value.get("comments") or []
	if isinstance(raw_comments, list):
	for c in raw_comments:
	if not isinstance(c, dict):
	continue
	body = str(c.get("body") or c.get("text") or c.get("comment") or "").strip()
	if not body:
	continue
	cat = str(c.get("category") or "").strip().lower()
	if cat not in {x.lower() for x in categories}:
	cat = categories[0]
	else:
	# restore original casing
	cat = next(x for x in categories if x.lower() == cat)
	entry = {
	"category": cat,
	"body": body[:1000],
	}
	if c.get("file"):
	entry["file"] = str(c["file"])
	line = c.get("line")
	if isinstance(line, int):
	entry["line"] = line
	comments.append(entry)

	raw_ratings = raw_value.get("file_ratings") or raw_value.get("ratings") or {}
	if isinstance(raw_ratings, dict):
	for filename, dims in raw_ratings.items():
	if not isinstance(dims, dict):
	continue
	clean_dims: Dict[str, int] = {}
	for dim, score in dims.items():
	dim_match = next(
	(d for d in rating_dims if d.lower() == str(dim).lower()),
	None,
	)
	if dim_match is None:
	continue
	clamped = self._coerce_int(score, 1, 5)
	if clamped is not None:
	clean_dims[dim_match] = clamped
	if clean_dims:
	file_ratings[str(filename)] = clean_dims

	elif isinstance(raw_value, str):
	v_lower = raw_value.strip().lower()
	for option in verdicts:
	if v_lower == option.lower() or v_lower in option.lower():
	verdict = option
	break

	payload = {
	"verdict": verdict,
	"comments": comments,
	"file_ratings": file_ratings,
	}
	return {f"{schema_name}:::{schema_name}": json.dumps(payload)}

	def _coerce_int(self, raw_value: Any, lo: int, hi: int) -> Optional[int]:
	try:
	value = int(float(raw_value))
	except (TypeError, ValueError):
	if isinstance(raw_value, str):
	m = re.search(r"-?\d+", raw_value)
	if m:
	try:
	value = int(m.group(0))
	except ValueError:
	return None
	else:
	return None
	else:
	return None
	if value < lo:
	value = lo
	elif value > hi:
	value = hi
	return value