"""Preference-feedback and blind human-eval helpers for Maris training artifacts.""" from __future__ import annotations import json import random from dataclasses import dataclass from pathlib import Path from typing import Any @dataclass(frozen=True, slots=True) class PreferenceExample: prompt: str chosen: str rejected: str source: str annotator: str | None = None reviewer_segment: str | None = None edit_target: str | None = None context: str | None = None branch: str | None = None task_type: str | None = None language: str | None = None source_type: str | None = None risk_level: str | None = None grounding_scope: str | None = None failure_bucket: str | None = None preference_outcome: str | None = None confidence: float | None = None pair_id: str | None = None blind: bool = False production_like: bool = False multi_turn: bool = False repo_context: tuple[str, ...] = () execution_required: bool = False tags: tuple[str, ...] = () def load_preference_dataset(path: str | Path) -> list[PreferenceExample]: raw = json.loads(Path(path).read_text(encoding="utf-8")) entries = raw.get("preferences", raw) if isinstance(raw, dict) else raw if not isinstance(entries, list): raise ValueError("Preference datasetam jābūt sarakstam vai objektam ar `preferences`.") examples: list[PreferenceExample] = [] for entry in entries: if not isinstance(entry, dict): raise ValueError("Katram preference ierakstam jābūt JSON objektam.") prompt = str(entry.get("prompt", "")).strip() chosen = str(entry.get("chosen", "")).strip() rejected = str(entry.get("rejected", "")).strip() source = str(entry.get("source", "")).strip() if not prompt or not chosen or not rejected or not source: raise ValueError( "Preference ierakstam obligāti vajag `prompt`, `chosen`, `rejected` un `source`." ) examples.append( PreferenceExample( prompt=prompt, chosen=chosen, rejected=rejected, source=source, annotator=_normalize_optional_text(entry.get("annotator")), reviewer_segment=_normalize_optional_text(entry.get("reviewer_segment")), edit_target=_normalize_optional_text(entry.get("edit_target")), context=_normalize_optional_text(entry.get("context")), branch=_normalize_optional_text(entry.get("branch")), task_type=_normalize_optional_text(entry.get("task_type")), language=_normalize_optional_text(entry.get("language")), source_type=_normalize_optional_text(entry.get("source_type")) or source, risk_level=_normalize_optional_text(entry.get("risk_level")), grounding_scope=_normalize_optional_text(entry.get("grounding_scope")), failure_bucket=_normalize_optional_text(entry.get("failure_bucket")), preference_outcome=_normalize_optional_text(entry.get("preference_outcome")) or "chosen", confidence=_normalize_confidence(entry.get("confidence")), pair_id=_normalize_optional_text(entry.get("pair_id")), blind=bool(entry.get("blind", False)), production_like=bool(entry.get("production_like", False)), multi_turn=bool(entry.get("multi_turn", False)), repo_context=_normalize_list(entry.get("repo_context")), execution_required=bool(entry.get("execution_required", False)), tags=_normalize_list(entry.get("tags")), ) ) return examples def summarize_preference_dataset(examples: list[PreferenceExample]) -> dict[str, Any]: sources: dict[str, int] = {} source_types: dict[str, int] = {} branches: dict[str, int] = {} task_types: dict[str, int] = {} languages: dict[str, int] = {} reviewer_segments: dict[str, int] = {} risk_levels: dict[str, int] = {} grounding_scopes: dict[str, int] = {} failure_buckets: dict[str, int] = {} tags: dict[str, int] = {} edited_examples = 0 execution_required_examples = 0 blind_examples = 0 production_like_examples = 0 multi_turn_examples = 0 real_reviewer_examples = 0 completed_pairwise = 0 chosen_wins = 0 confidence_values: list[float] = [] for example in examples: sources[example.source] = sources.get(example.source, 0) + 1 if example.source_type: source_types[example.source_type] = source_types.get(example.source_type, 0) + 1 if example.source_type == "real_reviewer": real_reviewer_examples += 1 if example.edit_target: edited_examples += 1 if example.branch: branches[example.branch] = branches.get(example.branch, 0) + 1 if example.task_type: task_types[example.task_type] = task_types.get(example.task_type, 0) + 1 if example.language: languages[example.language] = languages.get(example.language, 0) + 1 if example.reviewer_segment: reviewer_segments[example.reviewer_segment] = ( reviewer_segments.get(example.reviewer_segment, 0) + 1 ) if example.risk_level: risk_levels[example.risk_level] = risk_levels.get(example.risk_level, 0) + 1 if example.grounding_scope: grounding_scopes[example.grounding_scope] = ( grounding_scopes.get(example.grounding_scope, 0) + 1 ) if example.failure_bucket: failure_buckets[example.failure_bucket] = ( failure_buckets.get(example.failure_bucket, 0) + 1 ) if example.execution_required: execution_required_examples += 1 if example.blind: blind_examples += 1 if example.production_like: production_like_examples += 1 if example.multi_turn: multi_turn_examples += 1 if example.preference_outcome in {"chosen", "rejected", "tie"}: completed_pairwise += 1 if example.preference_outcome == "chosen": chosen_wins += 1 if example.confidence is not None: confidence_values.append(example.confidence) for tag in example.tags: tags[tag] = tags.get(tag, 0) + 1 return { "artifact_type": "preference-dataset-summary", "total_examples": len(examples), "sources": dict(sorted(sources.items())), "source_types": dict(sorted(source_types.items())), "branches": dict(sorted(branches.items())), "task_types": dict(sorted(task_types.items())), "languages": dict(sorted(languages.items())), "reviewer_segments": dict(sorted(reviewer_segments.items())), "risk_levels": dict(sorted(risk_levels.items())), "grounding_scopes": dict(sorted(grounding_scopes.items())), "failure_buckets": dict(sorted(failure_buckets.items())), "edited_examples": edited_examples, "execution_required_examples": execution_required_examples, "blind_examples": blind_examples, "production_like_examples": production_like_examples, "multi_turn_examples": multi_turn_examples, "real_reviewer_examples": real_reviewer_examples, "pairwise_completed_examples": completed_pairwise, "pairwise_win_rate": round(chosen_wins / completed_pairwise, 3) if completed_pairwise else 0.0, "average_confidence": round(sum(confidence_values) / len(confidence_values), 3) if confidence_values else 0.0, "tags": dict(sorted(tags.items())), } def build_blind_side_by_side_artifact( examples: list[PreferenceExample], *, seed: int = 0, ) -> dict[str, Any]: rng = random.Random(seed) pairs: list[dict[str, Any]] = [] for index, example in enumerate(examples, start=1): candidates = [ {"slot": "A", "response": example.chosen}, {"slot": "B", "response": example.rejected}, ] rng.shuffle(candidates) for slot_index, candidate in enumerate(candidates): candidate["slot"] = "A" if slot_index == 0 else "B" pairs.append( { "pair_id": example.pair_id or f"pair-{index:04d}", "prompt": example.prompt, "context": example.context or "", "reviewer_segment": example.reviewer_segment or "general", "task_type": example.task_type or "general", "risk_level": example.risk_level or "standard", "grounding_scope": example.grounding_scope or "unspecified", "failure_bucket": example.failure_bucket or "general", "production_like": example.production_like, "multi_turn": example.multi_turn, "candidates": candidates, "review_fields": ["winner", "confidence", "rationale"], } ) return { "artifact_type": "blind-side-by-side-eval-set", "blinding_method": "candidate order randomized; source, branch, model, and annotator hidden", "total_pairs": len(pairs), "pairs": pairs, } def build_human_eval_summary(examples: list[PreferenceExample]) -> dict[str, Any]: summary = summarize_preference_dataset(examples) return { "artifact_type": "human-eval-summary", "total_examples": summary["total_examples"], "blind_examples": summary["blind_examples"], "completed_pairwise_examples": summary["pairwise_completed_examples"], "pairwise_win_rate": summary["pairwise_win_rate"], "average_confidence": summary["average_confidence"], "real_reviewer_examples": summary["real_reviewer_examples"], "source_types": summary["source_types"], "reviewer_segments": summary["reviewer_segments"], "risk_levels": summary["risk_levels"], "grounding_scopes": summary["grounding_scopes"], "failure_buckets": summary["failure_buckets"], "production_like_examples": summary["production_like_examples"], "multi_turn_examples": summary["multi_turn_examples"], } def _normalize_optional_text(value: Any) -> str | None: normalized = str(value or "").strip() return normalized or None def _normalize_confidence(value: Any) -> float | None: if value in (None, ""): return None try: confidence = float(value) except (TypeError, ValueError) as exc: raise ValueError("Preference confidence jābūt skaitlim diapazonā 0..1.") from exc if confidence < 0.0 or confidence > 1.0: raise ValueError("Preference confidence jābūt skaitlim diapazonā 0..1.") return round(confidence, 3) def _normalize_list(value: Any) -> tuple[str, ...]: if not isinstance(value, list): return () return tuple(str(item).strip() for item in value if str(item).strip())