# MarisUK's picture
# Maris AI model sync
# f440f03 verified
"""Preference-feedback and blind human-eval helpers for Maris training artifacts."""
from __future__ import annotations
import json
import random
from dataclasses import dataclass
from pathlib import Path
from typing import Any
@dataclass(frozen=True, slots=True)
class PreferenceExample:
prompt: str
chosen: str
rejected: str
source: str
annotator: str | None = None
reviewer_segment: str | None = None
edit_target: str | None = None
context: str | None = None
branch: str | None = None
task_type: str | None = None
language: str | None = None
source_type: str | None = None
risk_level: str | None = None
grounding_scope: str | None = None
failure_bucket: str | None = None
preference_outcome: str | None = None
confidence: float | None = None
pair_id: str | None = None
blind: bool = False
production_like: bool = False
multi_turn: bool = False
repo_context: tuple[str, ...] = ()
execution_required: bool = False
tags: tuple[str, ...] = ()
def load_preference_dataset(path: str | Path) -> list[PreferenceExample]:
    """Load and validate preference examples from a UTF-8 JSON file.

    The file must hold either a JSON list of entry objects, or a JSON object
    whose ``"preferences"`` value is such a list.

    Args:
        path: Location of the JSON file.

    Returns:
        One ``PreferenceExample`` per entry, in file order.

    Raises:
        ValueError: If the entries are not a list, an entry is not a JSON
            object, or any of ``prompt``, ``chosen``, ``rejected``, ``source``
            is missing, ``null`` or blank. Errors raised while reading or
            parsing the file, or while normalizing ``confidence``, propagate
            unchanged.
    """
    raw = json.loads(Path(path).read_text(encoding="utf-8"))
    entries = raw.get("preferences", raw) if isinstance(raw, dict) else raw
    if not isinstance(entries, list):
        raise ValueError("Preference datasetam jābūt sarakstam vai objektam ar `preferences`.")

    examples: list[PreferenceExample] = []
    for entry in entries:
        if not isinstance(entry, dict):
            raise ValueError("Katram preference ierakstam jābūt JSON objektam.")

        # JSON null arrives as None, and str(None) is the non-empty text
        # "None", which would slip past the required-field check below.
        # Treat an explicit null exactly like a missing key.
        required: dict[str, str] = {}
        for key in ("prompt", "chosen", "rejected", "source"):
            value = entry.get(key)
            required[key] = "" if value is None else str(value).strip()
        if not all(required.values()):
            raise ValueError(
                "Preference ierakstam obligāti vajag `prompt`, `chosen`, `rejected` un `source`."
            )

        source = required["source"]
        examples.append(
            PreferenceExample(
                prompt=required["prompt"],
                chosen=required["chosen"],
                rejected=required["rejected"],
                source=source,
                annotator=_normalize_optional_text(entry.get("annotator")),
                reviewer_segment=_normalize_optional_text(entry.get("reviewer_segment")),
                edit_target=_normalize_optional_text(entry.get("edit_target")),
                context=_normalize_optional_text(entry.get("context")),
                branch=_normalize_optional_text(entry.get("branch")),
                task_type=_normalize_optional_text(entry.get("task_type")),
                language=_normalize_optional_text(entry.get("language")),
                # An absent source type falls back to the source label.
                source_type=_normalize_optional_text(entry.get("source_type")) or source,
                risk_level=_normalize_optional_text(entry.get("risk_level")),
                grounding_scope=_normalize_optional_text(entry.get("grounding_scope")),
                failure_bucket=_normalize_optional_text(entry.get("failure_bucket")),
                # An absent outcome is recorded as a win for ``chosen``.
                preference_outcome=_normalize_optional_text(entry.get("preference_outcome"))
                or "chosen",
                confidence=_normalize_confidence(entry.get("confidence")),
                pair_id=_normalize_optional_text(entry.get("pair_id")),
                blind=bool(entry.get("blind", False)),
                production_like=bool(entry.get("production_like", False)),
                multi_turn=bool(entry.get("multi_turn", False)),
                repo_context=_normalize_list(entry.get("repo_context")),
                execution_required=bool(entry.get("execution_required", False)),
                tags=_normalize_list(entry.get("tags")),
            )
        )
    return examples
def summarize_preference_dataset(examples: list[PreferenceExample]) -> dict[str, Any]:
    """Aggregate counts and rates over a list of preference examples.

    Args:
        examples: The examples to summarize.

    Returns:
        A ``preference-dataset-summary`` dict containing: the example total;
        key-sorted histograms for source, tags and each optional categorical
        field (empty values are skipped for the optional fields); counters
        for the truthy flags; the number of examples whose outcome is
        ``chosen``/``rejected``/``tie``; the share of those won by ``chosen``;
        and the mean of the non-``None`` confidences. Both ratios are rounded
        to three decimals and are ``0.0`` when there is nothing to divide by.
    """

    def bump(histogram: dict[str, int], key: str) -> None:
        histogram[key] = histogram.get(key, 0) + 1

    def by_key(histogram: dict[str, int]) -> dict[str, int]:
        return dict(sorted(histogram.items()))

    # Categorical attributes that are only counted when they hold a value.
    categorical = (
        "source_type",
        "branch",
        "task_type",
        "language",
        "reviewer_segment",
        "risk_level",
        "grounding_scope",
        "failure_bucket",
    )
    # Attributes counted once per example whenever they are truthy.
    flags = ("edit_target", "execution_required", "blind", "production_like", "multi_turn")

    histograms: dict[str, dict[str, int]] = {name: {} for name in categorical}
    flag_totals: dict[str, int] = dict.fromkeys(flags, 0)
    by_source: dict[str, int] = {}
    by_tag: dict[str, int] = {}
    reviewer_total = 0
    decided = 0
    wins = 0
    confidences: list[float] = []

    for item in examples:
        bump(by_source, item.source)

        for name in categorical:
            value = getattr(item, name)
            if value:
                bump(histograms[name], value)
        for name in flags:
            if getattr(item, name):
                flag_totals[name] += 1

        if item.source_type == "real_reviewer":
            reviewer_total += 1

        outcome = item.preference_outcome
        if outcome in {"chosen", "rejected", "tie"}:
            decided += 1
        if outcome == "chosen":
            wins += 1

        if item.confidence is not None:
            confidences.append(item.confidence)
        for tag in item.tags:
            bump(by_tag, tag)

    if decided:
        win_rate = round(wins / decided, 3)
    else:
        win_rate = 0.0
    if confidences:
        mean_confidence = round(sum(confidences) / len(confidences), 3)
    else:
        mean_confidence = 0.0

    return {
        "artifact_type": "preference-dataset-summary",
        "total_examples": len(examples),
        "sources": by_key(by_source),
        "source_types": by_key(histograms["source_type"]),
        "branches": by_key(histograms["branch"]),
        "task_types": by_key(histograms["task_type"]),
        "languages": by_key(histograms["language"]),
        "reviewer_segments": by_key(histograms["reviewer_segment"]),
        "risk_levels": by_key(histograms["risk_level"]),
        "grounding_scopes": by_key(histograms["grounding_scope"]),
        "failure_buckets": by_key(histograms["failure_bucket"]),
        "edited_examples": flag_totals["edit_target"],
        "execution_required_examples": flag_totals["execution_required"],
        "blind_examples": flag_totals["blind"],
        "production_like_examples": flag_totals["production_like"],
        "multi_turn_examples": flag_totals["multi_turn"],
        "real_reviewer_examples": reviewer_total,
        "pairwise_completed_examples": decided,
        "pairwise_win_rate": win_rate,
        "average_confidence": mean_confidence,
        "tags": by_key(by_tag),
    }
def build_blind_side_by_side_artifact(
    examples: list[PreferenceExample],
    *,
    seed: int = 0,
) -> dict[str, Any]:
    """Build a side-by-side review set with the response order randomized.

    For every example the ``chosen`` and ``rejected`` responses are shuffled
    with a ``random.Random(seed)`` generator and then labelled ``A`` and ``B``
    by position. The emitted pair carries the prompt and review metadata but
    not the example's source, branch or annotator.

    Args:
        examples: Preference examples to turn into review pairs.
        seed: Seed for the shuffle; the same seed and input give the same
            artifact.

    Returns:
        A ``blind-side-by-side-eval-set`` dict holding the pair count and the
        list of pairs, in input order.
    """
    shuffler = random.Random(seed)

    def blind_pair(position: int, item: PreferenceExample) -> dict[str, Any]:
        # One shuffle of a two-element list per example, in input order.
        responses = [item.chosen, item.rejected]
        shuffler.shuffle(responses)
        labelled = [
            {"slot": label, "response": text}
            for label, text in zip("AB", responses)
        ]
        return {
            # Examples without their own id get a 1-based positional one.
            "pair_id": item.pair_id or f"pair-{position:04d}",
            "prompt": item.prompt,
            "context": item.context or "",
            "reviewer_segment": item.reviewer_segment or "general",
            "task_type": item.task_type or "general",
            "risk_level": item.risk_level or "standard",
            "grounding_scope": item.grounding_scope or "unspecified",
            "failure_bucket": item.failure_bucket or "general",
            "production_like": item.production_like,
            "multi_turn": item.multi_turn,
            "candidates": labelled,
            "review_fields": ["winner", "confidence", "rationale"],
        }

    blinded = [
        blind_pair(position, item)
        for position, item in enumerate(examples, start=1)
    ]
    return {
        "artifact_type": "blind-side-by-side-eval-set",
        "blinding_method": "candidate order randomized; source, branch, model, and annotator hidden",
        "total_pairs": len(blinded),
        "pairs": blinded,
    }
def build_human_eval_summary(examples: list[PreferenceExample]) -> dict[str, Any]:
    """Project the full dataset summary onto the human-eval report fields.

    Args:
        examples: Preference examples to summarize.

    Returns:
        A ``human-eval-summary`` dict whose values are copied from
        ``summarize_preference_dataset(examples)``.
    """
    full = summarize_preference_dataset(examples)

    # (report key, key in the full summary). Only the completed-pairwise
    # count is published under a different name.
    projection = (
        ("total_examples", "total_examples"),
        ("blind_examples", "blind_examples"),
        ("completed_pairwise_examples", "pairwise_completed_examples"),
        ("pairwise_win_rate", "pairwise_win_rate"),
        ("average_confidence", "average_confidence"),
        ("real_reviewer_examples", "real_reviewer_examples"),
        ("source_types", "source_types"),
        ("reviewer_segments", "reviewer_segments"),
        ("risk_levels", "risk_levels"),
        ("grounding_scopes", "grounding_scopes"),
        ("failure_buckets", "failure_buckets"),
        ("production_like_examples", "production_like_examples"),
        ("multi_turn_examples", "multi_turn_examples"),
    )

    report: dict[str, Any] = {"artifact_type": "human-eval-summary"}
    for report_key, summary_key in projection:
        report[report_key] = full[summary_key]
    return report
def _normalize_optional_text(value: Any) -> str | None:
    """Return ``value`` as whitespace-stripped text, or ``None`` if nothing remains.

    Any falsy input (``None``, ``""``, ``0``, ``False``, an empty container)
    and any value whose text is only whitespace yields ``None``.
    """
    if not value:
        return None
    text = str(value).strip()
    return text if text else None
def _normalize_confidence(value: Any) -> float | None:
    """Validate an optional confidence score and round it to three decimals.

    Args:
        value: Raw confidence; ``None`` and ``""`` mean "not provided".

    Returns:
        ``None`` when no confidence was provided, otherwise the value as a
        float in ``[0.0, 1.0]`` rounded to three decimals.

    Raises:
        ValueError: If the value cannot be converted to a float, or the
            result is NaN or lies outside ``0..1``.
    """
    if value in (None, ""):
        return None
    try:
        confidence = float(value)
    except (TypeError, ValueError) as exc:
        raise ValueError("Preference confidence jābūt skaitlim diapazonā 0..1.") from exc
    # Positive range check on purpose: NaN fails every comparison, so the
    # pair ``< 0.0 or > 1.0`` would let it through as a valid confidence.
    if not 0.0 <= confidence <= 1.0:
        raise ValueError("Preference confidence jābūt skaitlim diapazonā 0..1.")
    return round(confidence, 3)
def _normalize_list(value: Any) -> tuple[str, ...]:
    """Convert a list into a tuple of its non-blank, stripped item texts.

    Anything that is not a ``list`` (including ``None``, strings and tuples)
    yields an empty tuple. Each item is passed through ``str()`` and stripped;
    items that end up empty are dropped.
    """
    if not isinstance(value, list):
        return ()
    stripped = (str(item).strip() for item in value)
    return tuple(text for text in stripped if text)