| """Preference-feedback and blind human-eval helpers for Maris training artifacts.""" |
|
|
| from __future__ import annotations |
|
|
| import json |
| import random |
| from dataclasses import dataclass |
| from pathlib import Path |
| from typing import Any |
|
|
|
|
| @dataclass(frozen=True, slots=True) |
| class PreferenceExample: |
| prompt: str |
| chosen: str |
| rejected: str |
| source: str |
| annotator: str | None = None |
| reviewer_segment: str | None = None |
| edit_target: str | None = None |
| context: str | None = None |
| branch: str | None = None |
| task_type: str | None = None |
| language: str | None = None |
| source_type: str | None = None |
| risk_level: str | None = None |
| grounding_scope: str | None = None |
| failure_bucket: str | None = None |
| preference_outcome: str | None = None |
| confidence: float | None = None |
| pair_id: str | None = None |
| blind: bool = False |
| production_like: bool = False |
| multi_turn: bool = False |
| repo_context: tuple[str, ...] = () |
| execution_required: bool = False |
| tags: tuple[str, ...] = () |
|
|
|
|
def _required_text(entry: dict[str, Any], key: str) -> str:
    """Return ``entry[key]`` as stripped text, or ``""`` when missing or null."""
    value = entry.get(key)
    # A JSON null must count as missing: ``str(None)`` is the non-empty text
    # "None", which would otherwise slip past the required-field check.
    return "" if value is None else str(value).strip()


def load_preference_dataset(path: str | Path) -> list[PreferenceExample]:
    """Load preference examples from a UTF-8 encoded JSON file.

    The document must be either a list of entry objects or an object whose
    ``preferences`` key holds that list.

    Args:
        path: Location of the JSON file.

    Returns:
        One ``PreferenceExample`` per entry, in file order. ``source_type``
        falls back to the entry's ``source`` and ``preference_outcome`` falls
        back to ``"chosen"`` when those fields are absent.

    Raises:
        ValueError: If the top-level shape is wrong, an entry is not a JSON
            object, any of ``prompt``/``chosen``/``rejected``/``source`` is
            missing, null or blank, or ``confidence`` is rejected by
            ``_normalize_confidence``. Malformed JSON also surfaces as a
            ``ValueError`` (``json.JSONDecodeError``).
    """
    raw = json.loads(Path(path).read_text(encoding="utf-8"))
    entries = raw.get("preferences", raw) if isinstance(raw, dict) else raw
    if not isinstance(entries, list):
        raise ValueError("Preference datasetam jābūt sarakstam vai objektam ar `preferences`.")

    examples: list[PreferenceExample] = []
    for entry in entries:
        if not isinstance(entry, dict):
            raise ValueError("Katram preference ierakstam jābūt JSON objektam.")

        prompt = _required_text(entry, "prompt")
        chosen = _required_text(entry, "chosen")
        rejected = _required_text(entry, "rejected")
        source = _required_text(entry, "source")
        if not (prompt and chosen and rejected and source):
            raise ValueError(
                "Preference ierakstam obligāti vajag `prompt`, `chosen`, `rejected` un `source`."
            )

        # NOTE: flag fields are coerced with ``bool()``, so any truthy JSON
        # value (including a non-empty string such as "false") enables them.
        examples.append(
            PreferenceExample(
                prompt=prompt,
                chosen=chosen,
                rejected=rejected,
                source=source,
                annotator=_normalize_optional_text(entry.get("annotator")),
                reviewer_segment=_normalize_optional_text(entry.get("reviewer_segment")),
                edit_target=_normalize_optional_text(entry.get("edit_target")),
                context=_normalize_optional_text(entry.get("context")),
                branch=_normalize_optional_text(entry.get("branch")),
                task_type=_normalize_optional_text(entry.get("task_type")),
                language=_normalize_optional_text(entry.get("language")),
                source_type=_normalize_optional_text(entry.get("source_type")) or source,
                risk_level=_normalize_optional_text(entry.get("risk_level")),
                grounding_scope=_normalize_optional_text(entry.get("grounding_scope")),
                failure_bucket=_normalize_optional_text(entry.get("failure_bucket")),
                preference_outcome=_normalize_optional_text(entry.get("preference_outcome"))
                or "chosen",
                confidence=_normalize_confidence(entry.get("confidence")),
                pair_id=_normalize_optional_text(entry.get("pair_id")),
                blind=bool(entry.get("blind", False)),
                production_like=bool(entry.get("production_like", False)),
                multi_turn=bool(entry.get("multi_turn", False)),
                repo_context=_normalize_list(entry.get("repo_context")),
                execution_required=bool(entry.get("execution_required", False)),
                tags=_normalize_list(entry.get("tags")),
            )
        )
    return examples
|
|
|
|
def summarize_preference_dataset(examples: list[PreferenceExample]) -> dict[str, Any]:
    """Aggregate per-category counts and headline rates for a preference dataset.

    Args:
        examples: The preference examples to summarize.

    Returns:
        A ``preference-dataset-summary`` dict containing key-sorted frequency
        tables per metadata field, counts of flagged examples, the pairwise
        win rate (share of ``"chosen"`` among outcomes in
        ``chosen``/``rejected``/``tie``) and the mean confidence. Both ratios
        are rounded to three decimals and are ``0.0`` when there is nothing
        to average.
    """

    def frequency(attribute: str, *, keep_empty: bool = False) -> dict[str, int]:
        # Frequency table for one attribute, returned sorted by key.
        table: dict[str, int] = {}
        for item in examples:
            key = getattr(item, attribute)
            if key or keep_empty:
                table[key] = table.get(key, 0) + 1
        return dict(sorted(table.items()))

    def how_many(attribute: str) -> int:
        # Number of examples whose attribute is truthy.
        return sum(1 for item in examples if getattr(item, attribute))

    tag_table: dict[str, int] = {}
    for item in examples:
        for tag in item.tags:
            tag_table[tag] = tag_table.get(tag, 0) + 1

    finished_outcomes = [
        item.preference_outcome
        for item in examples
        if item.preference_outcome in {"chosen", "rejected", "tie"}
    ]
    finished_total = len(finished_outcomes)
    chosen_total = finished_outcomes.count("chosen")

    scores = [item.confidence for item in examples if item.confidence is not None]

    if finished_total:
        win_rate = round(chosen_total / finished_total, 3)
    else:
        win_rate = 0.0
    if scores:
        mean_confidence = round(sum(scores) / len(scores), 3)
    else:
        mean_confidence = 0.0

    return {
        "artifact_type": "preference-dataset-summary",
        "total_examples": len(examples),
        # ``source`` is tallied unconditionally; the other tables skip empty values.
        "sources": frequency("source", keep_empty=True),
        "source_types": frequency("source_type"),
        "branches": frequency("branch"),
        "task_types": frequency("task_type"),
        "languages": frequency("language"),
        "reviewer_segments": frequency("reviewer_segment"),
        "risk_levels": frequency("risk_level"),
        "grounding_scopes": frequency("grounding_scope"),
        "failure_buckets": frequency("failure_bucket"),
        "edited_examples": how_many("edit_target"),
        "execution_required_examples": how_many("execution_required"),
        "blind_examples": how_many("blind"),
        "production_like_examples": how_many("production_like"),
        "multi_turn_examples": how_many("multi_turn"),
        "real_reviewer_examples": sum(
            1 for item in examples if item.source_type == "real_reviewer"
        ),
        "pairwise_completed_examples": finished_total,
        "pairwise_win_rate": win_rate,
        "average_confidence": mean_confidence,
        "tags": dict(sorted(tag_table.items())),
    }
|
|
|
|
def build_blind_side_by_side_artifact(
    examples: list[PreferenceExample],
    *,
    seed: int = 0,
) -> dict[str, Any]:
    """Build a blind A/B evaluation set from preference examples.

    For each example the chosen and rejected responses are placed in a
    seeded-random order and labelled ``A`` and ``B`` by position, so the slot
    label does not reveal which response was preferred.

    Args:
        examples: The preference examples to convert into review pairs.
        seed: Seed for the private ``random.Random`` instance; the same seed
            and input produce the same ordering.

    Returns:
        A ``blind-side-by-side-eval-set`` dict with the blinding description,
        the pair count and the list of pairs.
    """
    shuffler = random.Random(seed)

    def blind_pair(position: int, item: PreferenceExample) -> dict[str, Any]:
        # Shuffle first, label afterwards: labels always read A then B.
        responses = [item.chosen, item.rejected]
        shuffler.shuffle(responses)
        candidates = [
            {"slot": label, "response": text}
            for label, text in zip("AB", responses)
        ]
        return {
            "pair_id": item.pair_id or f"pair-{position:04d}",
            "prompt": item.prompt,
            "context": item.context or "",
            "reviewer_segment": item.reviewer_segment or "general",
            "task_type": item.task_type or "general",
            "risk_level": item.risk_level or "standard",
            "grounding_scope": item.grounding_scope or "unspecified",
            "failure_bucket": item.failure_bucket or "general",
            "production_like": item.production_like,
            "multi_turn": item.multi_turn,
            "candidates": candidates,
            "review_fields": ["winner", "confidence", "rationale"],
        }

    pairs = [blind_pair(position, item) for position, item in enumerate(examples, start=1)]
    return {
        "artifact_type": "blind-side-by-side-eval-set",
        "blinding_method": "candidate order randomized; source, branch, model, and annotator hidden",
        "total_pairs": len(pairs),
        "pairs": pairs,
    }
|
|
|
|
def build_human_eval_summary(examples: list[PreferenceExample]) -> dict[str, Any]:
    """Return the human-eval subset of the full preference dataset summary.

    Delegates all counting to ``summarize_preference_dataset`` and copies a
    fixed selection of its fields into a ``human-eval-summary`` artifact.

    Args:
        examples: The preference examples to summarize.

    Returns:
        A dict with ``artifact_type`` set to ``"human-eval-summary"`` followed
        by the selected summary fields.
    """
    full = summarize_preference_dataset(examples)

    # (key in this artifact, key in the full summary). Only the completed
    # pairwise count is renamed on the way through.
    projection = (
        ("total_examples", "total_examples"),
        ("blind_examples", "blind_examples"),
        ("completed_pairwise_examples", "pairwise_completed_examples"),
        ("pairwise_win_rate", "pairwise_win_rate"),
        ("average_confidence", "average_confidence"),
        ("real_reviewer_examples", "real_reviewer_examples"),
        ("source_types", "source_types"),
        ("reviewer_segments", "reviewer_segments"),
        ("risk_levels", "risk_levels"),
        ("grounding_scopes", "grounding_scopes"),
        ("failure_buckets", "failure_buckets"),
        ("production_like_examples", "production_like_examples"),
        ("multi_turn_examples", "multi_turn_examples"),
    )

    report: dict[str, Any] = {"artifact_type": "human-eval-summary"}
    for report_key, summary_key in projection:
        report[report_key] = full[summary_key]
    return report
|
|
|
|
def _normalize_optional_text(value: Any) -> str | None:
    """Coerce an optional field to stripped text, or ``None`` when absent.

    ``None``, ``False``, empty containers and blank strings all count as
    absent. Numbers are always kept, including zero.

    Args:
        value: Raw field value, typically taken from a parsed JSON entry.

    Returns:
        The stripped string form of ``value``, or ``None`` if nothing remains.
    """
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        # A numeric zero (e.g. ``"pair_id": 0``) is a real value; the
        # falsy test below would otherwise discard it as missing.
        return str(value)
    text = str(value or "").strip()
    return text or None
|
|
|
|
def _normalize_confidence(value: Any) -> float | None:
    """Validate an optional confidence score and round it to three decimals.

    Args:
        value: Raw confidence; ``None`` and ``""`` mean "not provided".

    Returns:
        ``None`` when no confidence was provided, otherwise the value as a
        float in the closed range 0..1, rounded to three decimals.

    Raises:
        ValueError: If the value cannot be converted to a float or is not
            within 0..1 (NaN and infinities included).
    """
    if value in (None, ""):
        return None

    message = "Preference confidence jābūt skaitlim diapazonā 0..1."
    try:
        confidence = float(value)
    except (TypeError, ValueError, OverflowError) as exc:
        # OverflowError: an integer too large for a float is just another
        # out-of-range input, so report it the same way.
        raise ValueError(message) from exc

    # Phrased as a positive range test so NaN is rejected: NaN compares false
    # against everything, so separate "< 0" and "> 1" checks both let it pass.
    if not 0.0 <= confidence <= 1.0:
        raise ValueError(message)
    return round(confidence, 3)
|
|
|
|
def _normalize_list(value: Any) -> tuple[str, ...]:
    """Convert a list field into a tuple of non-blank, stripped strings.

    Args:
        value: Raw field value; anything that is not a ``list`` is treated
            as "no items".

    Returns:
        The items' stripped string forms in their original order, with null
        and blank items removed. An empty tuple for non-list input.
    """
    if not isinstance(value, list):
        return ()
    # Null items are skipped explicitly: ``str(None)`` is the truthy text
    # "None", which would otherwise be kept as a bogus item.
    stripped = (str(item).strip() for item in value if item is not None)
    return tuple(text for text in stripped if text)
|
|