Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import json | |
| from dataclasses import dataclass | |
| from pathlib import Path | |
@dataclass
class TrainingComparison:
    """Confusion counts from comparing two sets of findings.

    Declared as a dataclass so that instances can be built from the three
    counts by keyword (or position); without the generated ``__init__`` the
    annotated fields alone do not accept constructor arguments.
    """

    true_positives: int
    false_positives: int
    false_negatives: int

    def precision(self) -> float:
        """Return ``tp / (tp + fp)``, or ``0.0`` when that denominator is zero."""
        denom = self.true_positives + self.false_positives
        return self.true_positives / denom if denom else 0.0

    def recall(self) -> float:
        """Return ``tp / (tp + fn)``, or ``0.0`` when that denominator is zero."""
        denom = self.true_positives + self.false_negatives
        return self.true_positives / denom if denom else 0.0
class TrainingRunManager:
    """Turn deterministic findings and agent outputs into metrics and training records."""

    def compare(self, deterministic_findings: set[str], agent_findings: set[str]) -> TrainingComparison:
        """Count how the agent's findings overlap with the deterministic ones.

        Findings present in both sets are true positives, findings only the
        agent reported are false positives, and findings only the
        deterministic set contains are false negatives.
        """
        matched = deterministic_findings & agent_findings
        agent_only = agent_findings - deterministic_findings
        missed = deterministic_findings - agent_findings
        return TrainingComparison(
            true_positives=len(matched),
            false_positives=len(agent_only),
            false_negatives=len(missed),
        )

    def build_preference_record(
        self,
        *,
        prompt: str,
        agent_output: str,
        deterministic_targets: list[str],
        reward: float,
    ) -> dict[str, object]:
        """Bundle the four keyword arguments into one dict keyed by their names.

        The ``deterministic_targets`` list is stored as passed, not copied.
        """
        record: dict[str, object] = {
            "prompt": prompt,
            "agent_output": agent_output,
            "deterministic_targets": deterministic_targets,
            "reward": reward,
        }
        return record

    def save_records(self, path: Path, records: list[dict[str, object]]) -> None:
        """Write ``records`` to ``path``, one key-sorted JSON object per line.

        Missing parent directories are created first; the file is opened in
        write mode as UTF-8, so existing content is replaced.
        """
        path.parent.mkdir(parents=True, exist_ok=True)
        with path.open("w", encoding="utf-8") as handle:
            # The generator is consumed lazily, so records are serialized and
            # written one at a time.
            handle.writelines(
                json.dumps(entry, sort_keys=True) + "\n" for entry in records
            )

    def assert_non_regression(
        self,
        *,
        baseline_precision: float,
        baseline_recall: float,
        current_precision: float,
        current_recall: float,
        tolerance: float = 0.01,
    ) -> None:
        """Raise if a current metric is more than ``tolerance`` below its baseline.

        Precision is checked before recall, so when both have regressed the
        error reports precision.

        Raises:
            ValueError: if ``current_precision < baseline_precision - tolerance``
                or ``current_recall < baseline_recall - tolerance``.
        """
        precision_floor = baseline_precision - tolerance
        recall_floor = baseline_recall - tolerance

        if precision_floor > current_precision:
            message = f"Precision regression detected: baseline={baseline_precision:.4f}, current={current_precision:.4f}"
            raise ValueError(message)

        if recall_floor > current_recall:
            message = f"Recall regression detected: baseline={baseline_recall:.4f}, current={current_recall:.4f}"
            raise ValueError(message)