| """Repairer package for turning detected issues into proposed fixes.""" |
|
|
| from __future__ import annotations |
|
|
| from pathlib import Path |
|
|
| from dataforge.detectors.base import Issue, Schema |
| from dataforge.repairers.base import ProposedFix, RepairAttempt, Repairer, RetryContext |
| from dataforge.repairers.decimal_shift import DecimalShiftRepairer |
| from dataforge.repairers.fd_violation import FDViolationRepairer |
| from dataforge.repairers.type_mismatch import TypeMismatchRepairer |
| from dataforge.table import TableLike |
|
|
| __all__ = [ |
| "DecimalShiftRepairer", |
| "FDViolationRepairer", |
| "ProposedFix", |
| "RepairAttempt", |
| "Repairer", |
| "RetryContext", |
| "TypeMismatchRepairer", |
| "build_repairers", |
| "propose_fixes", |
| ] |
|
|
|
|
def build_repairers(
    *,
    cache_dir: Path | None,
    allow_llm: bool,
    model: str,
) -> dict[str, Repairer]:
    """Build the default mapping from issue-type name to repairer instance.

    Args:
        cache_dir: Forwarded to ``FDViolationRepairer`` as ``cache_dir``.
        allow_llm: Forwarded to ``FDViolationRepairer`` as ``allow_llm``.
        model: Forwarded to ``FDViolationRepairer`` as ``model``.

    Returns:
        A new dict holding one repairer under each of the keys
        ``"type_mismatch"``, ``"decimal_shift"`` and ``"fd_violation"``.
    """
    registry: dict[str, Repairer] = {}
    registry["type_mismatch"] = TypeMismatchRepairer()
    registry["decimal_shift"] = DecimalShiftRepairer()
    # Only the fd-violation repairer receives the caller's configuration.
    registry["fd_violation"] = FDViolationRepairer(
        cache_dir=cache_dir,
        allow_llm=allow_llm,
        model=model,
    )
    return registry
|
|
|
|
def propose_fixes(
    issues: list[Issue],
    df: TableLike,
    schema: Schema | None,
    *,
    cache_dir: Path | None,
    allow_llm: bool = False,
    model: str = "gemini-2.0-flash",
) -> list[ProposedFix]:
    """Ask the matching repairer for a fix to each issue and collect the results.

    Each issue is dispatched to the repairer registered under its
    ``issue_type``. Issues with no registered repairer, and issues for which
    the repairer returns ``None``, are skipped. At most one fix is kept per
    ``(row, column)`` cell: the first one produced, in the order of ``issues``.

    Args:
        issues: Detected issues to repair.
        df: The table the issues were detected in; passed to each repairer.
        schema: Optional declared schema; passed to each repairer.
        cache_dir: Passed to ``build_repairers`` when creating the registry.
        allow_llm: Passed to ``build_repairers`` when creating the registry.
        model: Passed to ``build_repairers`` when creating the registry.

    Returns:
        The proposed fixes, one per distinct cell, in first-seen order.
    """
    repairers = build_repairers(
        cache_dir=cache_dir,
        allow_llm=allow_llm,
        model=model,
    )

    # Dicts keep insertion order, so this doubles as the "seen" set and the
    # ordered result; setdefault keeps the first fix proposed for a cell.
    first_fix_by_cell: dict[tuple[int, str], ProposedFix] = {}

    for issue in issues:
        handler = repairers.get(issue.issue_type)
        if handler is not None:
            candidate = handler.propose(issue, df, schema, retry_context=None)
            if candidate is not None:
                cell = (candidate.fix.row, candidate.fix.column)
                first_fix_by_cell.setdefault(cell, candidate)

    return list(first_fix_by_cell.values())
|
|