"""Repairer package for turning detected issues into proposed fixes."""
from __future__ import annotations
from pathlib import Path
from dataforge.detectors.base import Issue, Schema
from dataforge.repairers.base import ProposedFix, RepairAttempt, Repairer, RetryContext
from dataforge.repairers.decimal_shift import DecimalShiftRepairer
from dataforge.repairers.fd_violation import FDViolationRepairer
from dataforge.repairers.type_mismatch import TypeMismatchRepairer
from dataforge.table import TableLike
__all__ = [
"DecimalShiftRepairer",
"FDViolationRepairer",
"ProposedFix",
"RepairAttempt",
"Repairer",
"RetryContext",
"TypeMismatchRepairer",
"build_repairers",
"propose_fixes",
]
def build_repairers(
    *,
    cache_dir: Path | None,
    allow_llm: bool,
    model: str,
) -> dict[str, Repairer]:
    """Assemble the default mapping of issue-type names to repairer instances.

    Args:
        cache_dir: Passed through to ``FDViolationRepairer``; may be ``None``.
        allow_llm: Passed through to ``FDViolationRepairer``.
        model: Passed through to ``FDViolationRepairer``.

    Returns:
        A new dict with the keys ``"type_mismatch"``, ``"decimal_shift"`` and
        ``"fd_violation"``, each bound to a freshly constructed repairer.
    """
    # Only the fd-violation repairer takes configuration; the other two are
    # constructed with no arguments.
    registry = {}
    registry["type_mismatch"] = TypeMismatchRepairer()
    registry["decimal_shift"] = DecimalShiftRepairer()
    registry["fd_violation"] = FDViolationRepairer(
        cache_dir=cache_dir,
        allow_llm=allow_llm,
        model=model,
    )
    return registry
def propose_fixes(
    issues: list[Issue],
    df: TableLike,
    schema: Schema | None,
    *,
    cache_dir: Path | None,
    allow_llm: bool = False,
    model: str = "gemini-2.0-flash",
) -> list[ProposedFix]:
    """Ask the default repairers for a fix to each issue, one fix per cell.

    A new registry is built via ``build_repairers`` on every call. Each issue
    is dispatched to the repairer registered under ``issue.issue_type``;
    issues with no registered repairer, and issues for which the repairer
    returns ``None``, contribute nothing to the result.

    Args:
        issues: Detected issues, processed in the order given.
        df: The table being repaired; handed to each repairer unchanged.
        schema: Optional declared schema; handed to each repairer unchanged.
        cache_dir: Forwarded to ``build_repairers``.
        allow_llm: Forwarded to ``build_repairers``.
        model: Forwarded to ``build_repairers``.

    Returns:
        Proposed fixes in issue order, keeping only the first fix produced
        for any given ``(row, column)`` pair.
    """
    repairers = build_repairers(
        cache_dir=cache_dir,
        allow_llm=allow_llm,
        model=model,
    )
    # Cells that already have an accepted fix; the earliest proposal wins.
    claimed = set()
    accepted = []
    for issue in issues:
        handler = repairers.get(issue.issue_type)
        if handler is not None:
            # This entry point never supplies retry context.
            candidate = handler.propose(issue, df, schema, retry_context=None)
            if candidate is not None:
                cell = (candidate.fix.row, candidate.fix.column)
                if cell not in claimed:
                    claimed.add(cell)
                    accepted.append(candidate)
    return accepted
|