File size: 2,493 Bytes
5143557
 
 
 
 
 
 
 
 
 
 
eed1cab
5143557
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eed1cab
5143557
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
"""Repairer package for turning detected issues into proposed fixes."""

from __future__ import annotations

from pathlib import Path

from dataforge.detectors.base import Issue, Schema
from dataforge.repairers.base import ProposedFix, RepairAttempt, Repairer, RetryContext
from dataforge.repairers.decimal_shift import DecimalShiftRepairer
from dataforge.repairers.fd_violation import FDViolationRepairer
from dataforge.repairers.type_mismatch import TypeMismatchRepairer
from dataforge.table import TableLike

# Public API of the package: the repairer classes and base types imported
# above, plus the two helper functions defined in this module. Entries are
# kept in sorted order.
__all__ = [
    "DecimalShiftRepairer",
    "FDViolationRepairer",
    "ProposedFix",
    "RepairAttempt",
    "Repairer",
    "RetryContext",
    "TypeMismatchRepairer",
    "build_repairers",
    "propose_fixes",
]


def build_repairers(
    *,
    cache_dir: Path | None,
    allow_llm: bool,
    model: str,
) -> dict[str, Repairer]:
    """Create the default mapping from issue-type name to repairer instance.

    Args:
        cache_dir: Passed through to the fd-violation repairer.
        allow_llm: Passed through to the fd-violation repairer.
        model: Passed through to the fd-violation repairer.

    Returns:
        A dict with one repairer under each of the keys ``"type_mismatch"``,
        ``"decimal_shift"`` and ``"fd_violation"``.
    """
    registry: dict[str, Repairer] = {}

    # These two repairers take no configuration.
    registry["type_mismatch"] = TypeMismatchRepairer()
    registry["decimal_shift"] = DecimalShiftRepairer()

    # Only the fd-violation repairer receives the cache / LLM settings.
    registry["fd_violation"] = FDViolationRepairer(
        cache_dir=cache_dir,
        allow_llm=allow_llm,
        model=model,
    )
    return registry


def propose_fixes(
    issues: list[Issue],
    df: TableLike,
    schema: Schema | None,
    *,
    cache_dir: Path | None,
    allow_llm: bool = False,
    model: str = "gemini-2.0-flash",
) -> list[ProposedFix]:
    """Ask the default repairers for a fix for each issue, one fix per cell.

    Every issue is dispatched to the repairer registered under its
    ``issue_type``. Issues with no registered repairer, and issues for which
    the repairer returns ``None``, contribute nothing.

    Args:
        issues: Detected issues, processed in the given order.
        df: The table the issues refer to; handed to each repairer.
        schema: Optional declared schema; handed to each repairer.
        cache_dir: Forwarded to ``build_repairers`` (used by the
            fd-violation repairer).
        allow_llm: Forwarded to ``build_repairers`` (used by the
            fd-violation repairer).
        model: Forwarded to ``build_repairers`` (used by the
            fd-violation repairer).

    Returns:
        Proposed fixes in the order they were first produced, with at most
        one fix per ``(row, column)`` cell; the earliest fix for a cell wins.
    """
    repairers_by_type = build_repairers(
        cache_dir=cache_dir,
        allow_llm=allow_llm,
        model=model,
    )

    # Insertion-ordered dict: the keys deduplicate cells while the values
    # keep the fixes in first-seen order.
    first_fix_per_cell: dict[tuple[int, str], ProposedFix] = {}

    for detected in issues:
        handler = repairers_by_type.get(detected.issue_type)
        if handler is not None:
            candidate = handler.propose(detected, df, schema, retry_context=None)
            if candidate is not None:
                cell = (candidate.fix.row, candidate.fix.column)
                # setdefault leaves an existing entry alone, so a later fix
                # for an already-covered cell is dropped.
                first_fix_per_cell.setdefault(cell, candidate)

    return list(first_fix_per_cell.values())