| """DataForge detector package — pure data-quality issue detection. |
| |
| This package provides the detector infrastructure and three Week 1 detectors: |
| |
| - :class:`TypeMismatchDetector` — numeric/string/date type conflicts. |
| - :class:`DecimalShiftDetector` — power-of-10 outliers in numeric columns. |
| - :class:`FDViolationDetector` — rows violating declared functional dependencies. |
| |
| Use :func:`run_all_detectors` to run all detectors and get a merged, |
| deduplicated, severity-sorted issue list. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import pandas as pd |
|
|
| from dataforge.detectors.base import Detector, Issue, Schema, Severity |
| from dataforge.detectors.decimal_shift import DecimalShiftDetector |
| from dataforge.detectors.fd_violation import FDViolationDetector |
| from dataforge.detectors.type_mismatch import TypeMismatchDetector |
|
|
# Names exported by ``from dataforge.detectors import *``.
__all__ = [
    "DecimalShiftDetector",
    "FDViolationDetector",
    "Issue",
    "Schema",
    "Severity",
    "TypeMismatchDetector",
    "run_all_detectors",
]


# Sort rank per severity level: a lower rank sorts earlier, so UNSAFE issues
# come first, then REVIEW, then SAFE. The rank is the level's position in the
# tuple below.
_SEVERITY_ORDER = {
    level: rank
    for rank, level in enumerate((Severity.UNSAFE, Severity.REVIEW, Severity.SAFE))
}
|
|
|
|
def run_all_detectors(df: pd.DataFrame, schema: Schema | None = None) -> list[Issue]:
    """Run every built-in detector over ``df`` and return one ordered issue list.

    The type-mismatch, decimal-shift and FD-violation detectors are run in
    that order, each receiving the same ``df`` and ``schema``. Their results
    are concatenated; when several issues share the same
    ``(row, column, issue_type)`` only the first one encountered is kept.
    The remaining issues are ordered by severity (UNSAFE, then REVIEW, then
    SAFE) and, within one severity, by confidence from highest to lowest.
    Issues that tie on both keep their detection order (the sort is stable).

    Args:
        df: The input DataFrame to analyze.
        schema: Optional declared schema with column types and constraints;
            passed unchanged to every detector.

    Returns:
        A new list of deduplicated Issue objects, sorted by severity and
        then by descending confidence.

    Raises:
        KeyError: If an issue carries a severity that has no entry in
            ``_SEVERITY_ORDER``.

    Example:
        >>> import pandas as pd
        >>> from dataforge.detectors import run_all_detectors
        >>> df = pd.DataFrame({"age": ["25", "30", "N/A", "40"]})
        >>> issues = run_all_detectors(df)
        >>> len(issues)
        1
    """
    registered: tuple[Detector, ...] = (
        TypeMismatchDetector(),
        DecimalShiftDetector(),
        FDViolationDetector(),
    )

    # Flatten the per-detector results, preserving detector order.
    collected: list[Issue] = [
        found for detector in registered for found in detector.detect(df, schema)
    ]

    # dicts keep insertion order and setdefault never overwrites, so this
    # retains the first issue seen for each (row, column, issue_type) key.
    first_seen: dict[tuple[int, str, str], Issue] = {}
    for found in collected:
        first_seen.setdefault((found.row, found.column, found.issue_type), found)

    def sort_rank(issue: Issue):
        # Negated confidence turns the ascending sort into "highest first".
        return _SEVERITY_ORDER[issue.severity], -issue.confidence

    return sorted(first_seen.values(), key=sort_rank)
|
|