File size: 2,698 Bytes
"""DataForge detector package — pure data-quality issue detection.

This package provides the detector infrastructure and three Week 1 detectors:

- :class:`TypeMismatchDetector` — numeric/string/date type conflicts.
- :class:`DecimalShiftDetector` — power-of-10 outliers in numeric columns.
- :class:`FDViolationDetector` — rows violating declared functional dependencies.

Use :func:`run_all_detectors` to run all detectors and get a merged,
deduplicated, severity-sorted issue list.
"""
from __future__ import annotations
from dataforge.detectors.base import Detector, Issue, Schema, Severity
from dataforge.detectors.decimal_shift import DecimalShiftDetector
from dataforge.detectors.fd_violation import FDViolationDetector
from dataforge.detectors.type_mismatch import TypeMismatchDetector
from dataforge.table import TableLike
# Names exported by ``from dataforge.detectors import *``: the three concrete
# detector classes, the ``Issue``/``Schema``/``Severity`` types imported from
# ``dataforge.detectors.base``, and the ``run_all_detectors`` entry point.
# NOTE(review): ``Detector`` is imported by this module but is not listed
# here — confirm that the omission is intentional.
__all__ = [
"DecimalShiftDetector",
"FDViolationDetector",
"Issue",
"Schema",
"Severity",
"TypeMismatchDetector",
"run_all_detectors",
]
# Rank used as the primary sort key for issues: a lower rank sorts earlier,
# so UNSAFE comes first, then REVIEW, then SAFE.
_SEVERITY_ORDER = {
    severity: rank
    for rank, severity in enumerate(
        (Severity.UNSAFE, Severity.REVIEW, Severity.SAFE)
    )
}
def run_all_detectors(df: TableLike, schema: Schema | None = None) -> list[Issue]:
    """Run every built-in detector over ``df`` and merge the results.

    The detectors run in a fixed order (type mismatch, decimal shift,
    functional-dependency violation). Their issues are concatenated, then
    deduplicated on ``(row, column, issue_type)`` keeping the first
    occurrence, and finally ordered by severity (UNSAFE, REVIEW, SAFE) and,
    within one severity, by descending confidence.

    Args:
        df: The input table to analyze.
        schema: Optional declared schema with column types and constraints;
            passed through unchanged to each detector.

    Returns:
        A new list of Issue objects from all detectors, sorted by severity
        then confidence descending.

    Example:
        >>> import pandas as pd
        >>> from dataforge.detectors import run_all_detectors
        >>> df = pd.DataFrame({"age": ["25", "30", "N/A", "40"]})
        >>> issues = run_all_detectors(df)
        >>> len(issues)
        1
    """
    pipeline: tuple[Detector, ...] = (
        TypeMismatchDetector(),
        DecimalShiftDetector(),
        FDViolationDetector(),
    )

    # Gather everything up front so de-duplication sees the issues in
    # detector order.
    collected: list[Issue] = [
        found for detector in pipeline for found in detector.detect(df, schema)
    ]

    # A dict keeps insertion order, and ``setdefault`` never overwrites an
    # existing entry, so the first issue seen for each key is the one kept.
    first_by_key: dict[tuple[int, str, str], Issue] = {}
    for found in collected:
        first_by_key.setdefault((found.row, found.column, found.issue_type), found)

    def _rank(issue: Issue) -> tuple[int, float]:
        # Negated confidence turns the ascending sort into "highest first".
        return (_SEVERITY_ORDER[issue.severity], -issue.confidence)

    # ``sorted`` is stable, so ties keep their detection order.
    return sorted(first_by_key.values(), key=_rank)
|