Praneshrajan15's picture
feat: initial playground deployment
5143557 verified
"""DataForge detector package β€” pure data-quality issue detection.
This package provides the detector infrastructure and three Week 1 detectors:
- :class:`TypeMismatchDetector` — numeric/string/date type conflicts.
- :class:`DecimalShiftDetector` — power-of-10 outliers in numeric columns.
- :class:`FDViolationDetector` — rows violating declared functional dependencies.
Use :func:`run_all_detectors` to run all detectors and get a merged,
deduplicated, severity-sorted issue list.
"""
from __future__ import annotations
import pandas as pd
from dataforge.detectors.base import Detector, Issue, Schema, Severity
from dataforge.detectors.decimal_shift import DecimalShiftDetector
from dataforge.detectors.fd_violation import FDViolationDetector
from dataforge.detectors.type_mismatch import TypeMismatchDetector
# Public API of this package: the names bound by a star-import of this module.
# Entries are kept in alphabetical order.
# NOTE(review): `Detector` is imported by this module but is not listed here —
# confirm whether leaving the base class out of the public API is intentional.
__all__ = [
"DecimalShiftDetector",
"FDViolationDetector",
"Issue",
"Schema",
"Severity",
"TypeMismatchDetector",
"run_all_detectors",
]
# Sort rank per severity level. Lower ranks sort first, so the tuple below is
# written in output order: UNSAFE, then REVIEW, then SAFE.
_SEVERITY_ORDER = {
    level: rank
    for rank, level in enumerate((Severity.UNSAFE, Severity.REVIEW, Severity.SAFE))
}
def run_all_detectors(df: pd.DataFrame, schema: Schema | None = None) -> list[Issue]:
    """Run every built-in detector and return one merged, ordered issue list.

    The detectors run in a fixed order: type mismatch, decimal shift, then
    functional-dependency violation. Their results are concatenated, and
    issues sharing the same ``(row, column, issue_type)`` are collapsed to
    the first one reported. The survivors are ordered by severity rank
    (UNSAFE, REVIEW, SAFE) and, within a severity, by descending confidence.
    The sort is stable, so exact ties keep their detection order.

    Args:
        df: The input DataFrame to analyze.
        schema: Optional declared schema, passed unchanged to each detector.

    Returns:
        A new list of de-duplicated Issue objects, most severe first and,
        within a severity, most confident first.

    Example:
        >>> import pandas as pd
        >>> from dataforge.detectors import run_all_detectors
        >>> df = pd.DataFrame({"age": ["25", "30", "N/A", "40"]})
        >>> issues = run_all_detectors(df)
        >>> len(issues)
        1
    """
    pipeline: list[Detector] = [
        TypeMismatchDetector(),
        DecimalShiftDetector(),
        FDViolationDetector(),
    ]

    # Gather everything first; each detector's output is fully consumed
    # before the next detector is invoked.
    collected: list[Issue] = [
        found for detector in pipeline for found in detector.detect(df, schema)
    ]

    # dict preserves insertion order and setdefault never overwrites, so the
    # first issue seen for a given (row, column, issue_type) is the one kept.
    first_by_key: dict[tuple[int, str, str], Issue] = {}
    for found in collected:
        first_by_key.setdefault((found.row, found.column, found.issue_type), found)

    def _rank(issue: Issue) -> tuple[int, float]:
        # Negated confidence makes an ascending sort yield highest-first.
        return (_SEVERITY_ORDER[issue.severity], -issue.confidence)

    return sorted(first_by_key.values(), key=_rank)