Praneshrajan15's picture
feat: initial playground deployment
5143557 verified
"""Core models and protocol for the DataForge detector subsystem."""
from __future__ import annotations
import enum
from typing import Literal, Protocol
import pandas as pd
from pydantic import BaseModel, Field
from dataforge.verifier.schema import (
AggregateDependency,
DomainBound,
FunctionalDependency,
Schema,
)
# Public API of this module. Besides the names defined below (Detector, Issue,
# IssueTypeLiteral, Severity), the schema types imported from
# ``dataforge.verifier.schema`` are listed here so that a star-import of this
# module re-exports them as well.
__all__ = [
    "AggregateDependency",
    "Detector",
    "DomainBound",
    "FunctionalDependency",
    "Issue",
    "IssueTypeLiteral",
    "Schema",
    "Severity",
]
class Severity(enum.Enum):
    """Three-tier severity for data-quality issues.

    Ordering: SAFE < REVIEW < UNSAFE (higher = more severe).

    - SAFE: likely benign; can be auto-applied in bulk without human review.
    - REVIEW: ambiguous; should appear in the profile table for human triage.
    - UNSAFE: structural error; blocks automated repair without explicit
      approval.

    Members support ``<``, ``<=``, ``>`` and ``>=`` against other ``Severity``
    members. Comparing against any other type returns ``NotImplemented`` from
    the method, which makes Python raise ``TypeError`` for the expression.
    Equality and hashing are the ``enum.Enum`` defaults.

    See DECISIONS.md entry "Issue severity tiers" for the rationale behind
    choosing exactly 3 levels.

    Example:
        >>> Severity.SAFE < Severity.UNSAFE
        True
        >>> max(Severity.REVIEW, Severity.SAFE)
        <Severity.REVIEW: 'review'>
    """

    SAFE = "safe"
    REVIEW = "review"
    UNSAFE = "unsafe"

    @property
    def _rank(self) -> int:
        """Return this member's position in the severity order (0 = SAFE)."""
        # Single source of truth for the ordering. A property is a descriptor,
        # so Enum does not turn it into a member, and the tuple literal is a
        # constant, so nothing is rebuilt on each comparison.
        return ("safe", "review", "unsafe").index(self.value)

    def __lt__(self, other: object) -> bool:
        """Enable ordering so SAFE < REVIEW < UNSAFE."""
        if not isinstance(other, Severity):
            return NotImplemented
        return self._rank < other._rank

    def __le__(self, other: object) -> bool:
        """Enable ordering so SAFE <= REVIEW <= UNSAFE."""
        if not isinstance(other, Severity):
            return NotImplemented
        return self._rank <= other._rank

    def __gt__(self, other: object) -> bool:
        """Enable ordering so UNSAFE > REVIEW > SAFE."""
        if not isinstance(other, Severity):
            return NotImplemented
        return self._rank > other._rank

    def __ge__(self, other: object) -> bool:
        """Enable ordering so UNSAFE >= REVIEW >= SAFE."""
        if not isinstance(other, Severity):
            return NotImplemented
        return self._rank >= other._rank
# The full set of issue categories a detector may report. This vocabulary is
# closed: add a new member here whenever a new detector introduces a category.
IssueTypeLiteral = Literal[
    "type_mismatch",
    "decimal_shift",
    "fd_violation",
]
class Issue(BaseModel):
    """One data-quality finding, pinned to a single (row, column) cell.

    Instances are immutable: the model is configured as frozen.

    Args:
        row: Zero-indexed row number in the DataFrame (must be >= 0).
        column: Name of the column in which the issue was found (non-empty).
        issue_type: Machine-readable category, drawn from the closed
            ``IssueTypeLiteral`` vocabulary.
        severity: Severity tier assigned to the finding.
        confidence: How confident the detector is, from 0.0 to 1.0 inclusive.
        expected: The value the cell should hold, when known; None for
            detection-only findings.
        actual: The value actually present in the cell.
        reason: Human-readable explanation of the finding (non-empty).

    Example:
        >>> issue = Issue(
        ...     row=3, column="price", issue_type="decimal_shift",
        ...     severity=Severity.REVIEW, confidence=0.92,
        ...     expected="102.0", actual="1020.0",
        ...     reason="Value 1020.0 appears to be ~10x the typical value",
        ... )
    """

    # Frozen: attribute assignment on an instance is rejected after creation.
    model_config = {"frozen": True}

    # Location of the finding.
    row: int = Field(
        ge=0,
        description="Zero-indexed row number",
    )
    column: str = Field(
        min_length=1,
        description="Column name",
    )

    # Classification of the finding.
    issue_type: IssueTypeLiteral = Field(
        description="Machine-readable issue category",
    )
    severity: Severity = Field(
        description="Three-tier severity",
    )
    confidence: float = Field(
        ge=0.0,
        le=1.0,
        description="Detector confidence",
    )

    # Evidence and explanation.
    expected: str | None = Field(
        default=None,
        description="Expected value (if known)",
    )
    actual: str = Field(
        description="Actual value found in the cell",
    )
    reason: str = Field(
        min_length=1,
        description="Human-readable explanation",
    )
class Detector(Protocol):
    """Structural interface shared by every detector.

    Any object exposing a compatible ``detect`` method satisfies this
    protocol; no inheritance is required. Implementations are expected to act
    as pure functions over tabular data: given a DataFrame and, optionally, a
    Schema, they return a list of Issue objects, with no LLM calls, no disk
    I/O and no side effects.

    Example:
        >>> class MyDetector:
        ...     def detect(
        ...         self, df: pd.DataFrame, schema: Schema | None = None
        ...     ) -> list[Issue]:
        ...         return []
    """

    def detect(
        self,
        df: pd.DataFrame,
        schema: Schema | None = None,
    ) -> list[Issue]:
        """Scan a DataFrame and report the data-quality issues found in it.

        Args:
            df: The DataFrame to inspect.
            schema: Declared schema carrying column types and constraints, or
                None when no schema is supplied.

        Returns:
            A list of Issue objects describing the anomalies detected.
        """
        ...  # pragma: no cover