# Source: Hugging Face Space "Praneshrajan15/dataforge-playground" (status: Running; file size: 4,918 bytes).

"""Core models and protocol for the DataForge detector subsystem."""

from __future__ import annotations

import enum
from typing import Literal, Protocol

from pydantic import BaseModel, Field

from dataforge.table import TableLike
from dataforge.verifier.schema import (
    AggregateDependency,
    DomainBound,
    FunctionalDependency,
    Schema,
)

# Public API of this module. Besides the names defined here (Detector, Issue,
# IssueTypeLiteral, Severity) it re-exports the schema types imported from
# dataforge.verifier.schema. TableLike is imported for annotations only and is
# not re-exported.
__all__ = [
    "AggregateDependency",
    "Detector",
    "DomainBound",
    "FunctionalDependency",
    "Issue",
    "IssueTypeLiteral",
    "Schema",
    "Severity",
]


class Severity(enum.Enum):
    """Severity tier attached to a data-quality issue.

    Members compare by severity rank, not by their string values:
    SAFE < REVIEW < UNSAFE (higher = more severe).

    - SAFE: likely benign; can be auto-applied in bulk without human review.
    - REVIEW: ambiguous; should appear in the profile table for human triage.
    - UNSAFE: structural error; blocks automated repair without explicit approval.

    See DECISIONS.md entry "Issue severity tiers" for the rationale behind
    choosing exactly 3 levels.
    """

    SAFE = "safe"
    REVIEW = "review"
    UNSAFE = "unsafe"

    def _rank(self) -> int:
        """Return this member's position on the severity scale (0 = least severe)."""
        # The scale is spelled out explicitly so the comparison order is
        # visible in one place and shared by every rich-comparison method.
        return (Severity.SAFE, Severity.REVIEW, Severity.UNSAFE).index(self)

    def __lt__(self, other: object) -> bool:
        """Return True when this tier is strictly less severe than ``other``."""
        if isinstance(other, Severity):
            return self._rank() < other._rank()
        # Let Python try the reflected operation / raise TypeError.
        return NotImplemented

    def __le__(self, other: object) -> bool:
        """Return True when this tier is at most as severe as ``other``."""
        if isinstance(other, Severity):
            return self._rank() <= other._rank()
        return NotImplemented

    def __gt__(self, other: object) -> bool:
        """Return True when this tier is strictly more severe than ``other``."""
        if isinstance(other, Severity):
            return self._rank() > other._rank()
        return NotImplemented

    def __ge__(self, other: object) -> bool:
        """Return True when this tier is at least as severe as ``other``."""
        if isinstance(other, Severity):
            return self._rank() >= other._rank()
        return NotImplemented


# Closed vocabulary of issue types; used as the annotation of
# ``Issue.issue_type``. Extend this Literal as new detectors ship.
IssueTypeLiteral = Literal["type_mismatch", "decimal_shift", "fd_violation"]


class Issue(BaseModel):
    """A single data-quality finding at a specific (row, column) location.

    The model is configured as frozen (see ``model_config`` below).

    Args:
        row: Zero-indexed row number in the DataFrame; must be >= 0.
        column: Column name where the issue was detected; must be non-empty.
        issue_type: Machine-readable issue category (closed vocabulary,
            see ``IssueTypeLiteral``).
        severity: Three-tier severity classification.
        confidence: Detector's confidence in the finding (0.0 to 1.0, inclusive).
        expected: What the value should be (if known); None for detection-only.
        actual: The actual value found in the cell, as a string.
        reason: Human-readable explanation of the issue; must be non-empty.

    Example:
        >>> issue = Issue(
        ...     row=3, column="price", issue_type="decimal_shift",
        ...     severity=Severity.REVIEW, confidence=0.92,
        ...     expected="102.0", actual="1020.0",
        ...     reason="Value 1020.0 appears to be ~10x the typical value",
        ... )
    """

    # Location of the finding.
    row: int = Field(ge=0, description="Zero-indexed row number")
    column: str = Field(min_length=1, description="Column name")
    # Classification of the finding.
    issue_type: IssueTypeLiteral = Field(description="Machine-readable issue category")
    severity: Severity = Field(description="Three-tier severity")
    confidence: float = Field(ge=0.0, le=1.0, description="Detector confidence")
    # Cell values are carried as strings; ``expected`` is the only optional field.
    expected: str | None = Field(default=None, description="Expected value (if known)")
    actual: str = Field(description="Actual value found in the cell")
    reason: str = Field(min_length=1, description="Human-readable explanation")

    # Pydantic model configuration: "frozen" disallows attribute assignment
    # on instances after construction.
    model_config = {"frozen": True}


class Detector(Protocol):
    """Structural protocol that every detector must implement.

    Any class with a matching ``detect`` method satisfies this protocol; no
    inheritance is required. The protocol is not decorated with
    ``runtime_checkable``, so conformance is meant to be checked statically.

    A detector is a pure function over tabular data: it receives a table
    and an optional Schema, and returns a list of Issue objects. No LLM calls,
    no disk I/O, no side effects.

    Example:
        >>> class MyDetector:
        ...     def detect(
        ...         self, df: TableLike, schema: Schema | None = None
        ...     ) -> list[Issue]:
        ...         return []
    """

    def detect(self, df: TableLike, schema: Schema | None = None) -> list[Issue]:
        """Detect data-quality issues in the given table.

        Args:
            df: The input table to analyze.
            schema: Optional declared schema with column types and constraints;
                defaults to None when no schema is supplied.

        Returns:
            A list of Issue objects describing detected anomalies.
        """
        # Protocol stub: implementations supply the body.
        ...  # pragma: no cover