File size: 2,104 Bytes
66b1c50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
"""CLI subcommand: ``dataforge profile <path> [--schema <yaml>]``.

Reads a CSV file, runs all detectors, and renders detected issues as a
rich-formatted terminal table.  Exit code 0 if no UNSAFE issues, 1 if any
UNSAFE issue is found, and 2 if the CSV file cannot be read.
"""

from __future__ import annotations

from pathlib import Path
from typing import Annotated

import typer
from rich.console import Console

from dataforge.cli.common import load_schema, read_csv
from dataforge.detectors import run_all_detectors
from dataforge.detectors.base import Schema, Severity
from dataforge.ui.profile_view import render_profile_table

# Console bound to stderr; this module uses it to print error messages.
_console = Console(stderr=True)


# NOTE(review): Typer normally shows a command's docstring as its --help text
# (assuming this function is registered as a command elsewhere), so the
# docstring below is kept short and user-facing; parameter documentation lives
# in the ``help=`` strings of the signature.
def profile(
    path: Annotated[
        Path,
        typer.Argument(
            exists=True,
            readable=True,
            help="Path to the CSV file to profile.",
        ),
    ],
    schema: Annotated[
        Path | None,
        typer.Option(
            "--schema",
            exists=True,
            readable=True,
            help="Path to a YAML schema file with column types and FDs.",
        ),
    ] = None,
) -> None:
    """Profile a CSV file for data-quality issues.

    Reads the CSV, runs all detectors (type_mismatch, decimal_shift,
    fd_violation), and renders a rich-formatted table of detected issues.

    Exit code 0 if no UNSAFE issues are found; 1 if any UNSAFE issues exist;
    2 if the CSV or the schema file cannot be loaded.
    """
    # Function-scope import: rich is already a dependency of this module, and
    # importing here avoids touching the module-level import block.
    from rich.markup import escape

    # Load the CSV with dtype=str to avoid pandas type-coercion artifacts.
    try:
        df = read_csv(path)
    except Exception as exc:
        # escape() stops bracketed text inside the exception message from being
        # interpreted as rich markup (which can garble or break the output).
        _console.print(f"[bold red]Error reading CSV:[/bold red] {escape(str(exc))}")
        raise typer.Exit(code=2) from exc

    # Optionally load the schema, reporting failures the same way as CSV
    # failures instead of letting a raw traceback reach the user.
    parsed_schema: Schema | None = None
    if schema is not None:
        try:
            parsed_schema = load_schema(schema)
        except typer.Exit:
            # If load_schema already chose an exit code, keep it as-is.
            raise
        except Exception as exc:
            _console.print(
                f"[bold red]Error loading schema:[/bold red] {escape(str(exc))}"
            )
            raise typer.Exit(code=2) from exc

    # Run all detectors; parsed_schema stays None when --schema was not given.
    issues = run_all_detectors(df, parsed_schema)

    # Render the results on a separate default console, distinct from the
    # stderr console used for the error messages above.
    output_console = Console()
    render_profile_table(issues, output_console, file_path=str(path))

    # A non-zero exit code signals that at least one UNSAFE issue was detected.
    if any(issue.severity == Severity.UNSAFE for issue in issues):
        raise typer.Exit(code=1)