Spaces:

orderlymirror
/

lov2

Sleeping

File size: 2,880 Bytes
"""CLI dataset validation script.

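Runs the DatasetValidator against the dataset directory configured in settings
and writes a validation report to <settings.reports_dir>/data_validation_report.md
(e.g. artifacts/reports/data_validation_report.md). Relative directories in
settings are interpreted relative to the project root.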

Exit codes:
    0 — All validation checks passed.
    1 — One or more validation checks failed.
    2 — Setup or runtime failure (missing dataset directory, configuration
        error, or an exception raised while validating or writing the report).
"""

import sys
from pathlib import Path

# Make the project root (the AI_Services_V2 root, per the original note)
# importable so that the `app.*` imports below resolve when this file is
# executed directly as a script rather than as part of an installed package.
_SCRIPT_DIR = Path(__file__).resolve().parent  # directory containing this script
_PROJECT_ROOT = _SCRIPT_DIR.parent  # one level above the script directory
# Prepend (index 0) so the project root is searched before other sys.path entries.
sys.path.insert(0, str(_PROJECT_ROOT))

from app.core.config import settings
from app.core.exceptions import DatasetError
from app.data.loader import DatasetLoader
from app.data.validator import DatasetValidator


def _resolve_from_project_root(raw_path) -> Path:
    """Return *raw_path* as a ``Path``, anchoring relative values at the project root.

    Absolute paths are returned as-is; relative paths are joined onto
    ``_PROJECT_ROOT`` and resolved, so the result does not depend on the
    current working directory.
    """
    candidate = Path(raw_path)
    if candidate.is_absolute():
        return candidate
    return (_PROJECT_ROOT / candidate).resolve()


def _print_error(message: str) -> None:
    """Write an ``ERROR:``-prefixed diagnostic to stderr.

    Diagnostics go to stderr so that stdout carries only the validation
    summary and stays clean when it is piped or captured.
    """
    print(f"ERROR: {message}", file=sys.stderr)


def _print_summary(report, report_path: Path) -> int:
    """Print the validation summary to stdout and return the matching exit code.

    Returns:
        0 when ``report.passed`` is truthy, otherwise 1 (after listing every
        entry in ``report.issues``).
    """
    if report.passed:
        print(f"PASSED: All {report.checks_run} validation checks passed.")
        print(f"Report written to: {report_path}")
        return 0

    print(
        f"FAILED: {report.checks_run - report.checks_passed}/{report.checks_run} "
        f"checks failed with {len(report.issues)} issue(s)."
    )
    for issue in report.issues:
        # The column is optional; only mention it when the issue carries one.
        col_info = f" ({issue.column})" if issue.column else ""
        print(f"  - [{issue.severity.upper()}] {issue.table}{col_info}: {issue.message}")
    print(f"\nReport written to: {report_path}")
    return 1


def main() -> int:
    """Run dataset validation and return the process exit code.

    The dataset directory and the reports directory are read from
    ``settings``; relative values are interpreted relative to the project
    root. The report is written as ``data_validation_report.md`` inside the
    reports directory.

    Returns:
        0 if every validation check passed, 1 if at least one check failed,
        2 if the dataset directory is missing or any exception is raised
        during setup, validation, or report writing.
    """
    # Phase 1: locate the dataset and load its metadata.
    try:
        dataset_path = _resolve_from_project_root(settings.dataset_dir)

        # is_dir() rather than exists(): a regular file at this location is
        # not a usable dataset directory and should be reported as missing.
        if not dataset_path.is_dir():
            _print_error(f"Dataset directory not found: {dataset_path}")
            return 2

        loader = DatasetLoader(dataset_path)
        metadata = loader.load_metadata()

    except DatasetError as exc:
        _print_error(f"Setup failure — {exc}")
        return 2
    except Exception as exc:  # top-level CLI boundary: map any failure to exit code 2
        _print_error(f"Unexpected setup failure — {exc}")
        return 2

    # Phase 2: run the checks, persist the report, and summarize the outcome.
    try:
        validator = DatasetValidator(loader, metadata)
        report = validator.run_all()

        report_dir = _resolve_from_project_root(settings.reports_dir)
        report_path = report_dir / "data_validation_report.md"
        validator.write_report(report, report_path)

        return _print_summary(report, report_path)

    except DatasetError as exc:
        _print_error(f"Validation failure — {exc}")
        return 2
    except Exception as exc:  # top-level CLI boundary: map any failure to exit code 2
        _print_error(f"Unexpected failure during validation — {exc}")
        return 2


# Script entry point: propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())