lov2 / scripts /validate_dataset.py
work-sejal
Deploy LearningOutcomeOS AI Service and dataset to HF Space
3795605
Raw
History Blame Contribute Delete
2.88 kB
"""CLI dataset validation script.
Invokes the DatasetValidator with settings-based configuration and writes
a validation report to artifacts/reports/data_validation_report.md.
Exit codes:
0 — All validation checks passed.
1 — One or more validation checks failed.
2 — Setup failure (missing dataset directory, configuration error).
"""
import sys
from pathlib import Path
# Add AI_Services_V2 root to sys.path so imports work when run directly
_SCRIPT_DIR = Path(__file__).resolve().parent
_PROJECT_ROOT = _SCRIPT_DIR.parent
sys.path.insert(0, str(_PROJECT_ROOT))
from app.core.config import settings
from app.core.exceptions import DatasetError
from app.data.loader import DatasetLoader
from app.data.validator import DatasetValidator
def _resolve_from_project_root(raw_path) -> Path:
    """Return *raw_path* as a Path, anchoring relative paths at the project root.

    Absolute paths are returned unchanged; relative ones are joined onto
    ``_PROJECT_ROOT`` and resolved.
    """
    path = Path(raw_path)
    if path.is_absolute():
        return path
    return (_PROJECT_ROOT / path).resolve()


def main() -> int:
    """Run dataset validation and return the process exit code.

    Loads dataset metadata from ``settings.dataset_dir``, runs all checks via
    ``DatasetValidator.run_all()``, writes the report to
    ``<settings.reports_dir>/data_validation_report.md`` and prints a summary
    to stdout. Relative configured paths are resolved against the project
    root.

    Returns:
        0 when every check passed, 1 when at least one check failed, 2 on a
        setup failure (missing dataset directory, loader error) or on any
        exception raised while validating or writing the report.
    """
    # Phase 1: locate the dataset and load its metadata.
    try:
        dataset_path = _resolve_from_project_root(settings.dataset_dir)
        # is_dir() instead of exists(): a regular file at this path is not a
        # usable dataset directory and should be reported as such up front.
        if not dataset_path.is_dir():
            print(f"ERROR: Dataset directory not found: {dataset_path}")
            return 2
        loader = DatasetLoader(dataset_path)
        metadata = loader.load_metadata()
    except DatasetError as exc:
        print(f"ERROR: Setup failure — {exc}")
        return 2
    except Exception as exc:
        # Top-level CLI boundary: convert anything unexpected into exit code 2.
        print(f"ERROR: Unexpected setup failure — {exc}")
        return 2

    # Phase 2: run the checks, persist the report, and summarise on stdout.
    try:
        validator = DatasetValidator(loader, metadata)
        report = validator.run_all()

        reports_dir = _resolve_from_project_root(settings.reports_dir)
        report_path = reports_dir / "data_validation_report.md"
        validator.write_report(report, report_path)

        if report.passed:
            print(f"PASSED: All {report.checks_run} validation checks passed.")
            print(f"Report written to: {report_path}")
            return 0

        print(
            f"FAILED: {report.checks_run - report.checks_passed}/{report.checks_run} "
            f"checks failed with {len(report.issues)} issue(s)."
        )
        for issue in report.issues:
            col_info = f" ({issue.column})" if issue.column else ""
            print(f"  - [{issue.severity.upper()}] {issue.table}{col_info}: {issue.message}")
        print(f"\nReport written to: {report_path}")
        return 1
    except DatasetError as exc:
        print(f"ERROR: Validation failure — {exc}")
        return 2
    except Exception as exc:
        # Top-level CLI boundary: convert anything unexpected into exit code 2.
        print(f"ERROR: Unexpected failure during validation — {exc}")
        return 2
# Script entry point: propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())